diff --git "a/resnet50/checkpoint-237000/trainer_state.json" "b/resnet50/checkpoint-237000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/resnet50/checkpoint-237000/trainer_state.json" @@ -0,0 +1,170200 @@ +{ + "best_global_step": 165500, + "best_metric": 0.9933871685636168, + "best_model_checkpoint": "/workspace/output/resnet50/checkpoint-165500", + "epoch": 33.64088005677786, + "eval_steps": 500, + "global_step": 237000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014194464158978, + "grad_norm": 3.2342276573181152, + "learning_rate": 9.999872249822569e-05, + "loss": 5.98863525390625, + "step": 10 + }, + { + "epoch": 0.0028388928317956, + "grad_norm": 3.3994972705841064, + "learning_rate": 9.99973030518098e-05, + "loss": 5.97633056640625, + "step": 20 + }, + { + "epoch": 0.0042583392476933995, + "grad_norm": 3.3180341720581055, + "learning_rate": 9.99958836053939e-05, + "loss": 5.97711181640625, + "step": 30 + }, + { + "epoch": 0.0056777856635912, + "grad_norm": 2.9379143714904785, + "learning_rate": 9.999446415897801e-05, + "loss": 5.9991455078125, + "step": 40 + }, + { + "epoch": 0.007097232079488999, + "grad_norm": 2.2698018550872803, + "learning_rate": 9.99930447125621e-05, + "loss": 5.96363525390625, + "step": 50 + }, + { + "epoch": 0.008516678495386799, + "grad_norm": 2.0626659393310547, + "learning_rate": 9.99916252661462e-05, + "loss": 5.96995849609375, + "step": 60 + }, + { + "epoch": 0.0099361249112846, + "grad_norm": 2.814460277557373, + "learning_rate": 9.999020581973031e-05, + "loss": 5.9493408203125, + "step": 70 + }, + { + "epoch": 0.0113555713271824, + "grad_norm": 2.871051788330078, + "learning_rate": 9.998878637331441e-05, + "loss": 5.9510498046875, + "step": 80 + }, + { + "epoch": 0.0127750177430802, + "grad_norm": 2.3897151947021484, + "learning_rate": 9.998736692689852e-05, + "loss": 5.94254150390625, + "step": 90 + }, + { + "epoch": 
0.014194464158977998, + "grad_norm": 2.9910531044006348, + "learning_rate": 9.99859474804826e-05, + "loss": 5.9062255859375, + "step": 100 + }, + { + "epoch": 0.015613910574875798, + "grad_norm": 3.137518882751465, + "learning_rate": 9.998452803406672e-05, + "loss": 5.9070068359375, + "step": 110 + }, + { + "epoch": 0.017033356990773598, + "grad_norm": 3.021024703979492, + "learning_rate": 9.998310858765082e-05, + "loss": 5.87197265625, + "step": 120 + }, + { + "epoch": 0.018452803406671398, + "grad_norm": 3.499450445175171, + "learning_rate": 9.998168914123493e-05, + "loss": 5.8237548828125, + "step": 130 + }, + { + "epoch": 0.0198722498225692, + "grad_norm": 3.87576961517334, + "learning_rate": 9.998026969481902e-05, + "loss": 5.754150390625, + "step": 140 + }, + { + "epoch": 0.021291696238467, + "grad_norm": 3.9846458435058594, + "learning_rate": 9.997885024840313e-05, + "loss": 5.697198486328125, + "step": 150 + }, + { + "epoch": 0.0227111426543648, + "grad_norm": 4.339130878448486, + "learning_rate": 9.997743080198723e-05, + "loss": 5.63760986328125, + "step": 160 + }, + { + "epoch": 0.0241305890702626, + "grad_norm": 4.891483783721924, + "learning_rate": 9.997601135557133e-05, + "loss": 5.5271728515625, + "step": 170 + }, + { + "epoch": 0.0255500354861604, + "grad_norm": 5.147222995758057, + "learning_rate": 9.997459190915544e-05, + "loss": 5.45938720703125, + "step": 180 + }, + { + "epoch": 0.0269694819020582, + "grad_norm": 5.365755558013916, + "learning_rate": 9.997317246273954e-05, + "loss": 5.355255126953125, + "step": 190 + }, + { + "epoch": 0.028388928317955996, + "grad_norm": 5.888001918792725, + "learning_rate": 9.997175301632365e-05, + "loss": 5.1554931640625, + "step": 200 + }, + { + "epoch": 0.029808374733853796, + "grad_norm": 6.100172996520996, + "learning_rate": 9.997033356990773e-05, + "loss": 5.035284423828125, + "step": 210 + }, + { + "epoch": 0.031227821149751596, + "grad_norm": 6.491486549377441, + "learning_rate": 9.996891412349184e-05, + 
"loss": 4.899530029296875, + "step": 220 + }, + { + "epoch": 0.032647267565649396, + "grad_norm": 6.916806697845459, + "learning_rate": 9.996749467707594e-05, + "loss": 4.851350402832031, + "step": 230 + }, + { + "epoch": 0.034066713981547196, + "grad_norm": 6.837950706481934, + "learning_rate": 9.996607523066005e-05, + "loss": 4.726431274414063, + "step": 240 + }, + { + "epoch": 0.035486160397444996, + "grad_norm": 7.554074287414551, + "learning_rate": 9.996465578424415e-05, + "loss": 4.4839630126953125, + "step": 250 + }, + { + "epoch": 0.036905606813342796, + "grad_norm": 7.574995994567871, + "learning_rate": 9.996323633782825e-05, + "loss": 4.506732177734375, + "step": 260 + }, + { + "epoch": 0.0383250532292406, + "grad_norm": 7.498238563537598, + "learning_rate": 9.996181689141236e-05, + "loss": 4.319998168945313, + "step": 270 + }, + { + "epoch": 0.0397444996451384, + "grad_norm": 7.978142261505127, + "learning_rate": 9.996039744499645e-05, + "loss": 4.214613342285157, + "step": 280 + }, + { + "epoch": 0.0411639460610362, + "grad_norm": 8.194511413574219, + "learning_rate": 9.995897799858057e-05, + "loss": 4.212762451171875, + "step": 290 + }, + { + "epoch": 0.042583392476934, + "grad_norm": 8.136639595031738, + "learning_rate": 9.995755855216466e-05, + "loss": 4.009028625488281, + "step": 300 + }, + { + "epoch": 0.0440028388928318, + "grad_norm": 8.684012413024902, + "learning_rate": 9.995613910574876e-05, + "loss": 3.9817459106445314, + "step": 310 + }, + { + "epoch": 0.0454222853087296, + "grad_norm": 8.888952255249023, + "learning_rate": 9.995471965933286e-05, + "loss": 3.94019775390625, + "step": 320 + }, + { + "epoch": 0.0468417317246274, + "grad_norm": 8.79919719696045, + "learning_rate": 9.995330021291697e-05, + "loss": 3.9265777587890627, + "step": 330 + }, + { + "epoch": 0.0482611781405252, + "grad_norm": 8.571785926818848, + "learning_rate": 9.995188076650107e-05, + "loss": 3.7262115478515625, + "step": 340 + }, + { + "epoch": 0.049680624556423, + 
"grad_norm": 8.640142440795898, + "learning_rate": 9.995046132008518e-05, + "loss": 3.644915771484375, + "step": 350 + }, + { + "epoch": 0.0511000709723208, + "grad_norm": 9.322779655456543, + "learning_rate": 9.994904187366927e-05, + "loss": 3.644049072265625, + "step": 360 + }, + { + "epoch": 0.0525195173882186, + "grad_norm": 8.790424346923828, + "learning_rate": 9.994762242725337e-05, + "loss": 3.4869285583496095, + "step": 370 + }, + { + "epoch": 0.0539389638041164, + "grad_norm": 9.344154357910156, + "learning_rate": 9.994620298083748e-05, + "loss": 3.55142822265625, + "step": 380 + }, + { + "epoch": 0.05535841022001419, + "grad_norm": 8.807840347290039, + "learning_rate": 9.994478353442158e-05, + "loss": 3.4293190002441407, + "step": 390 + }, + { + "epoch": 0.05677785663591199, + "grad_norm": 9.36971378326416, + "learning_rate": 9.994336408800569e-05, + "loss": 3.429082489013672, + "step": 400 + }, + { + "epoch": 0.05819730305180979, + "grad_norm": 9.73521900177002, + "learning_rate": 9.994194464158977e-05, + "loss": 3.408639907836914, + "step": 410 + }, + { + "epoch": 0.05961674946770759, + "grad_norm": 9.646844863891602, + "learning_rate": 9.994052519517389e-05, + "loss": 3.1950119018554686, + "step": 420 + }, + { + "epoch": 0.06103619588360539, + "grad_norm": 9.722207069396973, + "learning_rate": 9.993910574875798e-05, + "loss": 3.4140243530273438, + "step": 430 + }, + { + "epoch": 0.06245564229950319, + "grad_norm": 10.609601020812988, + "learning_rate": 9.99376863023421e-05, + "loss": 3.320109558105469, + "step": 440 + }, + { + "epoch": 0.063875088715401, + "grad_norm": 10.271575927734375, + "learning_rate": 9.993626685592619e-05, + "loss": 3.232251739501953, + "step": 450 + }, + { + "epoch": 0.06529453513129879, + "grad_norm": 9.766585350036621, + "learning_rate": 9.993484740951029e-05, + "loss": 3.149517059326172, + "step": 460 + }, + { + "epoch": 0.0667139815471966, + "grad_norm": 10.358244895935059, + "learning_rate": 9.99334279630944e-05, + "loss": 
3.1863967895507814, + "step": 470 + }, + { + "epoch": 0.06813342796309439, + "grad_norm": 10.473136901855469, + "learning_rate": 9.99320085166785e-05, + "loss": 3.222390365600586, + "step": 480 + }, + { + "epoch": 0.0695528743789922, + "grad_norm": 9.905110359191895, + "learning_rate": 9.993058907026261e-05, + "loss": 3.1823768615722656, + "step": 490 + }, + { + "epoch": 0.07097232079488999, + "grad_norm": 9.858973503112793, + "learning_rate": 9.99291696238467e-05, + "loss": 2.9202560424804687, + "step": 500 + }, + { + "epoch": 0.07097232079488999, + "eval_accuracy": 0.1867489031601704, + "eval_loss": 3.0744524002075195, + "eval_runtime": 31.2289, + "eval_samples_per_second": 503.605, + "eval_steps_per_second": 15.755, + "step": 500 + }, + { + "epoch": 0.0723917672107878, + "grad_norm": 10.224215507507324, + "learning_rate": 9.992775017743082e-05, + "loss": 3.0410499572753906, + "step": 510 + }, + { + "epoch": 0.07381121362668559, + "grad_norm": 9.867650032043457, + "learning_rate": 9.99263307310149e-05, + "loss": 3.116912078857422, + "step": 520 + }, + { + "epoch": 0.07523066004258339, + "grad_norm": 10.343064308166504, + "learning_rate": 9.992491128459901e-05, + "loss": 3.06390266418457, + "step": 530 + }, + { + "epoch": 0.0766501064584812, + "grad_norm": 10.38116455078125, + "learning_rate": 9.992349183818311e-05, + "loss": 2.973680114746094, + "step": 540 + }, + { + "epoch": 0.07806955287437899, + "grad_norm": 10.979643821716309, + "learning_rate": 9.992207239176722e-05, + "loss": 3.0906436920166014, + "step": 550 + }, + { + "epoch": 0.0794889992902768, + "grad_norm": 10.06657886505127, + "learning_rate": 9.992065294535132e-05, + "loss": 3.0091484069824217, + "step": 560 + }, + { + "epoch": 0.08090844570617459, + "grad_norm": 10.663322448730469, + "learning_rate": 9.991923349893541e-05, + "loss": 2.862255859375, + "step": 570 + }, + { + "epoch": 0.0823278921220724, + "grad_norm": 9.277785301208496, + "learning_rate": 9.991781405251952e-05, + "loss": 
2.8638259887695314, + "step": 580 + }, + { + "epoch": 0.08374733853797019, + "grad_norm": 10.807332038879395, + "learning_rate": 9.991639460610362e-05, + "loss": 2.732352066040039, + "step": 590 + }, + { + "epoch": 0.085166784953868, + "grad_norm": 9.970373153686523, + "learning_rate": 9.991497515968773e-05, + "loss": 2.736968231201172, + "step": 600 + }, + { + "epoch": 0.08658623136976579, + "grad_norm": 11.008269309997559, + "learning_rate": 9.991355571327183e-05, + "loss": 2.7735246658325194, + "step": 610 + }, + { + "epoch": 0.0880056777856636, + "grad_norm": 8.758193969726562, + "learning_rate": 9.991213626685593e-05, + "loss": 2.5436214447021483, + "step": 620 + }, + { + "epoch": 0.08942512420156139, + "grad_norm": 11.253259658813477, + "learning_rate": 9.991071682044003e-05, + "loss": 2.748835563659668, + "step": 630 + }, + { + "epoch": 0.0908445706174592, + "grad_norm": 10.979547500610352, + "learning_rate": 9.990929737402414e-05, + "loss": 2.7314834594726562, + "step": 640 + }, + { + "epoch": 0.09226401703335699, + "grad_norm": 11.182887077331543, + "learning_rate": 9.990787792760823e-05, + "loss": 2.645678901672363, + "step": 650 + }, + { + "epoch": 0.0936834634492548, + "grad_norm": 10.636208534240723, + "learning_rate": 9.990645848119234e-05, + "loss": 2.5704013824462892, + "step": 660 + }, + { + "epoch": 0.09510290986515259, + "grad_norm": 10.351170539855957, + "learning_rate": 9.990503903477644e-05, + "loss": 2.5628406524658205, + "step": 670 + }, + { + "epoch": 0.0965223562810504, + "grad_norm": 9.914809226989746, + "learning_rate": 9.990361958836054e-05, + "loss": 2.5872230529785156, + "step": 680 + }, + { + "epoch": 0.09794180269694819, + "grad_norm": 10.839837074279785, + "learning_rate": 9.990220014194465e-05, + "loss": 2.490940475463867, + "step": 690 + }, + { + "epoch": 0.099361249112846, + "grad_norm": 11.259613990783691, + "learning_rate": 9.990078069552875e-05, + "loss": 2.64483585357666, + "step": 700 + }, + { + "epoch": 
0.10078069552874379, + "grad_norm": 11.213078498840332, + "learning_rate": 9.989936124911286e-05, + "loss": 2.5397150039672853, + "step": 710 + }, + { + "epoch": 0.1022001419446416, + "grad_norm": 10.366206169128418, + "learning_rate": 9.989794180269694e-05, + "loss": 2.457781219482422, + "step": 720 + }, + { + "epoch": 0.10361958836053939, + "grad_norm": 11.44458293914795, + "learning_rate": 9.989652235628105e-05, + "loss": 2.5090484619140625, + "step": 730 + }, + { + "epoch": 0.1050390347764372, + "grad_norm": 11.689805030822754, + "learning_rate": 9.989510290986515e-05, + "loss": 2.409171485900879, + "step": 740 + }, + { + "epoch": 0.10645848119233499, + "grad_norm": 10.568279266357422, + "learning_rate": 9.989368346344926e-05, + "loss": 2.3308380126953123, + "step": 750 + }, + { + "epoch": 0.1078779276082328, + "grad_norm": 11.917696952819824, + "learning_rate": 9.989226401703337e-05, + "loss": 2.3733493804931642, + "step": 760 + }, + { + "epoch": 0.10929737402413059, + "grad_norm": 9.960722923278809, + "learning_rate": 9.989098651525906e-05, + "loss": 2.4058095932006838, + "step": 770 + }, + { + "epoch": 0.11071682044002838, + "grad_norm": 11.068999290466309, + "learning_rate": 9.988956706884315e-05, + "loss": 2.4371658325195313, + "step": 780 + }, + { + "epoch": 0.11213626685592619, + "grad_norm": 10.340009689331055, + "learning_rate": 9.988814762242725e-05, + "loss": 2.2587520599365236, + "step": 790 + }, + { + "epoch": 0.11355571327182398, + "grad_norm": 9.941303253173828, + "learning_rate": 9.988672817601136e-05, + "loss": 2.268446350097656, + "step": 800 + }, + { + "epoch": 0.11497515968772179, + "grad_norm": 11.490272521972656, + "learning_rate": 9.988530872959546e-05, + "loss": 2.471067428588867, + "step": 810 + }, + { + "epoch": 0.11639460610361958, + "grad_norm": 10.67241382598877, + "learning_rate": 9.988388928317957e-05, + "loss": 2.3497791290283203, + "step": 820 + }, + { + "epoch": 0.11781405251951739, + "grad_norm": 10.710894584655762, + 
"learning_rate": 9.988246983676367e-05, + "loss": 2.1724626541137697, + "step": 830 + }, + { + "epoch": 0.11923349893541518, + "grad_norm": 10.985452651977539, + "learning_rate": 9.988105039034778e-05, + "loss": 2.1848114013671873, + "step": 840 + }, + { + "epoch": 0.12065294535131299, + "grad_norm": 10.063145637512207, + "learning_rate": 9.987963094393186e-05, + "loss": 2.180558776855469, + "step": 850 + }, + { + "epoch": 0.12207239176721078, + "grad_norm": 11.236614227294922, + "learning_rate": 9.987821149751597e-05, + "loss": 2.282668876647949, + "step": 860 + }, + { + "epoch": 0.12349183818310859, + "grad_norm": 10.98898983001709, + "learning_rate": 9.987679205110007e-05, + "loss": 2.235186767578125, + "step": 870 + }, + { + "epoch": 0.12491128459900638, + "grad_norm": 11.805492401123047, + "learning_rate": 9.987537260468418e-05, + "loss": 2.2264921188354494, + "step": 880 + }, + { + "epoch": 0.1263307310149042, + "grad_norm": 10.717041015625, + "learning_rate": 9.987395315826828e-05, + "loss": 2.1385255813598634, + "step": 890 + }, + { + "epoch": 0.127750177430802, + "grad_norm": 9.613192558288574, + "learning_rate": 9.987253371185238e-05, + "loss": 2.1964336395263673, + "step": 900 + }, + { + "epoch": 0.12916962384669978, + "grad_norm": 10.594833374023438, + "learning_rate": 9.987111426543649e-05, + "loss": 2.050688362121582, + "step": 910 + }, + { + "epoch": 0.13058907026259758, + "grad_norm": 11.596671104431152, + "learning_rate": 9.986969481902059e-05, + "loss": 2.077385139465332, + "step": 920 + }, + { + "epoch": 0.1320085166784954, + "grad_norm": 10.779032707214355, + "learning_rate": 9.98682753726047e-05, + "loss": 2.0280479431152343, + "step": 930 + }, + { + "epoch": 0.1334279630943932, + "grad_norm": 10.522924423217773, + "learning_rate": 9.98668559261888e-05, + "loss": 1.9384689331054688, + "step": 940 + }, + { + "epoch": 0.13484740951029098, + "grad_norm": 9.86844539642334, + "learning_rate": 9.986543647977289e-05, + "loss": 2.0612548828125, + 
"step": 950 + }, + { + "epoch": 0.13626685592618878, + "grad_norm": 12.521405220031738, + "learning_rate": 9.986401703335699e-05, + "loss": 2.139466094970703, + "step": 960 + }, + { + "epoch": 0.1376863023420866, + "grad_norm": 11.292656898498535, + "learning_rate": 9.98625975869411e-05, + "loss": 2.077956199645996, + "step": 970 + }, + { + "epoch": 0.1391057487579844, + "grad_norm": 11.186986923217773, + "learning_rate": 9.98611781405252e-05, + "loss": 2.028730010986328, + "step": 980 + }, + { + "epoch": 0.14052519517388218, + "grad_norm": 10.553022384643555, + "learning_rate": 9.985975869410931e-05, + "loss": 1.9375551223754883, + "step": 990 + }, + { + "epoch": 0.14194464158977999, + "grad_norm": 11.089204788208008, + "learning_rate": 9.98583392476934e-05, + "loss": 2.0689823150634767, + "step": 1000 + }, + { + "epoch": 0.14194464158977999, + "eval_accuracy": 0.42239460799898265, + "eval_loss": 1.9010688066482544, + "eval_runtime": 31.4593, + "eval_samples_per_second": 499.916, + "eval_steps_per_second": 15.639, + "step": 1000 + }, + { + "epoch": 0.1433640880056778, + "grad_norm": 10.988676071166992, + "learning_rate": 9.98569198012775e-05, + "loss": 1.9830604553222657, + "step": 1010 + }, + { + "epoch": 0.1447835344215756, + "grad_norm": 11.2459077835083, + "learning_rate": 9.985550035486161e-05, + "loss": 1.9190074920654296, + "step": 1020 + }, + { + "epoch": 0.14620298083747338, + "grad_norm": 10.437894821166992, + "learning_rate": 9.985408090844571e-05, + "loss": 1.8999460220336915, + "step": 1030 + }, + { + "epoch": 0.14762242725337119, + "grad_norm": 10.94793701171875, + "learning_rate": 9.985266146202982e-05, + "loss": 1.8579456329345703, + "step": 1040 + }, + { + "epoch": 0.149041873669269, + "grad_norm": 11.168233871459961, + "learning_rate": 9.98512420156139e-05, + "loss": 1.8979732513427734, + "step": 1050 + }, + { + "epoch": 0.15046132008516677, + "grad_norm": 10.14195728302002, + "learning_rate": 9.984982256919802e-05, + "loss": 1.7833553314208985, 
+ "step": 1060 + }, + { + "epoch": 0.15188076650106458, + "grad_norm": 9.160737991333008, + "learning_rate": 9.984840312278211e-05, + "loss": 1.8624576568603515, + "step": 1070 + }, + { + "epoch": 0.1533002129169624, + "grad_norm": 11.151049613952637, + "learning_rate": 9.984698367636623e-05, + "loss": 1.8210905075073243, + "step": 1080 + }, + { + "epoch": 0.1547196593328602, + "grad_norm": 10.053725242614746, + "learning_rate": 9.984556422995032e-05, + "loss": 1.7738643646240235, + "step": 1090 + }, + { + "epoch": 0.15613910574875797, + "grad_norm": 10.97727108001709, + "learning_rate": 9.984414478353442e-05, + "loss": 1.866429328918457, + "step": 1100 + }, + { + "epoch": 0.15755855216465578, + "grad_norm": 12.384384155273438, + "learning_rate": 9.984272533711853e-05, + "loss": 1.8680984497070312, + "step": 1110 + }, + { + "epoch": 0.1589779985805536, + "grad_norm": 11.387879371643066, + "learning_rate": 9.984130589070263e-05, + "loss": 1.8034194946289062, + "step": 1120 + }, + { + "epoch": 0.1603974449964514, + "grad_norm": 10.6587495803833, + "learning_rate": 9.983988644428674e-05, + "loss": 1.772690773010254, + "step": 1130 + }, + { + "epoch": 0.16181689141234917, + "grad_norm": 12.721858024597168, + "learning_rate": 9.983846699787084e-05, + "loss": 1.7724496841430664, + "step": 1140 + }, + { + "epoch": 0.16323633782824698, + "grad_norm": 11.116838455200195, + "learning_rate": 9.983704755145493e-05, + "loss": 1.7527042388916017, + "step": 1150 + }, + { + "epoch": 0.1646557842441448, + "grad_norm": 10.033406257629395, + "learning_rate": 9.983562810503903e-05, + "loss": 1.674898338317871, + "step": 1160 + }, + { + "epoch": 0.1660752306600426, + "grad_norm": 11.121773719787598, + "learning_rate": 9.983420865862314e-05, + "loss": 1.741505241394043, + "step": 1170 + }, + { + "epoch": 0.16749467707594037, + "grad_norm": 11.052094459533691, + "learning_rate": 9.983278921220724e-05, + "loss": 1.7749841690063477, + "step": 1180 + }, + { + "epoch": 0.16891412349183818, + 
"grad_norm": 10.183452606201172, + "learning_rate": 9.983136976579135e-05, + "loss": 1.6881484985351562, + "step": 1190 + }, + { + "epoch": 0.170333569907736, + "grad_norm": 11.106999397277832, + "learning_rate": 9.982995031937545e-05, + "loss": 1.814961051940918, + "step": 1200 + }, + { + "epoch": 0.1717530163236338, + "grad_norm": 12.08647632598877, + "learning_rate": 9.982853087295955e-05, + "loss": 1.682515525817871, + "step": 1210 + }, + { + "epoch": 0.17317246273953157, + "grad_norm": 13.744584083557129, + "learning_rate": 9.982711142654366e-05, + "loss": 1.6713733673095703, + "step": 1220 + }, + { + "epoch": 0.17459190915542938, + "grad_norm": 9.970173835754395, + "learning_rate": 9.982569198012775e-05, + "loss": 1.711156463623047, + "step": 1230 + }, + { + "epoch": 0.1760113555713272, + "grad_norm": 11.027495384216309, + "learning_rate": 9.982427253371186e-05, + "loss": 1.759619140625, + "step": 1240 + }, + { + "epoch": 0.177430801987225, + "grad_norm": 10.876315116882324, + "learning_rate": 9.982285308729596e-05, + "loss": 1.618482780456543, + "step": 1250 + }, + { + "epoch": 0.17885024840312277, + "grad_norm": 10.26490592956543, + "learning_rate": 9.982143364088006e-05, + "loss": 1.6674427032470702, + "step": 1260 + }, + { + "epoch": 0.18026969481902058, + "grad_norm": 11.872292518615723, + "learning_rate": 9.982001419446416e-05, + "loss": 1.6325908660888673, + "step": 1270 + }, + { + "epoch": 0.1816891412349184, + "grad_norm": 9.946234703063965, + "learning_rate": 9.981859474804827e-05, + "loss": 1.5453743934631348, + "step": 1280 + }, + { + "epoch": 0.18310858765081617, + "grad_norm": 11.03128719329834, + "learning_rate": 9.981717530163236e-05, + "loss": 1.658684539794922, + "step": 1290 + }, + { + "epoch": 0.18452803406671398, + "grad_norm": 12.145915031433105, + "learning_rate": 9.981575585521648e-05, + "loss": 1.5792274475097656, + "step": 1300 + }, + { + "epoch": 0.18594748048261178, + "grad_norm": 11.820379257202148, + "learning_rate": 
9.981433640880057e-05, + "loss": 1.5301803588867187, + "step": 1310 + }, + { + "epoch": 0.1873669268985096, + "grad_norm": 11.046746253967285, + "learning_rate": 9.981291696238467e-05, + "loss": 1.6124080657958983, + "step": 1320 + }, + { + "epoch": 0.18878637331440737, + "grad_norm": 9.545868873596191, + "learning_rate": 9.981149751596878e-05, + "loss": 1.5502593994140625, + "step": 1330 + }, + { + "epoch": 0.19020581973030518, + "grad_norm": 11.999979019165039, + "learning_rate": 9.981007806955288e-05, + "loss": 1.5360203742980958, + "step": 1340 + }, + { + "epoch": 0.19162526614620298, + "grad_norm": 9.949675559997559, + "learning_rate": 9.980865862313699e-05, + "loss": 1.353858470916748, + "step": 1350 + }, + { + "epoch": 0.1930447125621008, + "grad_norm": 11.573400497436523, + "learning_rate": 9.980723917672107e-05, + "loss": 1.3946660995483398, + "step": 1360 + }, + { + "epoch": 0.19446415897799857, + "grad_norm": 10.249485969543457, + "learning_rate": 9.980581973030518e-05, + "loss": 1.518262004852295, + "step": 1370 + }, + { + "epoch": 0.19588360539389638, + "grad_norm": 10.011629104614258, + "learning_rate": 9.980440028388928e-05, + "loss": 1.5000194549560546, + "step": 1380 + }, + { + "epoch": 0.19730305180979418, + "grad_norm": 12.186440467834473, + "learning_rate": 9.980298083747339e-05, + "loss": 1.554741382598877, + "step": 1390 + }, + { + "epoch": 0.198722498225692, + "grad_norm": 11.845844268798828, + "learning_rate": 9.980156139105749e-05, + "loss": 1.4599843978881837, + "step": 1400 + }, + { + "epoch": 0.20014194464158977, + "grad_norm": 10.98592472076416, + "learning_rate": 9.980014194464159e-05, + "loss": 1.4062080383300781, + "step": 1410 + }, + { + "epoch": 0.20156139105748758, + "grad_norm": 11.54171371459961, + "learning_rate": 9.97987224982257e-05, + "loss": 1.5128003120422364, + "step": 1420 + }, + { + "epoch": 0.20298083747338538, + "grad_norm": 10.248682022094727, + "learning_rate": 9.97973030518098e-05, + "loss": 1.5022719383239747, + 
"step": 1430 + }, + { + "epoch": 0.2044002838892832, + "grad_norm": 8.78536319732666, + "learning_rate": 9.97958836053939e-05, + "loss": 1.4118841171264649, + "step": 1440 + }, + { + "epoch": 0.20581973030518097, + "grad_norm": 9.993626594543457, + "learning_rate": 9.9794464158978e-05, + "loss": 1.3945957183837892, + "step": 1450 + }, + { + "epoch": 0.20723917672107878, + "grad_norm": 11.31412124633789, + "learning_rate": 9.97930447125621e-05, + "loss": 1.26229887008667, + "step": 1460 + }, + { + "epoch": 0.20865862313697658, + "grad_norm": 11.182840347290039, + "learning_rate": 9.97916252661462e-05, + "loss": 1.3171740531921388, + "step": 1470 + }, + { + "epoch": 0.2100780695528744, + "grad_norm": 12.25224781036377, + "learning_rate": 9.979020581973031e-05, + "loss": 1.3310781478881837, + "step": 1480 + }, + { + "epoch": 0.21149751596877217, + "grad_norm": 11.81201457977295, + "learning_rate": 9.978878637331441e-05, + "loss": 1.3043070793151856, + "step": 1490 + }, + { + "epoch": 0.21291696238466998, + "grad_norm": 10.484480857849121, + "learning_rate": 9.978736692689852e-05, + "loss": 1.2629288673400878, + "step": 1500 + }, + { + "epoch": 0.21291696238466998, + "eval_accuracy": 0.5395180263241559, + "eval_loss": 1.438815712928772, + "eval_runtime": 32.1456, + "eval_samples_per_second": 489.242, + "eval_steps_per_second": 15.305, + "step": 1500 + }, + { + "epoch": 0.21433640880056778, + "grad_norm": 10.796157836914062, + "learning_rate": 9.978594748048262e-05, + "loss": 1.3752121925354004, + "step": 1510 + }, + { + "epoch": 0.2157558552164656, + "grad_norm": 10.1256742477417, + "learning_rate": 9.978452803406671e-05, + "loss": 1.3005435943603516, + "step": 1520 + }, + { + "epoch": 0.21717530163236337, + "grad_norm": 11.182530403137207, + "learning_rate": 9.978310858765082e-05, + "loss": 1.3048934936523438, + "step": 1530 + }, + { + "epoch": 0.21859474804826118, + "grad_norm": 10.190278053283691, + "learning_rate": 9.978168914123492e-05, + "loss": 
1.3993605613708495, + "step": 1540 + }, + { + "epoch": 0.22001419446415899, + "grad_norm": 10.497735977172852, + "learning_rate": 9.978026969481903e-05, + "loss": 1.303945541381836, + "step": 1550 + }, + { + "epoch": 0.22143364088005676, + "grad_norm": 10.535606384277344, + "learning_rate": 9.977885024840313e-05, + "loss": 1.2210904121398927, + "step": 1560 + }, + { + "epoch": 0.22285308729595457, + "grad_norm": 11.385029792785645, + "learning_rate": 9.977743080198723e-05, + "loss": 1.3508376121520995, + "step": 1570 + }, + { + "epoch": 0.22427253371185238, + "grad_norm": 9.528643608093262, + "learning_rate": 9.977601135557132e-05, + "loss": 1.2278815269470216, + "step": 1580 + }, + { + "epoch": 0.22569198012775019, + "grad_norm": 13.161009788513184, + "learning_rate": 9.977459190915544e-05, + "loss": 1.254448413848877, + "step": 1590 + }, + { + "epoch": 0.22711142654364797, + "grad_norm": 11.288809776306152, + "learning_rate": 9.977317246273953e-05, + "loss": 1.271047878265381, + "step": 1600 + }, + { + "epoch": 0.22853087295954577, + "grad_norm": 11.30105209350586, + "learning_rate": 9.977175301632364e-05, + "loss": 1.3242988586425781, + "step": 1610 + }, + { + "epoch": 0.22995031937544358, + "grad_norm": 10.600774765014648, + "learning_rate": 9.977033356990774e-05, + "loss": 1.3170942306518554, + "step": 1620 + }, + { + "epoch": 0.2313697657913414, + "grad_norm": 10.652543067932129, + "learning_rate": 9.976891412349184e-05, + "loss": 1.3998719215393067, + "step": 1630 + }, + { + "epoch": 0.23278921220723917, + "grad_norm": 11.354793548583984, + "learning_rate": 9.976749467707595e-05, + "loss": 1.270443820953369, + "step": 1640 + }, + { + "epoch": 0.23420865862313697, + "grad_norm": 9.926568031311035, + "learning_rate": 9.976607523066005e-05, + "loss": 1.117215347290039, + "step": 1650 + }, + { + "epoch": 0.23562810503903478, + "grad_norm": 11.167335510253906, + "learning_rate": 9.976465578424416e-05, + "loss": 1.348717212677002, + "step": 1660 + }, + { + 
"epoch": 0.2370475514549326, + "grad_norm": 11.364425659179688, + "learning_rate": 9.976323633782824e-05, + "loss": 1.2113998413085938, + "step": 1670 + }, + { + "epoch": 0.23846699787083037, + "grad_norm": 10.315034866333008, + "learning_rate": 9.976181689141235e-05, + "loss": 1.2621678352355956, + "step": 1680 + }, + { + "epoch": 0.23988644428672817, + "grad_norm": 11.332146644592285, + "learning_rate": 9.976039744499645e-05, + "loss": 1.2919418334960937, + "step": 1690 + }, + { + "epoch": 0.24130589070262598, + "grad_norm": 9.863037109375, + "learning_rate": 9.975897799858056e-05, + "loss": 1.262222957611084, + "step": 1700 + }, + { + "epoch": 0.2427253371185238, + "grad_norm": 13.898163795471191, + "learning_rate": 9.975755855216467e-05, + "loss": 1.349098300933838, + "step": 1710 + }, + { + "epoch": 0.24414478353442157, + "grad_norm": 9.008386611938477, + "learning_rate": 9.975613910574876e-05, + "loss": 1.1653017044067382, + "step": 1720 + }, + { + "epoch": 0.24556422995031937, + "grad_norm": 9.755669593811035, + "learning_rate": 9.975471965933287e-05, + "loss": 1.304057788848877, + "step": 1730 + }, + { + "epoch": 0.24698367636621718, + "grad_norm": 10.742278099060059, + "learning_rate": 9.975330021291696e-05, + "loss": 1.1656038284301757, + "step": 1740 + }, + { + "epoch": 0.248403122782115, + "grad_norm": 11.937880516052246, + "learning_rate": 9.975188076650107e-05, + "loss": 1.2565963745117188, + "step": 1750 + }, + { + "epoch": 0.24982256919801277, + "grad_norm": 9.80545711517334, + "learning_rate": 9.975046132008517e-05, + "loss": 1.1316876411437988, + "step": 1760 + }, + { + "epoch": 0.2512420156139106, + "grad_norm": 11.162557601928711, + "learning_rate": 9.974904187366927e-05, + "loss": 1.2094581604003907, + "step": 1770 + }, + { + "epoch": 0.2526614620298084, + "grad_norm": 12.278450965881348, + "learning_rate": 9.974762242725337e-05, + "loss": 1.2499947547912598, + "step": 1780 + }, + { + "epoch": 0.2540809084457062, + "grad_norm": 
10.95953369140625, + "learning_rate": 9.974620298083748e-05, + "loss": 1.1540046691894532, + "step": 1790 + }, + { + "epoch": 0.255500354861604, + "grad_norm": 7.865696430206299, + "learning_rate": 9.974478353442159e-05, + "loss": 1.1665989875793457, + "step": 1800 + }, + { + "epoch": 0.25691980127750175, + "grad_norm": 12.1609468460083, + "learning_rate": 9.974336408800569e-05, + "loss": 1.120746898651123, + "step": 1810 + }, + { + "epoch": 0.25833924769339955, + "grad_norm": 9.554359436035156, + "learning_rate": 9.974194464158978e-05, + "loss": 1.3381189346313476, + "step": 1820 + }, + { + "epoch": 0.25975869410929736, + "grad_norm": 9.497129440307617, + "learning_rate": 9.974052519517388e-05, + "loss": 1.1758546829223633, + "step": 1830 + }, + { + "epoch": 0.26117814052519517, + "grad_norm": 10.584992408752441, + "learning_rate": 9.973910574875799e-05, + "loss": 1.0787659645080567, + "step": 1840 + }, + { + "epoch": 0.262597586941093, + "grad_norm": 9.558980941772461, + "learning_rate": 9.973768630234209e-05, + "loss": 0.9334567070007325, + "step": 1850 + }, + { + "epoch": 0.2640170333569908, + "grad_norm": 9.41112995147705, + "learning_rate": 9.97362668559262e-05, + "loss": 1.1376053810119628, + "step": 1860 + }, + { + "epoch": 0.2654364797728886, + "grad_norm": 11.666831970214844, + "learning_rate": 9.973484740951028e-05, + "loss": 1.207914447784424, + "step": 1870 + }, + { + "epoch": 0.2668559261887864, + "grad_norm": 11.217955589294434, + "learning_rate": 9.97334279630944e-05, + "loss": 1.052849578857422, + "step": 1880 + }, + { + "epoch": 0.26827537260468415, + "grad_norm": 8.3615083694458, + "learning_rate": 9.97320085166785e-05, + "loss": 0.9782976150512696, + "step": 1890 + }, + { + "epoch": 0.26969481902058196, + "grad_norm": 10.69944953918457, + "learning_rate": 9.97305890702626e-05, + "loss": 0.9639101982116699, + "step": 1900 + }, + { + "epoch": 0.27111426543647976, + "grad_norm": 11.15194034576416, + "learning_rate": 9.972916962384671e-05, + "loss": 
1.0744239807128906, + "step": 1910 + }, + { + "epoch": 0.27253371185237757, + "grad_norm": 10.363690376281738, + "learning_rate": 9.972775017743081e-05, + "loss": 1.1180108070373536, + "step": 1920 + }, + { + "epoch": 0.2739531582682754, + "grad_norm": 10.816513061523438, + "learning_rate": 9.972633073101491e-05, + "loss": 1.118791103363037, + "step": 1930 + }, + { + "epoch": 0.2753726046841732, + "grad_norm": 8.64388656616211, + "learning_rate": 9.9724911284599e-05, + "loss": 1.1368459701538085, + "step": 1940 + }, + { + "epoch": 0.276792051100071, + "grad_norm": 9.002252578735352, + "learning_rate": 9.972349183818312e-05, + "loss": 1.1344121932983398, + "step": 1950 + }, + { + "epoch": 0.2782114975159688, + "grad_norm": 11.083386421203613, + "learning_rate": 9.972207239176721e-05, + "loss": 1.1827295303344727, + "step": 1960 + }, + { + "epoch": 0.27963094393186655, + "grad_norm": 8.360145568847656, + "learning_rate": 9.972065294535133e-05, + "loss": 0.9954969406127929, + "step": 1970 + }, + { + "epoch": 0.28105039034776436, + "grad_norm": 12.982026100158691, + "learning_rate": 9.971923349893542e-05, + "loss": 0.9865982055664062, + "step": 1980 + }, + { + "epoch": 0.28246983676366216, + "grad_norm": 9.3854341506958, + "learning_rate": 9.971781405251952e-05, + "loss": 0.9238475799560547, + "step": 1990 + }, + { + "epoch": 0.28388928317955997, + "grad_norm": 10.693597793579102, + "learning_rate": 9.971639460610363e-05, + "loss": 0.9660484313964843, + "step": 2000 + }, + { + "epoch": 0.28388928317955997, + "eval_accuracy": 0.6596935206968907, + "eval_loss": 1.0827350616455078, + "eval_runtime": 31.44, + "eval_samples_per_second": 500.222, + "eval_steps_per_second": 15.649, + "step": 2000 + }, + { + "epoch": 0.2853087295954578, + "grad_norm": 11.403952598571777, + "learning_rate": 9.971497515968773e-05, + "loss": 0.987885856628418, + "step": 2010 + }, + { + "epoch": 0.2867281760113556, + "grad_norm": 11.068461418151855, + "learning_rate": 9.971355571327184e-05, + 
"loss": 1.0180384635925293, + "step": 2020 + }, + { + "epoch": 0.2881476224272534, + "grad_norm": 10.536505699157715, + "learning_rate": 9.971213626685592e-05, + "loss": 1.0148059844970703, + "step": 2030 + }, + { + "epoch": 0.2895670688431512, + "grad_norm": 9.358129501342773, + "learning_rate": 9.971071682044003e-05, + "loss": 0.9920819282531739, + "step": 2040 + }, + { + "epoch": 0.29098651525904895, + "grad_norm": 10.33521842956543, + "learning_rate": 9.970929737402413e-05, + "loss": 1.0010162353515626, + "step": 2050 + }, + { + "epoch": 0.29240596167494676, + "grad_norm": 10.490190505981445, + "learning_rate": 9.970787792760824e-05, + "loss": 0.9781021118164063, + "step": 2060 + }, + { + "epoch": 0.29382540809084456, + "grad_norm": 9.507524490356445, + "learning_rate": 9.970645848119234e-05, + "loss": 0.9722440719604493, + "step": 2070 + }, + { + "epoch": 0.29524485450674237, + "grad_norm": 10.77835464477539, + "learning_rate": 9.970503903477644e-05, + "loss": 0.9851055145263672, + "step": 2080 + }, + { + "epoch": 0.2966643009226402, + "grad_norm": 9.847874641418457, + "learning_rate": 9.970361958836055e-05, + "loss": 0.8958380699157715, + "step": 2090 + }, + { + "epoch": 0.298083747338538, + "grad_norm": 11.703569412231445, + "learning_rate": 9.970220014194465e-05, + "loss": 0.9267073631286621, + "step": 2100 + }, + { + "epoch": 0.2995031937544358, + "grad_norm": 6.974740028381348, + "learning_rate": 9.970078069552876e-05, + "loss": 0.787592887878418, + "step": 2110 + }, + { + "epoch": 0.30092264017033354, + "grad_norm": 5.989770889282227, + "learning_rate": 9.969936124911285e-05, + "loss": 0.8309663772583008, + "step": 2120 + }, + { + "epoch": 0.30234208658623135, + "grad_norm": 8.477362632751465, + "learning_rate": 9.969794180269695e-05, + "loss": 0.8926510810852051, + "step": 2130 + }, + { + "epoch": 0.30376153300212916, + "grad_norm": 8.412622451782227, + "learning_rate": 9.969652235628105e-05, + "loss": 0.8592344284057617, + "step": 2140 + }, + { + 
"epoch": 0.30518097941802697, + "grad_norm": 10.356178283691406, + "learning_rate": 9.969510290986516e-05, + "loss": 0.8827583312988281, + "step": 2150 + }, + { + "epoch": 0.3066004258339248, + "grad_norm": 7.666086673736572, + "learning_rate": 9.969368346344926e-05, + "loss": 0.9427967071533203, + "step": 2160 + }, + { + "epoch": 0.3080198722498226, + "grad_norm": 11.2577486038208, + "learning_rate": 9.969226401703337e-05, + "loss": 0.8580154418945313, + "step": 2170 + }, + { + "epoch": 0.3094393186657204, + "grad_norm": 10.915003776550293, + "learning_rate": 9.969084457061746e-05, + "loss": 0.9019613265991211, + "step": 2180 + }, + { + "epoch": 0.3108587650816182, + "grad_norm": 9.683639526367188, + "learning_rate": 9.968942512420156e-05, + "loss": 0.8982448577880859, + "step": 2190 + }, + { + "epoch": 0.31227821149751595, + "grad_norm": 8.5520601272583, + "learning_rate": 9.968800567778567e-05, + "loss": 0.7967979431152343, + "step": 2200 + }, + { + "epoch": 0.31369765791341375, + "grad_norm": 11.931614875793457, + "learning_rate": 9.968658623136977e-05, + "loss": 0.9725972175598144, + "step": 2210 + }, + { + "epoch": 0.31511710432931156, + "grad_norm": 11.004504203796387, + "learning_rate": 9.968516678495388e-05, + "loss": 0.9189895629882813, + "step": 2220 + }, + { + "epoch": 0.31653655074520937, + "grad_norm": 9.460184097290039, + "learning_rate": 9.968374733853797e-05, + "loss": 0.9253165245056152, + "step": 2230 + }, + { + "epoch": 0.3179559971611072, + "grad_norm": 9.675958633422852, + "learning_rate": 9.968232789212208e-05, + "loss": 0.7977495670318604, + "step": 2240 + }, + { + "epoch": 0.319375443577005, + "grad_norm": 8.858159065246582, + "learning_rate": 9.968090844570617e-05, + "loss": 0.9056186676025391, + "step": 2250 + }, + { + "epoch": 0.3207948899929028, + "grad_norm": 10.144878387451172, + "learning_rate": 9.967948899929028e-05, + "loss": 0.9273244857788085, + "step": 2260 + }, + { + "epoch": 0.3222143364088006, + "grad_norm": 9.78799819946289, 
+ "learning_rate": 9.967806955287438e-05, + "loss": 0.8192936897277832, + "step": 2270 + }, + { + "epoch": 0.32363378282469835, + "grad_norm": 8.891179084777832, + "learning_rate": 9.967665010645849e-05, + "loss": 0.8823507308959961, + "step": 2280 + }, + { + "epoch": 0.32505322924059615, + "grad_norm": 9.303411483764648, + "learning_rate": 9.967523066004259e-05, + "loss": 0.8374591827392578, + "step": 2290 + }, + { + "epoch": 0.32647267565649396, + "grad_norm": 8.408880233764648, + "learning_rate": 9.967381121362669e-05, + "loss": 0.849891471862793, + "step": 2300 + }, + { + "epoch": 0.32789212207239177, + "grad_norm": 9.384819030761719, + "learning_rate": 9.96723917672108e-05, + "loss": 0.7750972747802735, + "step": 2310 + }, + { + "epoch": 0.3293115684882896, + "grad_norm": 9.170500755310059, + "learning_rate": 9.96709723207949e-05, + "loss": 0.7687624454498291, + "step": 2320 + }, + { + "epoch": 0.3307310149041874, + "grad_norm": 8.488929748535156, + "learning_rate": 9.966955287437901e-05, + "loss": 0.8498885154724121, + "step": 2330 + }, + { + "epoch": 0.3321504613200852, + "grad_norm": 10.291971206665039, + "learning_rate": 9.966813342796309e-05, + "loss": 0.7807302474975586, + "step": 2340 + }, + { + "epoch": 0.33356990773598294, + "grad_norm": 11.644806861877441, + "learning_rate": 9.96667139815472e-05, + "loss": 0.7917065143585205, + "step": 2350 + }, + { + "epoch": 0.33498935415188075, + "grad_norm": 13.938374519348145, + "learning_rate": 9.96652945351313e-05, + "loss": 0.8718063354492187, + "step": 2360 + }, + { + "epoch": 0.33640880056777855, + "grad_norm": 10.399706840515137, + "learning_rate": 9.966387508871541e-05, + "loss": 0.8703582763671875, + "step": 2370 + }, + { + "epoch": 0.33782824698367636, + "grad_norm": 7.870115756988525, + "learning_rate": 9.966245564229951e-05, + "loss": 0.8549924850463867, + "step": 2380 + }, + { + "epoch": 0.33924769339957417, + "grad_norm": 9.777918815612793, + "learning_rate": 9.96610361958836e-05, + "loss": 
1.0234166145324708, + "step": 2390 + }, + { + "epoch": 0.340667139815472, + "grad_norm": 10.103452682495117, + "learning_rate": 9.965961674946772e-05, + "loss": 0.9040670394897461, + "step": 2400 + }, + { + "epoch": 0.3420865862313698, + "grad_norm": 10.497400283813477, + "learning_rate": 9.965819730305181e-05, + "loss": 0.7955552577972412, + "step": 2410 + }, + { + "epoch": 0.3435060326472676, + "grad_norm": 8.0149564743042, + "learning_rate": 9.965677785663592e-05, + "loss": 0.856791877746582, + "step": 2420 + }, + { + "epoch": 0.34492547906316534, + "grad_norm": 8.111480712890625, + "learning_rate": 9.965535841022002e-05, + "loss": 0.8129085540771485, + "step": 2430 + }, + { + "epoch": 0.34634492547906315, + "grad_norm": 7.93813419342041, + "learning_rate": 9.965393896380412e-05, + "loss": 0.809941291809082, + "step": 2440 + }, + { + "epoch": 0.34776437189496096, + "grad_norm": 10.88427448272705, + "learning_rate": 9.965251951738822e-05, + "loss": 0.7622882843017578, + "step": 2450 + }, + { + "epoch": 0.34918381831085876, + "grad_norm": 9.509648323059082, + "learning_rate": 9.965110007097233e-05, + "loss": 0.7235064029693603, + "step": 2460 + }, + { + "epoch": 0.35060326472675657, + "grad_norm": 10.343646049499512, + "learning_rate": 9.964968062455642e-05, + "loss": 0.7792426586151123, + "step": 2470 + }, + { + "epoch": 0.3520227111426544, + "grad_norm": 11.936261177062988, + "learning_rate": 9.964826117814054e-05, + "loss": 0.8023401260375976, + "step": 2480 + }, + { + "epoch": 0.3534421575585522, + "grad_norm": 8.382633209228516, + "learning_rate": 9.964684173172463e-05, + "loss": 0.7960898399353027, + "step": 2490 + }, + { + "epoch": 0.35486160397445, + "grad_norm": 11.01586627960205, + "learning_rate": 9.964542228530873e-05, + "loss": 0.7975746631622315, + "step": 2500 + }, + { + "epoch": 0.35486160397445, + "eval_accuracy": 0.730527118967381, + "eval_loss": 0.8166059255599976, + "eval_runtime": 32.3274, + "eval_samples_per_second": 486.491, + 
"eval_steps_per_second": 15.219, + "step": 2500 + }, + { + "epoch": 0.35628105039034774, + "grad_norm": 8.113981246948242, + "learning_rate": 9.964400283889284e-05, + "loss": 0.7863178253173828, + "step": 2510 + }, + { + "epoch": 0.35770049680624555, + "grad_norm": 9.127975463867188, + "learning_rate": 9.964258339247694e-05, + "loss": 0.8487259864807128, + "step": 2520 + }, + { + "epoch": 0.35911994322214336, + "grad_norm": 8.597822189331055, + "learning_rate": 9.964116394606105e-05, + "loss": 0.8151129722595215, + "step": 2530 + }, + { + "epoch": 0.36053938963804116, + "grad_norm": 8.069273948669434, + "learning_rate": 9.963974449964513e-05, + "loss": 0.6664574623107911, + "step": 2540 + }, + { + "epoch": 0.36195883605393897, + "grad_norm": 8.314419746398926, + "learning_rate": 9.963832505322924e-05, + "loss": 0.8365516662597656, + "step": 2550 + }, + { + "epoch": 0.3633782824698368, + "grad_norm": 9.172304153442383, + "learning_rate": 9.963690560681334e-05, + "loss": 0.7865428924560547, + "step": 2560 + }, + { + "epoch": 0.3647977288857346, + "grad_norm": 9.639200210571289, + "learning_rate": 9.963548616039745e-05, + "loss": 0.7925633430480957, + "step": 2570 + }, + { + "epoch": 0.36621717530163234, + "grad_norm": 8.856132507324219, + "learning_rate": 9.963406671398155e-05, + "loss": 0.7005198955535888, + "step": 2580 + }, + { + "epoch": 0.36763662171753014, + "grad_norm": 7.9700422286987305, + "learning_rate": 9.963264726756566e-05, + "loss": 0.6712905883789062, + "step": 2590 + }, + { + "epoch": 0.36905606813342795, + "grad_norm": 9.465399742126465, + "learning_rate": 9.963122782114976e-05, + "loss": 0.7288703441619873, + "step": 2600 + }, + { + "epoch": 0.37047551454932576, + "grad_norm": 8.769003868103027, + "learning_rate": 9.962980837473386e-05, + "loss": 0.7671696662902832, + "step": 2610 + }, + { + "epoch": 0.37189496096522356, + "grad_norm": 6.981420040130615, + "learning_rate": 9.962838892831797e-05, + "loss": 0.6548487663269043, + "step": 2620 + }, + { 
+ "epoch": 0.37331440738112137, + "grad_norm": 8.440009117126465, + "learning_rate": 9.962696948190206e-05, + "loss": 0.705223274230957, + "step": 2630 + }, + { + "epoch": 0.3747338537970192, + "grad_norm": 12.392814636230469, + "learning_rate": 9.962555003548617e-05, + "loss": 0.8219353675842285, + "step": 2640 + }, + { + "epoch": 0.376153300212917, + "grad_norm": 9.1260404586792, + "learning_rate": 9.962413058907026e-05, + "loss": 0.7202134132385254, + "step": 2650 + }, + { + "epoch": 0.37757274662881474, + "grad_norm": 9.437945365905762, + "learning_rate": 9.962271114265437e-05, + "loss": 0.7196836471557617, + "step": 2660 + }, + { + "epoch": 0.37899219304471254, + "grad_norm": 8.03176212310791, + "learning_rate": 9.962129169623847e-05, + "loss": 0.6017679214477539, + "step": 2670 + }, + { + "epoch": 0.38041163946061035, + "grad_norm": 11.21246337890625, + "learning_rate": 9.961987224982258e-05, + "loss": 0.8073585510253907, + "step": 2680 + }, + { + "epoch": 0.38183108587650816, + "grad_norm": 8.937601089477539, + "learning_rate": 9.961845280340667e-05, + "loss": 0.5765426635742188, + "step": 2690 + }, + { + "epoch": 0.38325053229240597, + "grad_norm": 10.750785827636719, + "learning_rate": 9.961703335699077e-05, + "loss": 0.850700569152832, + "step": 2700 + }, + { + "epoch": 0.3846699787083038, + "grad_norm": 8.476407051086426, + "learning_rate": 9.961561391057488e-05, + "loss": 0.6841172695159912, + "step": 2710 + }, + { + "epoch": 0.3860894251242016, + "grad_norm": 8.174555778503418, + "learning_rate": 9.961419446415898e-05, + "loss": 0.6521795272827149, + "step": 2720 + }, + { + "epoch": 0.3875088715400994, + "grad_norm": 6.744903564453125, + "learning_rate": 9.961277501774309e-05, + "loss": 0.7194175720214844, + "step": 2730 + }, + { + "epoch": 0.38892831795599714, + "grad_norm": 7.107284069061279, + "learning_rate": 9.961135557132719e-05, + "loss": 0.7437104701995849, + "step": 2740 + }, + { + "epoch": 0.39034776437189495, + "grad_norm": 
12.026649475097656, + "learning_rate": 9.960993612491129e-05, + "loss": 0.7481307029724121, + "step": 2750 + }, + { + "epoch": 0.39176721078779275, + "grad_norm": 10.131022453308105, + "learning_rate": 9.960851667849538e-05, + "loss": 0.6669661521911621, + "step": 2760 + }, + { + "epoch": 0.39318665720369056, + "grad_norm": 7.589590072631836, + "learning_rate": 9.96070972320795e-05, + "loss": 0.5883037567138671, + "step": 2770 + }, + { + "epoch": 0.39460610361958837, + "grad_norm": 8.32777214050293, + "learning_rate": 9.960567778566359e-05, + "loss": 0.6772464752197266, + "step": 2780 + }, + { + "epoch": 0.3960255500354862, + "grad_norm": 6.111226558685303, + "learning_rate": 9.96042583392477e-05, + "loss": 0.6943521976470948, + "step": 2790 + }, + { + "epoch": 0.397444996451384, + "grad_norm": 10.40073299407959, + "learning_rate": 9.96028388928318e-05, + "loss": 0.6597262382507324, + "step": 2800 + }, + { + "epoch": 0.3988644428672818, + "grad_norm": 11.990081787109375, + "learning_rate": 9.96014194464159e-05, + "loss": 0.6846660614013672, + "step": 2810 + }, + { + "epoch": 0.40028388928317954, + "grad_norm": 7.820896625518799, + "learning_rate": 9.960000000000001e-05, + "loss": 0.5972445487976075, + "step": 2820 + }, + { + "epoch": 0.40170333569907735, + "grad_norm": 9.078740119934082, + "learning_rate": 9.95985805535841e-05, + "loss": 0.7440935611724854, + "step": 2830 + }, + { + "epoch": 0.40312278211497515, + "grad_norm": 8.869423866271973, + "learning_rate": 9.959716110716822e-05, + "loss": 0.7406916141510009, + "step": 2840 + }, + { + "epoch": 0.40454222853087296, + "grad_norm": 9.250556945800781, + "learning_rate": 9.95957416607523e-05, + "loss": 0.6444163799285889, + "step": 2850 + }, + { + "epoch": 0.40596167494677077, + "grad_norm": 12.534906387329102, + "learning_rate": 9.959432221433641e-05, + "loss": 0.7008297920227051, + "step": 2860 + }, + { + "epoch": 0.4073811213626686, + "grad_norm": 10.320120811462402, + "learning_rate": 9.959290276792051e-05, + 
"loss": 0.6261786460876465, + "step": 2870 + }, + { + "epoch": 0.4088005677785664, + "grad_norm": 7.483973979949951, + "learning_rate": 9.959148332150462e-05, + "loss": 0.6434149742126465, + "step": 2880 + }, + { + "epoch": 0.41022001419446413, + "grad_norm": 9.007946014404297, + "learning_rate": 9.959006387508872e-05, + "loss": 0.6796345233917236, + "step": 2890 + }, + { + "epoch": 0.41163946061036194, + "grad_norm": 8.191641807556152, + "learning_rate": 9.958864442867281e-05, + "loss": 0.5003190994262695, + "step": 2900 + }, + { + "epoch": 0.41305890702625975, + "grad_norm": 9.307744979858398, + "learning_rate": 9.958722498225693e-05, + "loss": 0.6988365173339843, + "step": 2910 + }, + { + "epoch": 0.41447835344215755, + "grad_norm": 6.16031551361084, + "learning_rate": 9.958580553584102e-05, + "loss": 0.6487136840820312, + "step": 2920 + }, + { + "epoch": 0.41589779985805536, + "grad_norm": 9.785910606384277, + "learning_rate": 9.958438608942513e-05, + "loss": 0.6544306755065918, + "step": 2930 + }, + { + "epoch": 0.41731724627395317, + "grad_norm": 12.08917236328125, + "learning_rate": 9.958296664300923e-05, + "loss": 0.6119012832641602, + "step": 2940 + }, + { + "epoch": 0.418736692689851, + "grad_norm": 10.118932723999023, + "learning_rate": 9.958154719659334e-05, + "loss": 0.5515688896179199, + "step": 2950 + }, + { + "epoch": 0.4201561391057488, + "grad_norm": 10.645463943481445, + "learning_rate": 9.958012775017743e-05, + "loss": 0.6795665740966796, + "step": 2960 + }, + { + "epoch": 0.42157558552164653, + "grad_norm": 8.745086669921875, + "learning_rate": 9.957870830376154e-05, + "loss": 0.6612170219421387, + "step": 2970 + }, + { + "epoch": 0.42299503193754434, + "grad_norm": 7.060173511505127, + "learning_rate": 9.957728885734563e-05, + "loss": 0.635819387435913, + "step": 2980 + }, + { + "epoch": 0.42441447835344215, + "grad_norm": 11.630016326904297, + "learning_rate": 9.957586941092975e-05, + "loss": 0.5891122341156005, + "step": 2990 + }, + { + 
"epoch": 0.42583392476933996, + "grad_norm": 11.667549133300781, + "learning_rate": 9.957444996451386e-05, + "loss": 0.7183985233306884, + "step": 3000 + }, + { + "epoch": 0.42583392476933996, + "eval_accuracy": 0.7175557957652445, + "eval_loss": 0.8312568068504333, + "eval_runtime": 32.7465, + "eval_samples_per_second": 480.265, + "eval_steps_per_second": 15.024, + "step": 3000 + }, + { + "epoch": 0.42725337118523776, + "grad_norm": 10.770739555358887, + "learning_rate": 9.957303051809794e-05, + "loss": 0.606045913696289, + "step": 3010 + }, + { + "epoch": 0.42867281760113557, + "grad_norm": 8.715160369873047, + "learning_rate": 9.957161107168205e-05, + "loss": 0.6968401908874512, + "step": 3020 + }, + { + "epoch": 0.4300922640170334, + "grad_norm": 10.227581977844238, + "learning_rate": 9.957019162526615e-05, + "loss": 0.5089622497558594, + "step": 3030 + }, + { + "epoch": 0.4315117104329312, + "grad_norm": 8.32385540008545, + "learning_rate": 9.956877217885026e-05, + "loss": 0.6402715682983399, + "step": 3040 + }, + { + "epoch": 0.43293115684882894, + "grad_norm": 10.973727226257324, + "learning_rate": 9.956735273243436e-05, + "loss": 0.7282869338989257, + "step": 3050 + }, + { + "epoch": 0.43435060326472674, + "grad_norm": 8.994437217712402, + "learning_rate": 9.956593328601845e-05, + "loss": 0.5776423454284668, + "step": 3060 + }, + { + "epoch": 0.43577004968062455, + "grad_norm": 7.597539901733398, + "learning_rate": 9.956451383960255e-05, + "loss": 0.5537106990814209, + "step": 3070 + }, + { + "epoch": 0.43718949609652236, + "grad_norm": 7.695132732391357, + "learning_rate": 9.956309439318666e-05, + "loss": 0.5561283588409424, + "step": 3080 + }, + { + "epoch": 0.43860894251242016, + "grad_norm": 10.008833885192871, + "learning_rate": 9.956167494677077e-05, + "loss": 0.6571722030639648, + "step": 3090 + }, + { + "epoch": 0.44002838892831797, + "grad_norm": 6.440252304077148, + "learning_rate": 9.956025550035487e-05, + "loss": 0.4972050189971924, + "step": 
3100 + }, + { + "epoch": 0.4414478353442158, + "grad_norm": 11.92957878112793, + "learning_rate": 9.955883605393897e-05, + "loss": 0.6483690738677979, + "step": 3110 + }, + { + "epoch": 0.44286728176011353, + "grad_norm": 8.40812873840332, + "learning_rate": 9.955741660752307e-05, + "loss": 0.602755069732666, + "step": 3120 + }, + { + "epoch": 0.44428672817601134, + "grad_norm": 6.782786846160889, + "learning_rate": 9.955599716110718e-05, + "loss": 0.6320923328399658, + "step": 3130 + }, + { + "epoch": 0.44570617459190914, + "grad_norm": 12.326107025146484, + "learning_rate": 9.955457771469127e-05, + "loss": 0.5779653549194336, + "step": 3140 + }, + { + "epoch": 0.44712562100780695, + "grad_norm": 12.876483917236328, + "learning_rate": 9.955315826827538e-05, + "loss": 0.7216415882110596, + "step": 3150 + }, + { + "epoch": 0.44854506742370476, + "grad_norm": 6.984850883483887, + "learning_rate": 9.955173882185947e-05, + "loss": 0.4415611267089844, + "step": 3160 + }, + { + "epoch": 0.44996451383960256, + "grad_norm": 6.711297512054443, + "learning_rate": 9.955031937544358e-05, + "loss": 0.5505913734436035, + "step": 3170 + }, + { + "epoch": 0.45138396025550037, + "grad_norm": 7.127682685852051, + "learning_rate": 9.954889992902769e-05, + "loss": 0.5563027858734131, + "step": 3180 + }, + { + "epoch": 0.4528034066713982, + "grad_norm": 9.826492309570312, + "learning_rate": 9.954748048261179e-05, + "loss": 0.5204686641693115, + "step": 3190 + }, + { + "epoch": 0.45422285308729593, + "grad_norm": 14.011224746704102, + "learning_rate": 9.95460610361959e-05, + "loss": 0.5676139831542969, + "step": 3200 + }, + { + "epoch": 0.45564229950319374, + "grad_norm": 10.502514839172363, + "learning_rate": 9.954464158977998e-05, + "loss": 0.6593122482299805, + "step": 3210 + }, + { + "epoch": 0.45706174591909154, + "grad_norm": 9.966157913208008, + "learning_rate": 9.95432221433641e-05, + "loss": 0.5757305145263671, + "step": 3220 + }, + { + "epoch": 0.45848119233498935, + 
"grad_norm": 7.551996231079102, + "learning_rate": 9.954180269694819e-05, + "loss": 0.5537711620330811, + "step": 3230 + }, + { + "epoch": 0.45990063875088716, + "grad_norm": 10.630086898803711, + "learning_rate": 9.95403832505323e-05, + "loss": 0.5302771091461181, + "step": 3240 + }, + { + "epoch": 0.46132008516678497, + "grad_norm": 12.471774101257324, + "learning_rate": 9.95389638041164e-05, + "loss": 0.6347667694091796, + "step": 3250 + }, + { + "epoch": 0.4627395315826828, + "grad_norm": 9.668441772460938, + "learning_rate": 9.95375443577005e-05, + "loss": 0.5615960121154785, + "step": 3260 + }, + { + "epoch": 0.4641589779985806, + "grad_norm": 9.092421531677246, + "learning_rate": 9.953612491128461e-05, + "loss": 0.5531889438629151, + "step": 3270 + }, + { + "epoch": 0.46557842441447833, + "grad_norm": 8.55390453338623, + "learning_rate": 9.95347054648687e-05, + "loss": 0.5149998188018798, + "step": 3280 + }, + { + "epoch": 0.46699787083037614, + "grad_norm": 9.092056274414062, + "learning_rate": 9.953328601845282e-05, + "loss": 0.49632701873779295, + "step": 3290 + }, + { + "epoch": 0.46841731724627395, + "grad_norm": 9.66268253326416, + "learning_rate": 9.953186657203691e-05, + "loss": 0.5612505912780762, + "step": 3300 + }, + { + "epoch": 0.46983676366217175, + "grad_norm": 6.583611011505127, + "learning_rate": 9.953044712562102e-05, + "loss": 0.5805669307708741, + "step": 3310 + }, + { + "epoch": 0.47125621007806956, + "grad_norm": 8.160282135009766, + "learning_rate": 9.952902767920511e-05, + "loss": 0.4320365428924561, + "step": 3320 + }, + { + "epoch": 0.47267565649396737, + "grad_norm": 10.05884075164795, + "learning_rate": 9.952760823278922e-05, + "loss": 0.5736487865447998, + "step": 3330 + }, + { + "epoch": 0.4740951029098652, + "grad_norm": 9.000593185424805, + "learning_rate": 9.952618878637332e-05, + "loss": 0.5238205432891846, + "step": 3340 + }, + { + "epoch": 0.4755145493257629, + "grad_norm": 9.076302528381348, + "learning_rate": 
9.952476933995743e-05, + "loss": 0.5925283432006836, + "step": 3350 + }, + { + "epoch": 0.47693399574166073, + "grad_norm": 8.275947570800781, + "learning_rate": 9.952334989354152e-05, + "loss": 0.4787450313568115, + "step": 3360 + }, + { + "epoch": 0.47835344215755854, + "grad_norm": 12.550822257995605, + "learning_rate": 9.952193044712562e-05, + "loss": 0.49250407218933107, + "step": 3370 + }, + { + "epoch": 0.47977288857345635, + "grad_norm": 6.8708176612854, + "learning_rate": 9.952051100070973e-05, + "loss": 0.585394811630249, + "step": 3380 + }, + { + "epoch": 0.48119233498935415, + "grad_norm": 6.129304885864258, + "learning_rate": 9.951909155429383e-05, + "loss": 0.5862763881683349, + "step": 3390 + }, + { + "epoch": 0.48261178140525196, + "grad_norm": 7.1515045166015625, + "learning_rate": 9.951767210787794e-05, + "loss": 0.46143798828125, + "step": 3400 + }, + { + "epoch": 0.48403122782114977, + "grad_norm": 5.421439170837402, + "learning_rate": 9.951625266146204e-05, + "loss": 0.6040849685668945, + "step": 3410 + }, + { + "epoch": 0.4854506742370476, + "grad_norm": 10.418113708496094, + "learning_rate": 9.951483321504614e-05, + "loss": 0.55996732711792, + "step": 3420 + }, + { + "epoch": 0.4868701206529453, + "grad_norm": 9.697559356689453, + "learning_rate": 9.951341376863023e-05, + "loss": 0.5332645893096923, + "step": 3430 + }, + { + "epoch": 0.48828956706884313, + "grad_norm": 9.79345703125, + "learning_rate": 9.951199432221434e-05, + "loss": 0.5983724117279052, + "step": 3440 + }, + { + "epoch": 0.48970901348474094, + "grad_norm": 7.977105617523193, + "learning_rate": 9.951057487579844e-05, + "loss": 0.6096511840820312, + "step": 3450 + }, + { + "epoch": 0.49112845990063875, + "grad_norm": 6.851355075836182, + "learning_rate": 9.950915542938255e-05, + "loss": 0.4748993396759033, + "step": 3460 + }, + { + "epoch": 0.49254790631653655, + "grad_norm": 4.706153392791748, + "learning_rate": 9.950773598296665e-05, + "loss": 0.544727087020874, + "step": 
3470 + }, + { + "epoch": 0.49396735273243436, + "grad_norm": 9.061712265014648, + "learning_rate": 9.950631653655075e-05, + "loss": 0.5076655387878418, + "step": 3480 + }, + { + "epoch": 0.49538679914833217, + "grad_norm": 7.619383335113525, + "learning_rate": 9.950489709013486e-05, + "loss": 0.5011069297790527, + "step": 3490 + }, + { + "epoch": 0.49680624556423, + "grad_norm": 6.629651069641113, + "learning_rate": 9.950347764371896e-05, + "loss": 0.5038942337036133, + "step": 3500 + }, + { + "epoch": 0.49680624556423, + "eval_accuracy": 0.8245056272652127, + "eval_loss": 0.526730477809906, + "eval_runtime": 32.5263, + "eval_samples_per_second": 483.517, + "eval_steps_per_second": 15.126, + "step": 3500 + }, + { + "epoch": 0.4982256919801277, + "grad_norm": 6.535589694976807, + "learning_rate": 9.950205819730307e-05, + "loss": 0.5255190849304199, + "step": 3510 + }, + { + "epoch": 0.49964513839602553, + "grad_norm": 10.481846809387207, + "learning_rate": 9.950063875088715e-05, + "loss": 0.4977625846862793, + "step": 3520 + }, + { + "epoch": 0.5010645848119234, + "grad_norm": 6.455493450164795, + "learning_rate": 9.949921930447126e-05, + "loss": 0.4624650955200195, + "step": 3530 + }, + { + "epoch": 0.5024840312278211, + "grad_norm": 12.190658569335938, + "learning_rate": 9.949779985805536e-05, + "loss": 0.45445499420166013, + "step": 3540 + }, + { + "epoch": 0.5039034776437189, + "grad_norm": 6.512971878051758, + "learning_rate": 9.949638041163947e-05, + "loss": 0.48822684288024903, + "step": 3550 + }, + { + "epoch": 0.5053229240596168, + "grad_norm": 8.259076118469238, + "learning_rate": 9.949496096522357e-05, + "loss": 0.4896749496459961, + "step": 3560 + }, + { + "epoch": 0.5067423704755145, + "grad_norm": 10.809083938598633, + "learning_rate": 9.949354151880766e-05, + "loss": 0.5267855644226074, + "step": 3570 + }, + { + "epoch": 0.5081618168914124, + "grad_norm": 11.164665222167969, + "learning_rate": 9.949212207239178e-05, + "loss": 0.6478964328765869, + 
"step": 3580 + }, + { + "epoch": 0.5095812633073101, + "grad_norm": 10.553145408630371, + "learning_rate": 9.949070262597587e-05, + "loss": 0.5322469711303711, + "step": 3590 + }, + { + "epoch": 0.511000709723208, + "grad_norm": 12.578235626220703, + "learning_rate": 9.948928317955998e-05, + "loss": 0.559388542175293, + "step": 3600 + }, + { + "epoch": 0.5124201561391057, + "grad_norm": 7.2467474937438965, + "learning_rate": 9.948786373314408e-05, + "loss": 0.5477664470672607, + "step": 3610 + }, + { + "epoch": 0.5138396025550035, + "grad_norm": 5.959977626800537, + "learning_rate": 9.948644428672818e-05, + "loss": 0.41798744201660154, + "step": 3620 + }, + { + "epoch": 0.5152590489709014, + "grad_norm": 11.72385025024414, + "learning_rate": 9.948502484031228e-05, + "loss": 0.5879819869995118, + "step": 3630 + }, + { + "epoch": 0.5166784953867991, + "grad_norm": 7.881444454193115, + "learning_rate": 9.948360539389639e-05, + "loss": 0.5005061149597168, + "step": 3640 + }, + { + "epoch": 0.518097941802697, + "grad_norm": 7.005399703979492, + "learning_rate": 9.948218594748048e-05, + "loss": 0.5412337303161621, + "step": 3650 + }, + { + "epoch": 0.5195173882185947, + "grad_norm": 13.495038032531738, + "learning_rate": 9.94807665010646e-05, + "loss": 0.5668565273284912, + "step": 3660 + }, + { + "epoch": 0.5209368346344926, + "grad_norm": 8.42395305633545, + "learning_rate": 9.947934705464869e-05, + "loss": 0.6085368633270264, + "step": 3670 + }, + { + "epoch": 0.5223562810503903, + "grad_norm": 8.754134178161621, + "learning_rate": 9.947792760823279e-05, + "loss": 0.5360457420349121, + "step": 3680 + }, + { + "epoch": 0.5237757274662882, + "grad_norm": 5.868712425231934, + "learning_rate": 9.94765081618169e-05, + "loss": 0.6166606903076172, + "step": 3690 + }, + { + "epoch": 0.525195173882186, + "grad_norm": 4.342434883117676, + "learning_rate": 9.9475088715401e-05, + "loss": 0.4890284061431885, + "step": 3700 + }, + { + "epoch": 0.5266146202980837, + "grad_norm": 
8.200478553771973, + "learning_rate": 9.947366926898511e-05, + "loss": 0.5135448455810547, + "step": 3710 + }, + { + "epoch": 0.5280340667139816, + "grad_norm": 6.076674938201904, + "learning_rate": 9.94722498225692e-05, + "loss": 0.37685840129852294, + "step": 3720 + }, + { + "epoch": 0.5294535131298793, + "grad_norm": 8.206668853759766, + "learning_rate": 9.94708303761533e-05, + "loss": 0.43741750717163086, + "step": 3730 + }, + { + "epoch": 0.5308729595457772, + "grad_norm": 8.284717559814453, + "learning_rate": 9.94694109297374e-05, + "loss": 0.46701841354370116, + "step": 3740 + }, + { + "epoch": 0.5322924059616749, + "grad_norm": 8.111977577209473, + "learning_rate": 9.946799148332151e-05, + "loss": 0.5564829349517822, + "step": 3750 + }, + { + "epoch": 0.5337118523775728, + "grad_norm": 10.037016868591309, + "learning_rate": 9.946657203690561e-05, + "loss": 0.4543320655822754, + "step": 3760 + }, + { + "epoch": 0.5351312987934705, + "grad_norm": 6.1391191482543945, + "learning_rate": 9.946515259048972e-05, + "loss": 0.43409008979797364, + "step": 3770 + }, + { + "epoch": 0.5365507452093683, + "grad_norm": 9.031709671020508, + "learning_rate": 9.946373314407382e-05, + "loss": 0.45609292984008787, + "step": 3780 + }, + { + "epoch": 0.5379701916252662, + "grad_norm": 10.507880210876465, + "learning_rate": 9.946231369765791e-05, + "loss": 0.49566287994384767, + "step": 3790 + }, + { + "epoch": 0.5393896380411639, + "grad_norm": 7.94572114944458, + "learning_rate": 9.946089425124203e-05, + "loss": 0.43464975357055663, + "step": 3800 + }, + { + "epoch": 0.5408090844570618, + "grad_norm": 11.292725563049316, + "learning_rate": 9.945947480482612e-05, + "loss": 0.4976043224334717, + "step": 3810 + }, + { + "epoch": 0.5422285308729595, + "grad_norm": 9.720746040344238, + "learning_rate": 9.945805535841023e-05, + "loss": 0.44420394897460935, + "step": 3820 + }, + { + "epoch": 0.5436479772888574, + "grad_norm": 10.859402656555176, + "learning_rate": 
9.945663591199432e-05, + "loss": 0.47884187698364256, + "step": 3830 + }, + { + "epoch": 0.5450674237047551, + "grad_norm": 10.234602928161621, + "learning_rate": 9.945521646557843e-05, + "loss": 0.4273094654083252, + "step": 3840 + }, + { + "epoch": 0.5464868701206529, + "grad_norm": 10.073461532592773, + "learning_rate": 9.945379701916253e-05, + "loss": 0.4809098243713379, + "step": 3850 + }, + { + "epoch": 0.5479063165365508, + "grad_norm": 8.402386665344238, + "learning_rate": 9.945237757274664e-05, + "loss": 0.5075035572052002, + "step": 3860 + }, + { + "epoch": 0.5493257629524485, + "grad_norm": 8.385801315307617, + "learning_rate": 9.945095812633073e-05, + "loss": 0.42679743766784667, + "step": 3870 + }, + { + "epoch": 0.5507452093683464, + "grad_norm": 8.214275360107422, + "learning_rate": 9.944953867991483e-05, + "loss": 0.42831969261169434, + "step": 3880 + }, + { + "epoch": 0.5521646557842441, + "grad_norm": 6.777364730834961, + "learning_rate": 9.944811923349894e-05, + "loss": 0.37784249782562257, + "step": 3890 + }, + { + "epoch": 0.553584102200142, + "grad_norm": 7.4766011238098145, + "learning_rate": 9.944669978708304e-05, + "loss": 0.5389047622680664, + "step": 3900 + }, + { + "epoch": 0.5550035486160397, + "grad_norm": 7.167613983154297, + "learning_rate": 9.944528034066715e-05, + "loss": 0.4535686492919922, + "step": 3910 + }, + { + "epoch": 0.5564229950319376, + "grad_norm": 3.94936203956604, + "learning_rate": 9.944386089425125e-05, + "loss": 0.392999267578125, + "step": 3920 + }, + { + "epoch": 0.5578424414478353, + "grad_norm": 7.909378528594971, + "learning_rate": 9.944244144783535e-05, + "loss": 0.4675307750701904, + "step": 3930 + }, + { + "epoch": 0.5592618878637331, + "grad_norm": 8.253449440002441, + "learning_rate": 9.944102200141944e-05, + "loss": 0.515011215209961, + "step": 3940 + }, + { + "epoch": 0.560681334279631, + "grad_norm": 5.535346984863281, + "learning_rate": 9.943960255500355e-05, + "loss": 0.35612196922302247, + "step": 
3950 + }, + { + "epoch": 0.5621007806955287, + "grad_norm": 5.621975898742676, + "learning_rate": 9.943818310858765e-05, + "loss": 0.3292267322540283, + "step": 3960 + }, + { + "epoch": 0.5635202271114266, + "grad_norm": 8.432771682739258, + "learning_rate": 9.943676366217176e-05, + "loss": 0.44489707946777346, + "step": 3970 + }, + { + "epoch": 0.5649396735273243, + "grad_norm": 5.422188758850098, + "learning_rate": 9.943534421575586e-05, + "loss": 0.4312278270721436, + "step": 3980 + }, + { + "epoch": 0.5663591199432222, + "grad_norm": 6.463229179382324, + "learning_rate": 9.943392476933996e-05, + "loss": 0.4469761371612549, + "step": 3990 + }, + { + "epoch": 0.5677785663591199, + "grad_norm": 12.039133071899414, + "learning_rate": 9.943250532292407e-05, + "loss": 0.5653008937835693, + "step": 4000 + }, + { + "epoch": 0.5677785663591199, + "eval_accuracy": 0.8540726139759649, + "eval_loss": 0.4309617578983307, + "eval_runtime": 32.698, + "eval_samples_per_second": 480.977, + "eval_steps_per_second": 15.047, + "step": 4000 + }, + { + "epoch": 0.5691980127750177, + "grad_norm": 12.185803413391113, + "learning_rate": 9.943108587650817e-05, + "loss": 0.4458905220031738, + "step": 4010 + }, + { + "epoch": 0.5706174591909156, + "grad_norm": 9.691877365112305, + "learning_rate": 9.942966643009228e-05, + "loss": 0.39486720561981203, + "step": 4020 + }, + { + "epoch": 0.5720369056068133, + "grad_norm": 8.106902122497559, + "learning_rate": 9.942824698367637e-05, + "loss": 0.49444093704223635, + "step": 4030 + }, + { + "epoch": 0.5734563520227112, + "grad_norm": 9.14234447479248, + "learning_rate": 9.942682753726047e-05, + "loss": 0.43038039207458495, + "step": 4040 + }, + { + "epoch": 0.5748757984386089, + "grad_norm": 4.6097588539123535, + "learning_rate": 9.942540809084457e-05, + "loss": 0.4118741512298584, + "step": 4050 + }, + { + "epoch": 0.5762952448545068, + "grad_norm": 6.0909881591796875, + "learning_rate": 9.942398864442868e-05, + "loss": 0.4245272159576416, + 
"step": 4060 + }, + { + "epoch": 0.5777146912704045, + "grad_norm": 10.82681941986084, + "learning_rate": 9.942256919801278e-05, + "loss": 0.43029065132141114, + "step": 4070 + }, + { + "epoch": 0.5791341376863024, + "grad_norm": 7.2398481369018555, + "learning_rate": 9.942114975159689e-05, + "loss": 0.4268779277801514, + "step": 4080 + }, + { + "epoch": 0.5805535841022001, + "grad_norm": 12.160025596618652, + "learning_rate": 9.941973030518099e-05, + "loss": 0.45933380126953127, + "step": 4090 + }, + { + "epoch": 0.5819730305180979, + "grad_norm": 8.116787910461426, + "learning_rate": 9.941831085876508e-05, + "loss": 0.3810285568237305, + "step": 4100 + }, + { + "epoch": 0.5833924769339958, + "grad_norm": 7.5045037269592285, + "learning_rate": 9.94168914123492e-05, + "loss": 0.48673238754272463, + "step": 4110 + }, + { + "epoch": 0.5848119233498935, + "grad_norm": 10.9375581741333, + "learning_rate": 9.941547196593329e-05, + "loss": 0.5465658664703369, + "step": 4120 + }, + { + "epoch": 0.5862313697657914, + "grad_norm": 9.211751937866211, + "learning_rate": 9.94140525195174e-05, + "loss": 0.46831202507019043, + "step": 4130 + }, + { + "epoch": 0.5876508161816891, + "grad_norm": 7.636734485626221, + "learning_rate": 9.941263307310149e-05, + "loss": 0.3928233623504639, + "step": 4140 + }, + { + "epoch": 0.589070262597587, + "grad_norm": 7.125626564025879, + "learning_rate": 9.94112136266856e-05, + "loss": 0.40816364288330076, + "step": 4150 + }, + { + "epoch": 0.5904897090134847, + "grad_norm": 5.0693888664245605, + "learning_rate": 9.94097941802697e-05, + "loss": 0.3931445837020874, + "step": 4160 + }, + { + "epoch": 0.5919091554293825, + "grad_norm": 8.10261058807373, + "learning_rate": 9.94083747338538e-05, + "loss": 0.4498757839202881, + "step": 4170 + }, + { + "epoch": 0.5933286018452804, + "grad_norm": 9.593578338623047, + "learning_rate": 9.94069552874379e-05, + "loss": 0.46671414375305176, + "step": 4180 + }, + { + "epoch": 0.5947480482611781, + 
"grad_norm": 10.025617599487305, + "learning_rate": 9.9405535841022e-05, + "loss": 0.42932772636413574, + "step": 4190 + }, + { + "epoch": 0.596167494677076, + "grad_norm": 9.828198432922363, + "learning_rate": 9.940411639460611e-05, + "loss": 0.4723196506500244, + "step": 4200 + }, + { + "epoch": 0.5975869410929737, + "grad_norm": 7.570648193359375, + "learning_rate": 9.940269694819021e-05, + "loss": 0.4049358367919922, + "step": 4210 + }, + { + "epoch": 0.5990063875088716, + "grad_norm": 6.280502796173096, + "learning_rate": 9.940127750177432e-05, + "loss": 0.4247574806213379, + "step": 4220 + }, + { + "epoch": 0.6004258339247693, + "grad_norm": 8.619515419006348, + "learning_rate": 9.939985805535842e-05, + "loss": 0.3778993606567383, + "step": 4230 + }, + { + "epoch": 0.6018452803406671, + "grad_norm": 7.0030059814453125, + "learning_rate": 9.939843860894251e-05, + "loss": 0.4360033988952637, + "step": 4240 + }, + { + "epoch": 0.603264726756565, + "grad_norm": 6.206148624420166, + "learning_rate": 9.939701916252661e-05, + "loss": 0.4114119529724121, + "step": 4250 + }, + { + "epoch": 0.6046841731724627, + "grad_norm": 4.982306003570557, + "learning_rate": 9.939559971611072e-05, + "loss": 0.36019244194030764, + "step": 4260 + }, + { + "epoch": 0.6061036195883606, + "grad_norm": 7.193652153015137, + "learning_rate": 9.939418026969482e-05, + "loss": 0.47596259117126466, + "step": 4270 + }, + { + "epoch": 0.6075230660042583, + "grad_norm": 9.371147155761719, + "learning_rate": 9.939276082327893e-05, + "loss": 0.42912769317626953, + "step": 4280 + }, + { + "epoch": 0.6089425124201562, + "grad_norm": 8.962141036987305, + "learning_rate": 9.939134137686303e-05, + "loss": 0.42342243194580076, + "step": 4290 + }, + { + "epoch": 0.6103619588360539, + "grad_norm": 7.575186252593994, + "learning_rate": 9.938992193044712e-05, + "loss": 0.4972747802734375, + "step": 4300 + }, + { + "epoch": 0.6117814052519518, + "grad_norm": 6.965094566345215, + "learning_rate": 
9.938850248403124e-05, + "loss": 0.3489841938018799, + "step": 4310 + }, + { + "epoch": 0.6132008516678495, + "grad_norm": 8.466391563415527, + "learning_rate": 9.938708303761533e-05, + "loss": 0.3544389009475708, + "step": 4320 + }, + { + "epoch": 0.6146202980837473, + "grad_norm": 6.5821123123168945, + "learning_rate": 9.938566359119944e-05, + "loss": 0.4732979297637939, + "step": 4330 + }, + { + "epoch": 0.6160397444996452, + "grad_norm": 6.803234100341797, + "learning_rate": 9.938424414478353e-05, + "loss": 0.39069912433624265, + "step": 4340 + }, + { + "epoch": 0.6174591909155429, + "grad_norm": 10.069840431213379, + "learning_rate": 9.938282469836764e-05, + "loss": 0.4592564582824707, + "step": 4350 + }, + { + "epoch": 0.6188786373314408, + "grad_norm": 9.41560173034668, + "learning_rate": 9.938140525195174e-05, + "loss": 0.45508289337158203, + "step": 4360 + }, + { + "epoch": 0.6202980837473385, + "grad_norm": 8.344886779785156, + "learning_rate": 9.937998580553585e-05, + "loss": 0.41198153495788575, + "step": 4370 + }, + { + "epoch": 0.6217175301632364, + "grad_norm": 9.129981994628906, + "learning_rate": 9.937856635911994e-05, + "loss": 0.33535902500152587, + "step": 4380 + }, + { + "epoch": 0.6231369765791341, + "grad_norm": 6.8436455726623535, + "learning_rate": 9.937714691270406e-05, + "loss": 0.3965883493423462, + "step": 4390 + }, + { + "epoch": 0.6245564229950319, + "grad_norm": 6.954466342926025, + "learning_rate": 9.937572746628815e-05, + "loss": 0.3352261304855347, + "step": 4400 + }, + { + "epoch": 0.6259758694109298, + "grad_norm": 8.227835655212402, + "learning_rate": 9.937430801987225e-05, + "loss": 0.4205745220184326, + "step": 4410 + }, + { + "epoch": 0.6273953158268275, + "grad_norm": 8.202418327331543, + "learning_rate": 9.937288857345636e-05, + "loss": 0.3927265405654907, + "step": 4420 + }, + { + "epoch": 0.6288147622427254, + "grad_norm": 9.406537055969238, + "learning_rate": 9.937146912704046e-05, + "loss": 0.4481183052062988, + 
"step": 4430 + }, + { + "epoch": 0.6302342086586231, + "grad_norm": 8.330412864685059, + "learning_rate": 9.937004968062457e-05, + "loss": 0.37030580043792727, + "step": 4440 + }, + { + "epoch": 0.631653655074521, + "grad_norm": 5.601277828216553, + "learning_rate": 9.936863023420865e-05, + "loss": 0.35557353496551514, + "step": 4450 + }, + { + "epoch": 0.6330731014904187, + "grad_norm": 11.551403999328613, + "learning_rate": 9.936721078779276e-05, + "loss": 0.3718759536743164, + "step": 4460 + }, + { + "epoch": 0.6344925479063165, + "grad_norm": 5.961857318878174, + "learning_rate": 9.936579134137686e-05, + "loss": 0.3828912258148193, + "step": 4470 + }, + { + "epoch": 0.6359119943222143, + "grad_norm": 6.173798561096191, + "learning_rate": 9.936437189496097e-05, + "loss": 0.392284631729126, + "step": 4480 + }, + { + "epoch": 0.6373314407381121, + "grad_norm": 8.952240943908691, + "learning_rate": 9.936295244854508e-05, + "loss": 0.41978960037231444, + "step": 4490 + }, + { + "epoch": 0.63875088715401, + "grad_norm": 9.86811637878418, + "learning_rate": 9.936153300212917e-05, + "loss": 0.42105417251586913, + "step": 4500 + }, + { + "epoch": 0.63875088715401, + "eval_accuracy": 0.8225344948178293, + "eval_loss": 0.5203356146812439, + "eval_runtime": 33.0374, + "eval_samples_per_second": 476.036, + "eval_steps_per_second": 14.892, + "step": 4500 + }, + { + "epoch": 0.6401703335699077, + "grad_norm": 10.036981582641602, + "learning_rate": 9.936011355571328e-05, + "loss": 0.41321401596069335, + "step": 4510 + }, + { + "epoch": 0.6415897799858056, + "grad_norm": 6.618304252624512, + "learning_rate": 9.935869410929738e-05, + "loss": 0.43657841682434084, + "step": 4520 + }, + { + "epoch": 0.6430092264017033, + "grad_norm": 9.975127220153809, + "learning_rate": 9.935727466288149e-05, + "loss": 0.3949880838394165, + "step": 4530 + }, + { + "epoch": 0.6444286728176012, + "grad_norm": 8.210672378540039, + "learning_rate": 9.935585521646558e-05, + "loss": 0.4280043125152588, 
+ "step": 4540 + }, + { + "epoch": 0.6458481192334989, + "grad_norm": 12.055879592895508, + "learning_rate": 9.935443577004968e-05, + "loss": 0.39465947151184083, + "step": 4550 + }, + { + "epoch": 0.6472675656493967, + "grad_norm": 7.540829658508301, + "learning_rate": 9.935301632363378e-05, + "loss": 0.3965680837631226, + "step": 4560 + }, + { + "epoch": 0.6486870120652946, + "grad_norm": 9.717781066894531, + "learning_rate": 9.935159687721789e-05, + "loss": 0.40194106101989746, + "step": 4570 + }, + { + "epoch": 0.6501064584811923, + "grad_norm": 10.271167755126953, + "learning_rate": 9.9350177430802e-05, + "loss": 0.4726293087005615, + "step": 4580 + }, + { + "epoch": 0.6515259048970902, + "grad_norm": 7.158174514770508, + "learning_rate": 9.93487579843861e-05, + "loss": 0.40993413925170896, + "step": 4590 + }, + { + "epoch": 0.6529453513129879, + "grad_norm": 10.536994934082031, + "learning_rate": 9.93473385379702e-05, + "loss": 0.4424222469329834, + "step": 4600 + }, + { + "epoch": 0.6543647977288858, + "grad_norm": 7.256109714508057, + "learning_rate": 9.934591909155429e-05, + "loss": 0.388359522819519, + "step": 4610 + }, + { + "epoch": 0.6557842441447835, + "grad_norm": 8.278726577758789, + "learning_rate": 9.93444996451384e-05, + "loss": 0.3513230085372925, + "step": 4620 + }, + { + "epoch": 0.6572036905606813, + "grad_norm": 7.767818927764893, + "learning_rate": 9.93430801987225e-05, + "loss": 0.42050671577453613, + "step": 4630 + }, + { + "epoch": 0.6586231369765791, + "grad_norm": 3.4903321266174316, + "learning_rate": 9.934166075230661e-05, + "loss": 0.3255154609680176, + "step": 4640 + }, + { + "epoch": 0.6600425833924769, + "grad_norm": 8.193768501281738, + "learning_rate": 9.93402413058907e-05, + "loss": 0.34639596939086914, + "step": 4650 + }, + { + "epoch": 0.6614620298083748, + "grad_norm": 6.168176651000977, + "learning_rate": 9.93388218594748e-05, + "loss": 0.3619822025299072, + "step": 4660 + }, + { + "epoch": 0.6628814762242725, + 
"grad_norm": 4.793501853942871, + "learning_rate": 9.933740241305892e-05, + "loss": 0.3441330909729004, + "step": 4670 + }, + { + "epoch": 0.6643009226401704, + "grad_norm": 7.100066184997559, + "learning_rate": 9.933598296664301e-05, + "loss": 0.41966400146484373, + "step": 4680 + }, + { + "epoch": 0.6657203690560681, + "grad_norm": 8.032003402709961, + "learning_rate": 9.933456352022713e-05, + "loss": 0.39086959362030027, + "step": 4690 + }, + { + "epoch": 0.6671398154719659, + "grad_norm": 5.533408164978027, + "learning_rate": 9.933314407381121e-05, + "loss": 0.455733060836792, + "step": 4700 + }, + { + "epoch": 0.6685592618878637, + "grad_norm": 6.478943347930908, + "learning_rate": 9.933172462739532e-05, + "loss": 0.3870114326477051, + "step": 4710 + }, + { + "epoch": 0.6699787083037615, + "grad_norm": 8.963722229003906, + "learning_rate": 9.933030518097942e-05, + "loss": 0.4041899681091309, + "step": 4720 + }, + { + "epoch": 0.6713981547196594, + "grad_norm": 4.072963714599609, + "learning_rate": 9.932888573456353e-05, + "loss": 0.35542023181915283, + "step": 4730 + }, + { + "epoch": 0.6728176011355571, + "grad_norm": 6.834389686584473, + "learning_rate": 9.932746628814763e-05, + "loss": 0.34830470085144044, + "step": 4740 + }, + { + "epoch": 0.674237047551455, + "grad_norm": 7.003122329711914, + "learning_rate": 9.932604684173174e-05, + "loss": 0.3465887069702148, + "step": 4750 + }, + { + "epoch": 0.6756564939673527, + "grad_norm": 8.914156913757324, + "learning_rate": 9.932462739531583e-05, + "loss": 0.44321861267089846, + "step": 4760 + }, + { + "epoch": 0.6770759403832506, + "grad_norm": 7.6024627685546875, + "learning_rate": 9.932320794889993e-05, + "loss": 0.40067334175109864, + "step": 4770 + }, + { + "epoch": 0.6784953867991483, + "grad_norm": 8.667821884155273, + "learning_rate": 9.932178850248404e-05, + "loss": 0.371229887008667, + "step": 4780 + }, + { + "epoch": 0.6799148332150461, + "grad_norm": 9.355796813964844, + "learning_rate": 
9.932036905606814e-05, + "loss": 0.3920291900634766, + "step": 4790 + }, + { + "epoch": 0.681334279630944, + "grad_norm": 6.767845153808594, + "learning_rate": 9.931894960965225e-05, + "loss": 0.3848612070083618, + "step": 4800 + }, + { + "epoch": 0.6827537260468417, + "grad_norm": 8.195937156677246, + "learning_rate": 9.931767210787794e-05, + "loss": 0.5190616607666015, + "step": 4810 + }, + { + "epoch": 0.6841731724627396, + "grad_norm": 6.033681869506836, + "learning_rate": 9.931625266146203e-05, + "loss": 0.39132606983184814, + "step": 4820 + }, + { + "epoch": 0.6855926188786373, + "grad_norm": 8.469270706176758, + "learning_rate": 9.931483321504613e-05, + "loss": 0.3626258850097656, + "step": 4830 + }, + { + "epoch": 0.6870120652945352, + "grad_norm": 4.255542278289795, + "learning_rate": 9.931341376863024e-05, + "loss": 0.31856842041015626, + "step": 4840 + }, + { + "epoch": 0.6884315117104329, + "grad_norm": 9.191469192504883, + "learning_rate": 9.931199432221434e-05, + "loss": 0.3280362367630005, + "step": 4850 + }, + { + "epoch": 0.6898509581263307, + "grad_norm": 8.94046688079834, + "learning_rate": 9.931057487579845e-05, + "loss": 0.39851620197296145, + "step": 4860 + }, + { + "epoch": 0.6912704045422285, + "grad_norm": 7.770534992218018, + "learning_rate": 9.930915542938255e-05, + "loss": 0.33825528621673584, + "step": 4870 + }, + { + "epoch": 0.6926898509581263, + "grad_norm": 6.560062885284424, + "learning_rate": 9.930773598296664e-05, + "loss": 0.35839481353759767, + "step": 4880 + }, + { + "epoch": 0.6941092973740242, + "grad_norm": 9.24365520477295, + "learning_rate": 9.930631653655074e-05, + "loss": 0.39770119190216063, + "step": 4890 + }, + { + "epoch": 0.6955287437899219, + "grad_norm": 11.744332313537598, + "learning_rate": 9.930489709013485e-05, + "loss": 0.4902297019958496, + "step": 4900 + }, + { + "epoch": 0.6969481902058198, + "grad_norm": 7.251524448394775, + "learning_rate": 9.930347764371895e-05, + "loss": 0.40317511558532715, + "step": 
4910 + }, + { + "epoch": 0.6983676366217175, + "grad_norm": 8.896724700927734, + "learning_rate": 9.930205819730306e-05, + "loss": 0.44049978256225586, + "step": 4920 + }, + { + "epoch": 0.6997870830376153, + "grad_norm": 7.477156162261963, + "learning_rate": 9.930063875088716e-05, + "loss": 0.3586245536804199, + "step": 4930 + }, + { + "epoch": 0.7012065294535131, + "grad_norm": 6.159836769104004, + "learning_rate": 9.929921930447126e-05, + "loss": 0.32783629894256594, + "step": 4940 + }, + { + "epoch": 0.7026259758694109, + "grad_norm": 6.85299825668335, + "learning_rate": 9.929779985805537e-05, + "loss": 0.30911822319030763, + "step": 4950 + }, + { + "epoch": 0.7040454222853088, + "grad_norm": 7.820040225982666, + "learning_rate": 9.929638041163946e-05, + "loss": 0.36734838485717775, + "step": 4960 + }, + { + "epoch": 0.7054648687012065, + "grad_norm": 6.66180944442749, + "learning_rate": 9.929496096522358e-05, + "loss": 0.37120904922485354, + "step": 4970 + }, + { + "epoch": 0.7068843151171044, + "grad_norm": 7.3861775398254395, + "learning_rate": 9.929354151880766e-05, + "loss": 0.4064349174499512, + "step": 4980 + }, + { + "epoch": 0.7083037615330021, + "grad_norm": 7.068629741668701, + "learning_rate": 9.929212207239177e-05, + "loss": 0.36406426429748534, + "step": 4990 + }, + { + "epoch": 0.7097232079489, + "grad_norm": 7.482442378997803, + "learning_rate": 9.929070262597587e-05, + "loss": 0.40763154029846194, + "step": 5000 + }, + { + "epoch": 0.7097232079489, + "eval_accuracy": 0.8707954473198957, + "eval_loss": 0.37987253069877625, + "eval_runtime": 33.0642, + "eval_samples_per_second": 475.651, + "eval_steps_per_second": 14.88, + "step": 5000 + }, + { + "epoch": 0.7111426543647977, + "grad_norm": 5.368759632110596, + "learning_rate": 9.928928317955998e-05, + "loss": 0.40792322158813477, + "step": 5010 + }, + { + "epoch": 0.7125621007806955, + "grad_norm": 3.8395280838012695, + "learning_rate": 9.928786373314408e-05, + "loss": 0.45433621406555175, + 
"step": 5020 + }, + { + "epoch": 0.7139815471965933, + "grad_norm": 7.884678840637207, + "learning_rate": 9.928644428672817e-05, + "loss": 0.3092354774475098, + "step": 5030 + }, + { + "epoch": 0.7154009936124911, + "grad_norm": 9.11925983428955, + "learning_rate": 9.928502484031228e-05, + "loss": 0.3887113094329834, + "step": 5040 + }, + { + "epoch": 0.716820440028389, + "grad_norm": 8.5901517868042, + "learning_rate": 9.928360539389638e-05, + "loss": 0.3938072443008423, + "step": 5050 + }, + { + "epoch": 0.7182398864442867, + "grad_norm": 4.011209011077881, + "learning_rate": 9.928218594748049e-05, + "loss": 0.3140719890594482, + "step": 5060 + }, + { + "epoch": 0.7196593328601846, + "grad_norm": 9.04295825958252, + "learning_rate": 9.928076650106459e-05, + "loss": 0.37023751735687255, + "step": 5070 + }, + { + "epoch": 0.7210787792760823, + "grad_norm": 7.336644649505615, + "learning_rate": 9.92793470546487e-05, + "loss": 0.3326029539108276, + "step": 5080 + }, + { + "epoch": 0.7224982256919801, + "grad_norm": 6.824075698852539, + "learning_rate": 9.927792760823278e-05, + "loss": 0.31377925872802737, + "step": 5090 + }, + { + "epoch": 0.7239176721078779, + "grad_norm": 6.152795314788818, + "learning_rate": 9.92765081618169e-05, + "loss": 0.4362512111663818, + "step": 5100 + }, + { + "epoch": 0.7253371185237757, + "grad_norm": 7.997036457061768, + "learning_rate": 9.927508871540099e-05, + "loss": 0.39910459518432617, + "step": 5110 + }, + { + "epoch": 0.7267565649396736, + "grad_norm": 7.5024309158325195, + "learning_rate": 9.92736692689851e-05, + "loss": 0.3690288305282593, + "step": 5120 + }, + { + "epoch": 0.7281760113555713, + "grad_norm": 9.340811729431152, + "learning_rate": 9.92722498225692e-05, + "loss": 0.28037595748901367, + "step": 5130 + }, + { + "epoch": 0.7295954577714692, + "grad_norm": 6.796107292175293, + "learning_rate": 9.92708303761533e-05, + "loss": 0.2862435817718506, + "step": 5140 + }, + { + "epoch": 0.7310149041873669, + "grad_norm": 
6.0283379554748535, + "learning_rate": 9.926941092973741e-05, + "loss": 0.351378345489502, + "step": 5150 + }, + { + "epoch": 0.7324343506032647, + "grad_norm": 6.880161762237549, + "learning_rate": 9.926799148332151e-05, + "loss": 0.3127347230911255, + "step": 5160 + }, + { + "epoch": 0.7338537970191625, + "grad_norm": 7.761416912078857, + "learning_rate": 9.926657203690562e-05, + "loss": 0.3232876777648926, + "step": 5170 + }, + { + "epoch": 0.7352732434350603, + "grad_norm": 8.840635299682617, + "learning_rate": 9.926515259048972e-05, + "loss": 0.36195032596588134, + "step": 5180 + }, + { + "epoch": 0.7366926898509581, + "grad_norm": 10.067350387573242, + "learning_rate": 9.926373314407381e-05, + "loss": 0.33318257331848145, + "step": 5190 + }, + { + "epoch": 0.7381121362668559, + "grad_norm": 4.935089111328125, + "learning_rate": 9.926231369765791e-05, + "loss": 0.3263442039489746, + "step": 5200 + }, + { + "epoch": 0.7395315826827538, + "grad_norm": 6.868301868438721, + "learning_rate": 9.926089425124202e-05, + "loss": 0.4087569236755371, + "step": 5210 + }, + { + "epoch": 0.7409510290986515, + "grad_norm": 7.978097915649414, + "learning_rate": 9.925947480482612e-05, + "loss": 0.33616573810577394, + "step": 5220 + }, + { + "epoch": 0.7423704755145494, + "grad_norm": 11.391094207763672, + "learning_rate": 9.925805535841023e-05, + "loss": 0.33483550548553465, + "step": 5230 + }, + { + "epoch": 0.7437899219304471, + "grad_norm": 5.558361530303955, + "learning_rate": 9.925663591199433e-05, + "loss": 0.38994641304016114, + "step": 5240 + }, + { + "epoch": 0.7452093683463449, + "grad_norm": 2.6022746562957764, + "learning_rate": 9.925521646557842e-05, + "loss": 0.2801194429397583, + "step": 5250 + }, + { + "epoch": 0.7466288147622427, + "grad_norm": 10.395146369934082, + "learning_rate": 9.925379701916253e-05, + "loss": 0.45772466659545896, + "step": 5260 + }, + { + "epoch": 0.7480482611781405, + "grad_norm": 10.162497520446777, + "learning_rate": 
9.925237757274663e-05, + "loss": 0.3906741142272949, + "step": 5270 + }, + { + "epoch": 0.7494677075940384, + "grad_norm": 7.618703365325928, + "learning_rate": 9.925095812633074e-05, + "loss": 0.3549813747406006, + "step": 5280 + }, + { + "epoch": 0.7508871540099361, + "grad_norm": 6.407444953918457, + "learning_rate": 9.924953867991483e-05, + "loss": 0.3040858268737793, + "step": 5290 + }, + { + "epoch": 0.752306600425834, + "grad_norm": 7.738057613372803, + "learning_rate": 9.924811923349894e-05, + "loss": 0.39499850273132325, + "step": 5300 + }, + { + "epoch": 0.7537260468417317, + "grad_norm": 7.237374782562256, + "learning_rate": 9.924669978708304e-05, + "loss": 0.3085558652877808, + "step": 5310 + }, + { + "epoch": 0.7551454932576295, + "grad_norm": 6.442776203155518, + "learning_rate": 9.924528034066715e-05, + "loss": 0.40102262496948243, + "step": 5320 + }, + { + "epoch": 0.7565649396735273, + "grad_norm": 10.280111312866211, + "learning_rate": 9.924386089425126e-05, + "loss": 0.3338863611221313, + "step": 5330 + }, + { + "epoch": 0.7579843860894251, + "grad_norm": 8.590238571166992, + "learning_rate": 9.924244144783534e-05, + "loss": 0.48393831253051756, + "step": 5340 + }, + { + "epoch": 0.759403832505323, + "grad_norm": 4.818009376525879, + "learning_rate": 9.924102200141945e-05, + "loss": 0.31519811153411864, + "step": 5350 + }, + { + "epoch": 0.7608232789212207, + "grad_norm": 7.284486293792725, + "learning_rate": 9.923960255500355e-05, + "loss": 0.3537211179733276, + "step": 5360 + }, + { + "epoch": 0.7622427253371186, + "grad_norm": 8.618793487548828, + "learning_rate": 9.923818310858766e-05, + "loss": 0.34086947441101073, + "step": 5370 + }, + { + "epoch": 0.7636621717530163, + "grad_norm": 8.162178039550781, + "learning_rate": 9.923676366217176e-05, + "loss": 0.38811311721801756, + "step": 5380 + }, + { + "epoch": 0.7650816181689141, + "grad_norm": 7.360818386077881, + "learning_rate": 9.923534421575587e-05, + "loss": 0.30603010654449464, + 
"step": 5390 + }, + { + "epoch": 0.7665010645848119, + "grad_norm": 4.011861801147461, + "learning_rate": 9.923392476933995e-05, + "loss": 0.23683266639709472, + "step": 5400 + }, + { + "epoch": 0.7679205110007097, + "grad_norm": 5.943147659301758, + "learning_rate": 9.923250532292406e-05, + "loss": 0.34063313007354734, + "step": 5410 + }, + { + "epoch": 0.7693399574166075, + "grad_norm": 7.751121997833252, + "learning_rate": 9.923108587650817e-05, + "loss": 0.36524248123168945, + "step": 5420 + }, + { + "epoch": 0.7707594038325053, + "grad_norm": 8.413863182067871, + "learning_rate": 9.922966643009227e-05, + "loss": 0.3002290725708008, + "step": 5430 + }, + { + "epoch": 0.7721788502484032, + "grad_norm": 7.4792280197143555, + "learning_rate": 9.922824698367638e-05, + "loss": 0.2858253240585327, + "step": 5440 + }, + { + "epoch": 0.7735982966643009, + "grad_norm": 4.943634986877441, + "learning_rate": 9.922682753726047e-05, + "loss": 0.3922913074493408, + "step": 5450 + }, + { + "epoch": 0.7750177430801988, + "grad_norm": 9.556757926940918, + "learning_rate": 9.922540809084458e-05, + "loss": 0.32624542713165283, + "step": 5460 + }, + { + "epoch": 0.7764371894960965, + "grad_norm": 6.306029319763184, + "learning_rate": 9.922398864442867e-05, + "loss": 0.32522106170654297, + "step": 5470 + }, + { + "epoch": 0.7778566359119943, + "grad_norm": 9.622481346130371, + "learning_rate": 9.922256919801279e-05, + "loss": 0.32840585708618164, + "step": 5480 + }, + { + "epoch": 0.7792760823278921, + "grad_norm": 6.480415344238281, + "learning_rate": 9.922114975159688e-05, + "loss": 0.31494650840759275, + "step": 5490 + }, + { + "epoch": 0.7806955287437899, + "grad_norm": 9.822346687316895, + "learning_rate": 9.921973030518098e-05, + "loss": 0.3520227909088135, + "step": 5500 + }, + { + "epoch": 0.7806955287437899, + "eval_accuracy": 0.8887263940993196, + "eval_loss": 0.331625759601593, + "eval_runtime": 33.1217, + "eval_samples_per_second": 474.825, + "eval_steps_per_second": 
14.854, + "step": 5500 + }, + { + "epoch": 0.7821149751596878, + "grad_norm": 8.544402122497559, + "learning_rate": 9.921831085876508e-05, + "loss": 0.3386709451675415, + "step": 5510 + }, + { + "epoch": 0.7835344215755855, + "grad_norm": 6.877591133117676, + "learning_rate": 9.921689141234919e-05, + "loss": 0.3577073574066162, + "step": 5520 + }, + { + "epoch": 0.7849538679914834, + "grad_norm": 8.182839393615723, + "learning_rate": 9.92154719659333e-05, + "loss": 0.33861188888549804, + "step": 5530 + }, + { + "epoch": 0.7863733144073811, + "grad_norm": 7.762393474578857, + "learning_rate": 9.92140525195174e-05, + "loss": 0.2913277387619019, + "step": 5540 + }, + { + "epoch": 0.7877927608232789, + "grad_norm": 9.238672256469727, + "learning_rate": 9.92126330731015e-05, + "loss": 0.27555758953094484, + "step": 5550 + }, + { + "epoch": 0.7892122072391767, + "grad_norm": 8.316729545593262, + "learning_rate": 9.921121362668559e-05, + "loss": 0.3221546411514282, + "step": 5560 + }, + { + "epoch": 0.7906316536550745, + "grad_norm": 5.685539245605469, + "learning_rate": 9.92097941802697e-05, + "loss": 0.335821533203125, + "step": 5570 + }, + { + "epoch": 0.7920511000709723, + "grad_norm": 9.121819496154785, + "learning_rate": 9.92083747338538e-05, + "loss": 0.41519789695739745, + "step": 5580 + }, + { + "epoch": 0.7934705464868701, + "grad_norm": 10.83812141418457, + "learning_rate": 9.920695528743791e-05, + "loss": 0.30081839561462403, + "step": 5590 + }, + { + "epoch": 0.794889992902768, + "grad_norm": 3.7030341625213623, + "learning_rate": 9.9205535841022e-05, + "loss": 0.3369245767593384, + "step": 5600 + }, + { + "epoch": 0.7963094393186657, + "grad_norm": 3.8987886905670166, + "learning_rate": 9.92041163946061e-05, + "loss": 0.3294223785400391, + "step": 5610 + }, + { + "epoch": 0.7977288857345636, + "grad_norm": 4.1831207275390625, + "learning_rate": 9.920269694819022e-05, + "loss": 0.2734922170639038, + "step": 5620 + }, + { + "epoch": 0.7991483321504613, + 
"grad_norm": 7.363320827484131, + "learning_rate": 9.920127750177431e-05, + "loss": 0.3629761219024658, + "step": 5630 + }, + { + "epoch": 0.8005677785663591, + "grad_norm": 3.947075366973877, + "learning_rate": 9.919985805535842e-05, + "loss": 0.24655752182006835, + "step": 5640 + }, + { + "epoch": 0.8019872249822569, + "grad_norm": 7.183192253112793, + "learning_rate": 9.919843860894251e-05, + "loss": 0.3074009895324707, + "step": 5650 + }, + { + "epoch": 0.8034066713981547, + "grad_norm": 9.004253387451172, + "learning_rate": 9.919701916252662e-05, + "loss": 0.38861281871795655, + "step": 5660 + }, + { + "epoch": 0.8048261178140526, + "grad_norm": 7.553649425506592, + "learning_rate": 9.919559971611072e-05, + "loss": 0.4247180461883545, + "step": 5670 + }, + { + "epoch": 0.8062455642299503, + "grad_norm": 6.382741928100586, + "learning_rate": 9.919418026969483e-05, + "loss": 0.304930305480957, + "step": 5680 + }, + { + "epoch": 0.8076650106458482, + "grad_norm": 5.102434158325195, + "learning_rate": 9.919276082327893e-05, + "loss": 0.38076980113983155, + "step": 5690 + }, + { + "epoch": 0.8090844570617459, + "grad_norm": 6.131350517272949, + "learning_rate": 9.919134137686302e-05, + "loss": 0.40895967483520507, + "step": 5700 + }, + { + "epoch": 0.8105039034776437, + "grad_norm": 7.717721939086914, + "learning_rate": 9.918992193044713e-05, + "loss": 0.34289727210998533, + "step": 5710 + }, + { + "epoch": 0.8119233498935415, + "grad_norm": 7.452071189880371, + "learning_rate": 9.918850248403123e-05, + "loss": 0.26248266696929934, + "step": 5720 + }, + { + "epoch": 0.8133427963094393, + "grad_norm": 4.934199333190918, + "learning_rate": 9.918708303761534e-05, + "loss": 0.2918365478515625, + "step": 5730 + }, + { + "epoch": 0.8147622427253371, + "grad_norm": 3.497220993041992, + "learning_rate": 9.918566359119944e-05, + "loss": 0.27859480381011964, + "step": 5740 + }, + { + "epoch": 0.8161816891412349, + "grad_norm": 9.320852279663086, + "learning_rate": 
9.918424414478355e-05, + "loss": 0.34371328353881836, + "step": 5750 + }, + { + "epoch": 0.8176011355571328, + "grad_norm": 10.081619262695312, + "learning_rate": 9.918282469836763e-05, + "loss": 0.36181211471557617, + "step": 5760 + }, + { + "epoch": 0.8190205819730305, + "grad_norm": 7.466938018798828, + "learning_rate": 9.918140525195174e-05, + "loss": 0.34078028202056887, + "step": 5770 + }, + { + "epoch": 0.8204400283889283, + "grad_norm": 4.303114414215088, + "learning_rate": 9.917998580553584e-05, + "loss": 0.34729723930358886, + "step": 5780 + }, + { + "epoch": 0.8218594748048261, + "grad_norm": 9.38592529296875, + "learning_rate": 9.917856635911995e-05, + "loss": 0.4285425662994385, + "step": 5790 + }, + { + "epoch": 0.8232789212207239, + "grad_norm": 9.465388298034668, + "learning_rate": 9.917714691270405e-05, + "loss": 0.3501663446426392, + "step": 5800 + }, + { + "epoch": 0.8246983676366217, + "grad_norm": 5.500204086303711, + "learning_rate": 9.917572746628815e-05, + "loss": 0.3102808952331543, + "step": 5810 + }, + { + "epoch": 0.8261178140525195, + "grad_norm": 4.572218894958496, + "learning_rate": 9.917430801987226e-05, + "loss": 0.2433872938156128, + "step": 5820 + }, + { + "epoch": 0.8275372604684174, + "grad_norm": 9.858591079711914, + "learning_rate": 9.917288857345636e-05, + "loss": 0.30695419311523436, + "step": 5830 + }, + { + "epoch": 0.8289567068843151, + "grad_norm": 6.843176364898682, + "learning_rate": 9.917146912704047e-05, + "loss": 0.35634801387786863, + "step": 5840 + }, + { + "epoch": 0.830376153300213, + "grad_norm": 10.634949684143066, + "learning_rate": 9.917004968062456e-05, + "loss": 0.3107039451599121, + "step": 5850 + }, + { + "epoch": 0.8317955997161107, + "grad_norm": 8.44272518157959, + "learning_rate": 9.916863023420866e-05, + "loss": 0.3672316551208496, + "step": 5860 + }, + { + "epoch": 0.8332150461320085, + "grad_norm": 5.4848785400390625, + "learning_rate": 9.916721078779276e-05, + "loss": 0.4015390872955322, + 
"step": 5870 + }, + { + "epoch": 0.8346344925479063, + "grad_norm": 7.271710395812988, + "learning_rate": 9.916579134137687e-05, + "loss": 0.23676373958587646, + "step": 5880 + }, + { + "epoch": 0.8360539389638041, + "grad_norm": 4.376358509063721, + "learning_rate": 9.916437189496097e-05, + "loss": 0.2711988687515259, + "step": 5890 + }, + { + "epoch": 0.837473385379702, + "grad_norm": 6.931346416473389, + "learning_rate": 9.916295244854508e-05, + "loss": 0.2837867021560669, + "step": 5900 + }, + { + "epoch": 0.8388928317955997, + "grad_norm": 7.611521244049072, + "learning_rate": 9.916153300212918e-05, + "loss": 0.315134072303772, + "step": 5910 + }, + { + "epoch": 0.8403122782114976, + "grad_norm": 7.071038722991943, + "learning_rate": 9.916011355571327e-05, + "loss": 0.3368415594100952, + "step": 5920 + }, + { + "epoch": 0.8417317246273953, + "grad_norm": 4.1825056076049805, + "learning_rate": 9.915869410929738e-05, + "loss": 0.3074488162994385, + "step": 5930 + }, + { + "epoch": 0.8431511710432931, + "grad_norm": 6.3160929679870605, + "learning_rate": 9.915727466288148e-05, + "loss": 0.3252119541168213, + "step": 5940 + }, + { + "epoch": 0.8445706174591909, + "grad_norm": 8.007182121276855, + "learning_rate": 9.915585521646559e-05, + "loss": 0.23286638259887696, + "step": 5950 + }, + { + "epoch": 0.8459900638750887, + "grad_norm": 7.93002986907959, + "learning_rate": 9.915443577004968e-05, + "loss": 0.2870266199111938, + "step": 5960 + }, + { + "epoch": 0.8474095102909865, + "grad_norm": 5.426539897918701, + "learning_rate": 9.915301632363379e-05, + "loss": 0.29859611988067625, + "step": 5970 + }, + { + "epoch": 0.8488289567068843, + "grad_norm": 4.294735908508301, + "learning_rate": 9.915159687721788e-05, + "loss": 0.24727118015289307, + "step": 5980 + }, + { + "epoch": 0.8502484031227822, + "grad_norm": 8.501158714294434, + "learning_rate": 9.9150177430802e-05, + "loss": 0.3406102657318115, + "step": 5990 + }, + { + "epoch": 0.8516678495386799, + 
"grad_norm": 8.125472068786621, + "learning_rate": 9.914875798438609e-05, + "loss": 0.3179450273513794, + "step": 6000 + }, + { + "epoch": 0.8516678495386799, + "eval_accuracy": 0.8626565778597317, + "eval_loss": 0.4082823693752289, + "eval_runtime": 33.3539, + "eval_samples_per_second": 471.52, + "eval_steps_per_second": 14.751, + "step": 6000 + }, + { + "epoch": 0.8530872959545777, + "grad_norm": 4.980500221252441, + "learning_rate": 9.914733853797019e-05, + "loss": 0.3588885307312012, + "step": 6010 + }, + { + "epoch": 0.8545067423704755, + "grad_norm": 5.385146617889404, + "learning_rate": 9.91459190915543e-05, + "loss": 0.28512775897979736, + "step": 6020 + }, + { + "epoch": 0.8559261887863733, + "grad_norm": 8.24423599243164, + "learning_rate": 9.91444996451384e-05, + "loss": 0.32922515869140623, + "step": 6030 + }, + { + "epoch": 0.8573456352022711, + "grad_norm": 6.568521499633789, + "learning_rate": 9.914308019872251e-05, + "loss": 0.24458625316619872, + "step": 6040 + }, + { + "epoch": 0.8587650816181689, + "grad_norm": 6.268226146697998, + "learning_rate": 9.914166075230661e-05, + "loss": 0.30663580894470216, + "step": 6050 + }, + { + "epoch": 0.8601845280340668, + "grad_norm": 5.911208152770996, + "learning_rate": 9.91402413058907e-05, + "loss": 0.38018484115600587, + "step": 6060 + }, + { + "epoch": 0.8616039744499645, + "grad_norm": 5.170897483825684, + "learning_rate": 9.91388218594748e-05, + "loss": 0.22591965198516845, + "step": 6070 + }, + { + "epoch": 0.8630234208658624, + "grad_norm": 5.716799736022949, + "learning_rate": 9.913740241305891e-05, + "loss": 0.2626305103302002, + "step": 6080 + }, + { + "epoch": 0.8644428672817601, + "grad_norm": 6.144148349761963, + "learning_rate": 9.913598296664301e-05, + "loss": 0.23459088802337646, + "step": 6090 + }, + { + "epoch": 0.8658623136976579, + "grad_norm": 8.506244659423828, + "learning_rate": 9.913456352022712e-05, + "loss": 0.36330761909484866, + "step": 6100 + }, + { + "epoch": 0.8672817601135557, 
+ "grad_norm": 9.882643699645996, + "learning_rate": 9.913314407381122e-05, + "loss": 0.32826101779937744, + "step": 6110 + }, + { + "epoch": 0.8687012065294535, + "grad_norm": 8.62743091583252, + "learning_rate": 9.913172462739532e-05, + "loss": 0.30355727672576904, + "step": 6120 + }, + { + "epoch": 0.8701206529453513, + "grad_norm": 11.726634979248047, + "learning_rate": 9.913030518097943e-05, + "loss": 0.280806303024292, + "step": 6130 + }, + { + "epoch": 0.8715400993612491, + "grad_norm": 7.7827839851379395, + "learning_rate": 9.912888573456352e-05, + "loss": 0.3389289855957031, + "step": 6140 + }, + { + "epoch": 0.872959545777147, + "grad_norm": 12.07807731628418, + "learning_rate": 9.912746628814764e-05, + "loss": 0.31570281982421877, + "step": 6150 + }, + { + "epoch": 0.8743789921930447, + "grad_norm": 4.949673652648926, + "learning_rate": 9.912604684173173e-05, + "loss": 0.269368839263916, + "step": 6160 + }, + { + "epoch": 0.8757984386089425, + "grad_norm": 6.946098327636719, + "learning_rate": 9.912462739531583e-05, + "loss": 0.33236119747161863, + "step": 6170 + }, + { + "epoch": 0.8772178850248403, + "grad_norm": 7.137246131896973, + "learning_rate": 9.912320794889993e-05, + "loss": 0.3343817710876465, + "step": 6180 + }, + { + "epoch": 0.8786373314407381, + "grad_norm": 4.929990768432617, + "learning_rate": 9.912178850248404e-05, + "loss": 0.23963472843170167, + "step": 6190 + }, + { + "epoch": 0.8800567778566359, + "grad_norm": 10.46869945526123, + "learning_rate": 9.912036905606814e-05, + "loss": 0.2913534641265869, + "step": 6200 + }, + { + "epoch": 0.8814762242725337, + "grad_norm": 7.179393291473389, + "learning_rate": 9.911894960965225e-05, + "loss": 0.27806806564331055, + "step": 6210 + }, + { + "epoch": 0.8828956706884316, + "grad_norm": 5.430668830871582, + "learning_rate": 9.911753016323634e-05, + "loss": 0.2537125587463379, + "step": 6220 + }, + { + "epoch": 0.8843151171043293, + "grad_norm": 7.001239776611328, + "learning_rate": 
9.911611071682044e-05, + "loss": 0.2821568489074707, + "step": 6230 + }, + { + "epoch": 0.8857345635202271, + "grad_norm": 10.218942642211914, + "learning_rate": 9.911469127040455e-05, + "loss": 0.30785112380981444, + "step": 6240 + }, + { + "epoch": 0.8871540099361249, + "grad_norm": 3.9179635047912598, + "learning_rate": 9.911327182398865e-05, + "loss": 0.3376051902770996, + "step": 6250 + }, + { + "epoch": 0.8885734563520227, + "grad_norm": 7.35114049911499, + "learning_rate": 9.911185237757276e-05, + "loss": 0.2029582977294922, + "step": 6260 + }, + { + "epoch": 0.8899929027679205, + "grad_norm": 7.477942943572998, + "learning_rate": 9.911043293115684e-05, + "loss": 0.31639838218688965, + "step": 6270 + }, + { + "epoch": 0.8914123491838183, + "grad_norm": 6.479630470275879, + "learning_rate": 9.910901348474096e-05, + "loss": 0.35874156951904296, + "step": 6280 + }, + { + "epoch": 0.8928317955997161, + "grad_norm": 5.139812469482422, + "learning_rate": 9.910759403832505e-05, + "loss": 0.23642609119415284, + "step": 6290 + }, + { + "epoch": 0.8942512420156139, + "grad_norm": 7.17330265045166, + "learning_rate": 9.910617459190916e-05, + "loss": 0.27939982414245607, + "step": 6300 + }, + { + "epoch": 0.8956706884315118, + "grad_norm": 8.804689407348633, + "learning_rate": 9.910475514549326e-05, + "loss": 0.3722469568252563, + "step": 6310 + }, + { + "epoch": 0.8970901348474095, + "grad_norm": 2.958435297012329, + "learning_rate": 9.910333569907736e-05, + "loss": 0.23576738834381103, + "step": 6320 + }, + { + "epoch": 0.8985095812633073, + "grad_norm": 10.53680419921875, + "learning_rate": 9.910191625266147e-05, + "loss": 0.4027998447418213, + "step": 6330 + }, + { + "epoch": 0.8999290276792051, + "grad_norm": 5.857926368713379, + "learning_rate": 9.910049680624557e-05, + "loss": 0.29457688331604004, + "step": 6340 + }, + { + "epoch": 0.9013484740951029, + "grad_norm": 1.7572773694992065, + "learning_rate": 9.909907735982968e-05, + "loss": 0.2572882890701294, + 
"step": 6350 + }, + { + "epoch": 0.9027679205110007, + "grad_norm": 4.274378299713135, + "learning_rate": 9.909765791341377e-05, + "loss": 0.23681292533874512, + "step": 6360 + }, + { + "epoch": 0.9041873669268985, + "grad_norm": 7.596087455749512, + "learning_rate": 9.909623846699787e-05, + "loss": 0.23812153339385986, + "step": 6370 + }, + { + "epoch": 0.9056068133427964, + "grad_norm": 5.59556770324707, + "learning_rate": 9.909481902058197e-05, + "loss": 0.29871695041656493, + "step": 6380 + }, + { + "epoch": 0.9070262597586941, + "grad_norm": 4.671100616455078, + "learning_rate": 9.909339957416608e-05, + "loss": 0.23768167495727538, + "step": 6390 + }, + { + "epoch": 0.9084457061745919, + "grad_norm": 6.55142068862915, + "learning_rate": 9.909198012775018e-05, + "loss": 0.2650206804275513, + "step": 6400 + }, + { + "epoch": 0.9098651525904897, + "grad_norm": 7.774087429046631, + "learning_rate": 9.909056068133429e-05, + "loss": 0.2898139238357544, + "step": 6410 + }, + { + "epoch": 0.9112845990063875, + "grad_norm": 6.386779308319092, + "learning_rate": 9.908914123491839e-05, + "loss": 0.26163647174835203, + "step": 6420 + }, + { + "epoch": 0.9127040454222853, + "grad_norm": 7.33029317855835, + "learning_rate": 9.908772178850248e-05, + "loss": 0.2447366952896118, + "step": 6430 + }, + { + "epoch": 0.9141234918381831, + "grad_norm": 10.35724925994873, + "learning_rate": 9.90863023420866e-05, + "loss": 0.2560460329055786, + "step": 6440 + }, + { + "epoch": 0.915542938254081, + "grad_norm": 9.2293062210083, + "learning_rate": 9.908488289567069e-05, + "loss": 0.3864759922027588, + "step": 6450 + }, + { + "epoch": 0.9169623846699787, + "grad_norm": 8.472285270690918, + "learning_rate": 9.90834634492548e-05, + "loss": 0.2888746976852417, + "step": 6460 + }, + { + "epoch": 0.9183818310858765, + "grad_norm": 6.22374153137207, + "learning_rate": 9.90820440028389e-05, + "loss": 0.2505399942398071, + "step": 6470 + }, + { + "epoch": 0.9198012775017743, + "grad_norm": 
7.827479839324951, + "learning_rate": 9.9080624556423e-05, + "loss": 0.2327653408050537, + "step": 6480 + }, + { + "epoch": 0.9212207239176721, + "grad_norm": 7.873356819152832, + "learning_rate": 9.90792051100071e-05, + "loss": 0.2565167903900146, + "step": 6490 + }, + { + "epoch": 0.9226401703335699, + "grad_norm": 4.665884494781494, + "learning_rate": 9.90777856635912e-05, + "loss": 0.2404710292816162, + "step": 6500 + }, + { + "epoch": 0.9226401703335699, + "eval_accuracy": 0.9011890379601959, + "eval_loss": 0.29011303186416626, + "eval_runtime": 34.6022, + "eval_samples_per_second": 454.509, + "eval_steps_per_second": 14.219, + "step": 6500 + }, + { + "epoch": 0.9240596167494677, + "grad_norm": 7.10374641418457, + "learning_rate": 9.90763662171753e-05, + "loss": 0.28783435821533204, + "step": 6510 + }, + { + "epoch": 0.9254790631653655, + "grad_norm": 7.5799784660339355, + "learning_rate": 9.907494677075941e-05, + "loss": 0.3219441890716553, + "step": 6520 + }, + { + "epoch": 0.9268985095812633, + "grad_norm": 3.9083335399627686, + "learning_rate": 9.907352732434351e-05, + "loss": 0.2374324083328247, + "step": 6530 + }, + { + "epoch": 0.9283179559971612, + "grad_norm": 9.309243202209473, + "learning_rate": 9.907210787792761e-05, + "loss": 0.2314399242401123, + "step": 6540 + }, + { + "epoch": 0.9297374024130589, + "grad_norm": 5.650235176086426, + "learning_rate": 9.907068843151172e-05, + "loss": 0.2187626600265503, + "step": 6550 + }, + { + "epoch": 0.9311568488289567, + "grad_norm": 5.9835710525512695, + "learning_rate": 9.906926898509582e-05, + "loss": 0.27225399017333984, + "step": 6560 + }, + { + "epoch": 0.9325762952448545, + "grad_norm": 8.403820991516113, + "learning_rate": 9.906784953867993e-05, + "loss": 0.24051570892333984, + "step": 6570 + }, + { + "epoch": 0.9339957416607523, + "grad_norm": 5.456867218017578, + "learning_rate": 9.906643009226401e-05, + "loss": 0.229835844039917, + "step": 6580 + }, + { + "epoch": 0.9354151880766501, + "grad_norm": 
11.34472942352295, + "learning_rate": 9.906501064584812e-05, + "loss": 0.28583712577819825, + "step": 6590 + }, + { + "epoch": 0.9368346344925479, + "grad_norm": 7.0680694580078125, + "learning_rate": 9.906359119943222e-05, + "loss": 0.28688597679138184, + "step": 6600 + }, + { + "epoch": 0.9382540809084458, + "grad_norm": 4.637568950653076, + "learning_rate": 9.906217175301633e-05, + "loss": 0.3234848976135254, + "step": 6610 + }, + { + "epoch": 0.9396735273243435, + "grad_norm": 4.935168743133545, + "learning_rate": 9.906075230660043e-05, + "loss": 0.2546673059463501, + "step": 6620 + }, + { + "epoch": 0.9410929737402413, + "grad_norm": 8.563390731811523, + "learning_rate": 9.905933286018453e-05, + "loss": 0.26501734256744386, + "step": 6630 + }, + { + "epoch": 0.9425124201561391, + "grad_norm": 8.05203914642334, + "learning_rate": 9.905791341376864e-05, + "loss": 0.19906221628189086, + "step": 6640 + }, + { + "epoch": 0.9439318665720369, + "grad_norm": 4.535382270812988, + "learning_rate": 9.905649396735273e-05, + "loss": 0.2355113744735718, + "step": 6650 + }, + { + "epoch": 0.9453513129879347, + "grad_norm": 5.967373371124268, + "learning_rate": 9.905507452093685e-05, + "loss": 0.2591426372528076, + "step": 6660 + }, + { + "epoch": 0.9467707594038325, + "grad_norm": 5.093105792999268, + "learning_rate": 9.905365507452094e-05, + "loss": 0.2508120536804199, + "step": 6670 + }, + { + "epoch": 0.9481902058197303, + "grad_norm": 6.775847911834717, + "learning_rate": 9.905223562810504e-05, + "loss": 0.2802272319793701, + "step": 6680 + }, + { + "epoch": 0.9496096522356281, + "grad_norm": 7.280439376831055, + "learning_rate": 9.905081618168914e-05, + "loss": 0.23689627647399902, + "step": 6690 + }, + { + "epoch": 0.9510290986515259, + "grad_norm": 7.68773078918457, + "learning_rate": 9.904939673527325e-05, + "loss": 0.2927251815795898, + "step": 6700 + }, + { + "epoch": 0.9524485450674237, + "grad_norm": 5.4808831214904785, + "learning_rate": 9.904797728885735e-05, + 
"loss": 0.28672428131103517, + "step": 6710 + }, + { + "epoch": 0.9538679914833215, + "grad_norm": 8.087321281433105, + "learning_rate": 9.904655784244146e-05, + "loss": 0.3129342794418335, + "step": 6720 + }, + { + "epoch": 0.9552874378992193, + "grad_norm": 2.7893686294555664, + "learning_rate": 9.904513839602555e-05, + "loss": 0.22520501613616944, + "step": 6730 + }, + { + "epoch": 0.9567068843151171, + "grad_norm": 10.040759086608887, + "learning_rate": 9.904371894960965e-05, + "loss": 0.2705253601074219, + "step": 6740 + }, + { + "epoch": 0.9581263307310149, + "grad_norm": 3.0198464393615723, + "learning_rate": 9.904229950319376e-05, + "loss": 0.27905032634735105, + "step": 6750 + }, + { + "epoch": 0.9595457771469127, + "grad_norm": 9.044099807739258, + "learning_rate": 9.904088005677786e-05, + "loss": 0.2549771547317505, + "step": 6760 + }, + { + "epoch": 0.9609652235628106, + "grad_norm": 3.4965715408325195, + "learning_rate": 9.903946061036197e-05, + "loss": 0.2617889165878296, + "step": 6770 + }, + { + "epoch": 0.9623846699787083, + "grad_norm": 4.959318161010742, + "learning_rate": 9.903804116394605e-05, + "loss": 0.24190716743469237, + "step": 6780 + }, + { + "epoch": 0.9638041163946061, + "grad_norm": 4.6404314041137695, + "learning_rate": 9.903662171753017e-05, + "loss": 0.29865779876708987, + "step": 6790 + }, + { + "epoch": 0.9652235628105039, + "grad_norm": 6.315147876739502, + "learning_rate": 9.903520227111426e-05, + "loss": 0.2937409162521362, + "step": 6800 + }, + { + "epoch": 0.9666430092264017, + "grad_norm": 6.294488906860352, + "learning_rate": 9.903378282469837e-05, + "loss": 0.28489468097686765, + "step": 6810 + }, + { + "epoch": 0.9680624556422995, + "grad_norm": 6.917492866516113, + "learning_rate": 9.903236337828248e-05, + "loss": 0.18736352920532226, + "step": 6820 + }, + { + "epoch": 0.9694819020581973, + "grad_norm": 6.20442533493042, + "learning_rate": 9.903094393186658e-05, + "loss": 0.24552693367004394, + "step": 6830 + }, + { + 
"epoch": 0.9709013484740951, + "grad_norm": 9.16247844696045, + "learning_rate": 9.902952448545068e-05, + "loss": 0.22968952655792235, + "step": 6840 + }, + { + "epoch": 0.9723207948899929, + "grad_norm": 8.185150146484375, + "learning_rate": 9.902810503903478e-05, + "loss": 0.25458450317382814, + "step": 6850 + }, + { + "epoch": 0.9737402413058907, + "grad_norm": 8.134267807006836, + "learning_rate": 9.902668559261889e-05, + "loss": 0.25451316833496096, + "step": 6860 + }, + { + "epoch": 0.9751596877217885, + "grad_norm": 12.39373779296875, + "learning_rate": 9.902526614620298e-05, + "loss": 0.2887612819671631, + "step": 6870 + }, + { + "epoch": 0.9765791341376863, + "grad_norm": 7.776149272918701, + "learning_rate": 9.90238466997871e-05, + "loss": 0.3695904970169067, + "step": 6880 + }, + { + "epoch": 0.9779985805535841, + "grad_norm": 6.241235256195068, + "learning_rate": 9.902242725337118e-05, + "loss": 0.26552643775939944, + "step": 6890 + }, + { + "epoch": 0.9794180269694819, + "grad_norm": 11.734026908874512, + "learning_rate": 9.902100780695529e-05, + "loss": 0.32755370140075685, + "step": 6900 + }, + { + "epoch": 0.9808374733853797, + "grad_norm": 6.049038887023926, + "learning_rate": 9.90195883605394e-05, + "loss": 0.22059807777404786, + "step": 6910 + }, + { + "epoch": 0.9822569198012775, + "grad_norm": 4.156560897827148, + "learning_rate": 9.901831085876509e-05, + "loss": 0.3507907629013062, + "step": 6920 + }, + { + "epoch": 0.9836763662171752, + "grad_norm": 4.315751552581787, + "learning_rate": 9.901689141234918e-05, + "loss": 0.25436155796051024, + "step": 6930 + }, + { + "epoch": 0.9850958126330731, + "grad_norm": 6.76514196395874, + "learning_rate": 9.90154719659333e-05, + "loss": 0.24831132888793944, + "step": 6940 + }, + { + "epoch": 0.9865152590489709, + "grad_norm": 6.7387261390686035, + "learning_rate": 9.901405251951739e-05, + "loss": 0.23655142784118652, + "step": 6950 + }, + { + "epoch": 0.9879347054648687, + "grad_norm": 
3.8014583587646484, + "learning_rate": 9.901263307310149e-05, + "loss": 0.2415374994277954, + "step": 6960 + }, + { + "epoch": 0.9893541518807665, + "grad_norm": 5.04398775100708, + "learning_rate": 9.90112136266856e-05, + "loss": 0.23744730949401854, + "step": 6970 + }, + { + "epoch": 0.9907735982966643, + "grad_norm": 5.434844017028809, + "learning_rate": 9.90097941802697e-05, + "loss": 0.24512255191802979, + "step": 6980 + }, + { + "epoch": 0.9921930447125621, + "grad_norm": 5.528685092926025, + "learning_rate": 9.900837473385381e-05, + "loss": 0.2296142578125, + "step": 6990 + }, + { + "epoch": 0.99361249112846, + "grad_norm": 5.2856526374816895, + "learning_rate": 9.90069552874379e-05, + "loss": 0.2707331418991089, + "step": 7000 + }, + { + "epoch": 0.99361249112846, + "eval_accuracy": 0.9093914923380174, + "eval_loss": 0.27007216215133667, + "eval_runtime": 33.3907, + "eval_samples_per_second": 470.999, + "eval_steps_per_second": 14.735, + "step": 7000 + }, + { + "epoch": 0.9950319375443577, + "grad_norm": 8.654793739318848, + "learning_rate": 9.9005535841022e-05, + "loss": 0.34286386966705323, + "step": 7010 + }, + { + "epoch": 0.9964513839602555, + "grad_norm": 3.311750888824463, + "learning_rate": 9.90041163946061e-05, + "loss": 0.269917893409729, + "step": 7020 + }, + { + "epoch": 0.9978708303761533, + "grad_norm": 6.643321514129639, + "learning_rate": 9.900269694819021e-05, + "loss": 0.2132892370223999, + "step": 7030 + }, + { + "epoch": 0.9992902767920511, + "grad_norm": 10.397172927856445, + "learning_rate": 9.900127750177431e-05, + "loss": 0.2613171339035034, + "step": 7040 + }, + { + "epoch": 1.000709723207949, + "grad_norm": 6.357808589935303, + "learning_rate": 9.899985805535842e-05, + "loss": 0.2258657455444336, + "step": 7050 + }, + { + "epoch": 1.0021291696238468, + "grad_norm": 6.077082633972168, + "learning_rate": 9.899843860894252e-05, + "loss": 0.20697100162506105, + "step": 7060 + }, + { + "epoch": 1.0035486160397444, + "grad_norm": 
12.1661376953125, + "learning_rate": 9.899701916252661e-05, + "loss": 0.1927890658378601, + "step": 7070 + }, + { + "epoch": 1.0049680624556423, + "grad_norm": 4.968541145324707, + "learning_rate": 9.899559971611073e-05, + "loss": 0.23719356060028077, + "step": 7080 + }, + { + "epoch": 1.0063875088715402, + "grad_norm": 8.79593563079834, + "learning_rate": 9.899418026969482e-05, + "loss": 0.18882639408111573, + "step": 7090 + }, + { + "epoch": 1.0078069552874378, + "grad_norm": 5.142887115478516, + "learning_rate": 9.899276082327893e-05, + "loss": 0.2634677171707153, + "step": 7100 + }, + { + "epoch": 1.0092264017033357, + "grad_norm": 8.761039733886719, + "learning_rate": 9.899134137686302e-05, + "loss": 0.321915602684021, + "step": 7110 + }, + { + "epoch": 1.0106458481192335, + "grad_norm": 3.3865628242492676, + "learning_rate": 9.898992193044713e-05, + "loss": 0.23035690784454346, + "step": 7120 + }, + { + "epoch": 1.0120652945351314, + "grad_norm": 5.229470729827881, + "learning_rate": 9.898850248403123e-05, + "loss": 0.23260829448699952, + "step": 7130 + }, + { + "epoch": 1.013484740951029, + "grad_norm": 6.637743949890137, + "learning_rate": 9.898708303761534e-05, + "loss": 0.29780044555664065, + "step": 7140 + }, + { + "epoch": 1.014904187366927, + "grad_norm": 5.488855838775635, + "learning_rate": 9.898566359119943e-05, + "loss": 0.17786208391189576, + "step": 7150 + }, + { + "epoch": 1.0163236337828248, + "grad_norm": 3.6873295307159424, + "learning_rate": 9.898424414478355e-05, + "loss": 0.16665832996368407, + "step": 7160 + }, + { + "epoch": 1.0177430801987224, + "grad_norm": 3.507009267807007, + "learning_rate": 9.898282469836764e-05, + "loss": 0.2571221351623535, + "step": 7170 + }, + { + "epoch": 1.0191625266146203, + "grad_norm": 3.279927968978882, + "learning_rate": 9.898140525195174e-05, + "loss": 0.2422633171081543, + "step": 7180 + }, + { + "epoch": 1.0205819730305181, + "grad_norm": 7.186861991882324, + "learning_rate": 9.897998580553585e-05, + 
"loss": 0.2877654552459717, + "step": 7190 + }, + { + "epoch": 1.022001419446416, + "grad_norm": 8.821130752563477, + "learning_rate": 9.897856635911995e-05, + "loss": 0.21563093662261962, + "step": 7200 + }, + { + "epoch": 1.0234208658623136, + "grad_norm": 1.849163293838501, + "learning_rate": 9.897714691270406e-05, + "loss": 0.21513009071350098, + "step": 7210 + }, + { + "epoch": 1.0248403122782115, + "grad_norm": 7.898414611816406, + "learning_rate": 9.897572746628814e-05, + "loss": 0.24002442359924317, + "step": 7220 + }, + { + "epoch": 1.0262597586941093, + "grad_norm": 8.41958236694336, + "learning_rate": 9.897430801987225e-05, + "loss": 0.22358598709106445, + "step": 7230 + }, + { + "epoch": 1.027679205110007, + "grad_norm": 5.978959560394287, + "learning_rate": 9.897288857345635e-05, + "loss": 0.24321112632751465, + "step": 7240 + }, + { + "epoch": 1.0290986515259049, + "grad_norm": 7.758601665496826, + "learning_rate": 9.897146912704046e-05, + "loss": 0.2519962310791016, + "step": 7250 + }, + { + "epoch": 1.0305180979418027, + "grad_norm": 6.9067487716674805, + "learning_rate": 9.897004968062456e-05, + "loss": 0.22714946269989014, + "step": 7260 + }, + { + "epoch": 1.0319375443577006, + "grad_norm": 7.974116802215576, + "learning_rate": 9.896863023420866e-05, + "loss": 0.22177364826202392, + "step": 7270 + }, + { + "epoch": 1.0333569907735982, + "grad_norm": 2.706422805786133, + "learning_rate": 9.896721078779277e-05, + "loss": 0.19734153747558594, + "step": 7280 + }, + { + "epoch": 1.034776437189496, + "grad_norm": 10.539275169372559, + "learning_rate": 9.896579134137687e-05, + "loss": 0.2604410648345947, + "step": 7290 + }, + { + "epoch": 1.036195883605394, + "grad_norm": 6.023902893066406, + "learning_rate": 9.896437189496098e-05, + "loss": 0.23188574314117433, + "step": 7300 + }, + { + "epoch": 1.0376153300212918, + "grad_norm": 4.0170512199401855, + "learning_rate": 9.896295244854507e-05, + "loss": 0.20175492763519287, + "step": 7310 + }, + { + 
"epoch": 1.0390347764371894, + "grad_norm": 4.9612579345703125, + "learning_rate": 9.896153300212917e-05, + "loss": 0.2120590925216675, + "step": 7320 + }, + { + "epoch": 1.0404542228530873, + "grad_norm": 4.898397922515869, + "learning_rate": 9.896011355571327e-05, + "loss": 0.22397477626800538, + "step": 7330 + }, + { + "epoch": 1.0418736692689852, + "grad_norm": 7.394660472869873, + "learning_rate": 9.895869410929738e-05, + "loss": 0.2079904556274414, + "step": 7340 + }, + { + "epoch": 1.0432931156848828, + "grad_norm": 3.7839152812957764, + "learning_rate": 9.895727466288148e-05, + "loss": 0.1861090302467346, + "step": 7350 + }, + { + "epoch": 1.0447125621007807, + "grad_norm": 6.4003496170043945, + "learning_rate": 9.895585521646559e-05, + "loss": 0.21509413719177245, + "step": 7360 + }, + { + "epoch": 1.0461320085166785, + "grad_norm": 5.966845989227295, + "learning_rate": 9.895443577004969e-05, + "loss": 0.22056474685668945, + "step": 7370 + }, + { + "epoch": 1.0475514549325764, + "grad_norm": 3.580226182937622, + "learning_rate": 9.895301632363378e-05, + "loss": 0.2572075128555298, + "step": 7380 + }, + { + "epoch": 1.048970901348474, + "grad_norm": 7.922166347503662, + "learning_rate": 9.89515968772179e-05, + "loss": 0.26929004192352296, + "step": 7390 + }, + { + "epoch": 1.050390347764372, + "grad_norm": 8.884166717529297, + "learning_rate": 9.895017743080199e-05, + "loss": 0.23953988552093505, + "step": 7400 + }, + { + "epoch": 1.0518097941802698, + "grad_norm": 13.472792625427246, + "learning_rate": 9.89487579843861e-05, + "loss": 0.26428995132446287, + "step": 7410 + }, + { + "epoch": 1.0532292405961674, + "grad_norm": 5.455354690551758, + "learning_rate": 9.894733853797019e-05, + "loss": 0.22658278942108154, + "step": 7420 + }, + { + "epoch": 1.0546486870120653, + "grad_norm": 12.143173217773438, + "learning_rate": 9.89459190915543e-05, + "loss": 0.2838724136352539, + "step": 7430 + }, + { + "epoch": 1.0560681334279631, + "grad_norm": 
12.741036415100098, + "learning_rate": 9.89444996451384e-05, + "loss": 0.22514543533325196, + "step": 7440 + }, + { + "epoch": 1.057487579843861, + "grad_norm": 3.3944201469421387, + "learning_rate": 9.89430801987225e-05, + "loss": 0.2505282163619995, + "step": 7450 + }, + { + "epoch": 1.0589070262597586, + "grad_norm": 4.490118503570557, + "learning_rate": 9.89416607523066e-05, + "loss": 0.24113750457763672, + "step": 7460 + }, + { + "epoch": 1.0603264726756565, + "grad_norm": 3.8860394954681396, + "learning_rate": 9.89402413058907e-05, + "loss": 0.19650124311447142, + "step": 7470 + }, + { + "epoch": 1.0617459190915544, + "grad_norm": 8.089933395385742, + "learning_rate": 9.893882185947481e-05, + "loss": 0.20081098079681398, + "step": 7480 + }, + { + "epoch": 1.063165365507452, + "grad_norm": 5.854043483734131, + "learning_rate": 9.893740241305891e-05, + "loss": 0.19387896060943605, + "step": 7490 + }, + { + "epoch": 1.0645848119233499, + "grad_norm": 3.3195252418518066, + "learning_rate": 9.893598296664302e-05, + "loss": 0.1918407201766968, + "step": 7500 + }, + { + "epoch": 1.0645848119233499, + "eval_accuracy": 0.9091371526673873, + "eval_loss": 0.25946471095085144, + "eval_runtime": 32.8002, + "eval_samples_per_second": 479.478, + "eval_steps_per_second": 15.0, + "step": 7500 + }, + { + "epoch": 1.0660042583392477, + "grad_norm": 7.044492244720459, + "learning_rate": 9.893456352022712e-05, + "loss": 0.18088626861572266, + "step": 7510 + }, + { + "epoch": 1.0674237047551456, + "grad_norm": 2.1477725505828857, + "learning_rate": 9.893314407381123e-05, + "loss": 0.25041606426239016, + "step": 7520 + }, + { + "epoch": 1.0688431511710432, + "grad_norm": 5.232922077178955, + "learning_rate": 9.893172462739531e-05, + "loss": 0.13164312839508058, + "step": 7530 + }, + { + "epoch": 1.070262597586941, + "grad_norm": 7.097192764282227, + "learning_rate": 9.893030518097942e-05, + "loss": 0.2210529088973999, + "step": 7540 + }, + { + "epoch": 1.071682044002839, + 
"grad_norm": 6.555529594421387, + "learning_rate": 9.892888573456352e-05, + "loss": 0.22583472728729248, + "step": 7550 + }, + { + "epoch": 1.0731014904187366, + "grad_norm": 4.672628879547119, + "learning_rate": 9.892746628814763e-05, + "loss": 0.2420278787612915, + "step": 7560 + }, + { + "epoch": 1.0745209368346345, + "grad_norm": 5.684006690979004, + "learning_rate": 9.892604684173174e-05, + "loss": 0.16603726148605347, + "step": 7570 + }, + { + "epoch": 1.0759403832505323, + "grad_norm": 8.538924217224121, + "learning_rate": 9.892462739531582e-05, + "loss": 0.22756731510162354, + "step": 7580 + }, + { + "epoch": 1.0773598296664302, + "grad_norm": 10.23405647277832, + "learning_rate": 9.892320794889994e-05, + "loss": 0.17195621728897095, + "step": 7590 + }, + { + "epoch": 1.0787792760823278, + "grad_norm": 3.4394562244415283, + "learning_rate": 9.892178850248403e-05, + "loss": 0.1631350874900818, + "step": 7600 + }, + { + "epoch": 1.0801987224982257, + "grad_norm": 9.240316390991211, + "learning_rate": 9.892036905606814e-05, + "loss": 0.2647270917892456, + "step": 7610 + }, + { + "epoch": 1.0816181689141235, + "grad_norm": 11.555622100830078, + "learning_rate": 9.891894960965224e-05, + "loss": 0.26429762840271, + "step": 7620 + }, + { + "epoch": 1.0830376153300212, + "grad_norm": 2.4831769466400146, + "learning_rate": 9.891753016323634e-05, + "loss": 0.29258711338043214, + "step": 7630 + }, + { + "epoch": 1.084457061745919, + "grad_norm": 4.935022830963135, + "learning_rate": 9.891611071682044e-05, + "loss": 0.21570188999176027, + "step": 7640 + }, + { + "epoch": 1.085876508161817, + "grad_norm": 11.602439880371094, + "learning_rate": 9.891469127040455e-05, + "loss": 0.32711737155914306, + "step": 7650 + }, + { + "epoch": 1.0872959545777148, + "grad_norm": 6.064338207244873, + "learning_rate": 9.891327182398866e-05, + "loss": 0.226470947265625, + "step": 7660 + }, + { + "epoch": 1.0887154009936124, + "grad_norm": 5.629254341125488, + "learning_rate": 
9.891185237757276e-05, + "loss": 0.1874476909637451, + "step": 7670 + }, + { + "epoch": 1.0901348474095103, + "grad_norm": 6.994508743286133, + "learning_rate": 9.891043293115685e-05, + "loss": 0.2323138952255249, + "step": 7680 + }, + { + "epoch": 1.0915542938254081, + "grad_norm": 7.654874324798584, + "learning_rate": 9.890901348474095e-05, + "loss": 0.267806077003479, + "step": 7690 + }, + { + "epoch": 1.0929737402413058, + "grad_norm": 2.5339603424072266, + "learning_rate": 9.890759403832506e-05, + "loss": 0.17415390014648438, + "step": 7700 + }, + { + "epoch": 1.0943931866572036, + "grad_norm": 9.036078453063965, + "learning_rate": 9.890617459190916e-05, + "loss": 0.26232335567474363, + "step": 7710 + }, + { + "epoch": 1.0958126330731015, + "grad_norm": 8.1493558883667, + "learning_rate": 9.890475514549327e-05, + "loss": 0.26018438339233396, + "step": 7720 + }, + { + "epoch": 1.0972320794889994, + "grad_norm": 4.394131660461426, + "learning_rate": 9.890333569907735e-05, + "loss": 0.20033717155456543, + "step": 7730 + }, + { + "epoch": 1.098651525904897, + "grad_norm": 7.311230659484863, + "learning_rate": 9.890191625266146e-05, + "loss": 0.2336057662963867, + "step": 7740 + }, + { + "epoch": 1.1000709723207949, + "grad_norm": 3.716153621673584, + "learning_rate": 9.890049680624556e-05, + "loss": 0.21649951934814454, + "step": 7750 + }, + { + "epoch": 1.1014904187366927, + "grad_norm": 5.747766017913818, + "learning_rate": 9.889907735982967e-05, + "loss": 0.21761865615844728, + "step": 7760 + }, + { + "epoch": 1.1029098651525904, + "grad_norm": 2.6889519691467285, + "learning_rate": 9.889765791341378e-05, + "loss": 0.2489168405532837, + "step": 7770 + }, + { + "epoch": 1.1043293115684882, + "grad_norm": 6.918911933898926, + "learning_rate": 9.889623846699787e-05, + "loss": 0.22506451606750488, + "step": 7780 + }, + { + "epoch": 1.105748757984386, + "grad_norm": 6.129018783569336, + "learning_rate": 9.889481902058198e-05, + "loss": 0.22557535171508789, + "step": 
7790 + }, + { + "epoch": 1.107168204400284, + "grad_norm": 6.179121017456055, + "learning_rate": 9.889339957416608e-05, + "loss": 0.20877602100372314, + "step": 7800 + }, + { + "epoch": 1.1085876508161816, + "grad_norm": 4.490073204040527, + "learning_rate": 9.889198012775019e-05, + "loss": 0.24456796646118165, + "step": 7810 + }, + { + "epoch": 1.1100070972320795, + "grad_norm": 11.580991744995117, + "learning_rate": 9.889056068133428e-05, + "loss": 0.2545257806777954, + "step": 7820 + }, + { + "epoch": 1.1114265436479773, + "grad_norm": 5.933578968048096, + "learning_rate": 9.88891412349184e-05, + "loss": 0.20906269550323486, + "step": 7830 + }, + { + "epoch": 1.1128459900638752, + "grad_norm": 8.964847564697266, + "learning_rate": 9.888772178850248e-05, + "loss": 0.21426281929016114, + "step": 7840 + }, + { + "epoch": 1.1142654364797728, + "grad_norm": 3.047978401184082, + "learning_rate": 9.888630234208659e-05, + "loss": 0.20127902030944825, + "step": 7850 + }, + { + "epoch": 1.1156848828956707, + "grad_norm": 11.52719783782959, + "learning_rate": 9.88848828956707e-05, + "loss": 0.23301458358764648, + "step": 7860 + }, + { + "epoch": 1.1171043293115686, + "grad_norm": 4.898934364318848, + "learning_rate": 9.88834634492548e-05, + "loss": 0.26660704612731934, + "step": 7870 + }, + { + "epoch": 1.1185237757274662, + "grad_norm": 6.535075664520264, + "learning_rate": 9.888204400283891e-05, + "loss": 0.2355792284011841, + "step": 7880 + }, + { + "epoch": 1.119943222143364, + "grad_norm": 6.307318687438965, + "learning_rate": 9.888062455642299e-05, + "loss": 0.20682175159454347, + "step": 7890 + }, + { + "epoch": 1.121362668559262, + "grad_norm": 3.9123454093933105, + "learning_rate": 9.88792051100071e-05, + "loss": 0.3205126762390137, + "step": 7900 + }, + { + "epoch": 1.1227821149751598, + "grad_norm": 9.152158737182617, + "learning_rate": 9.88777856635912e-05, + "loss": 0.2413860082626343, + "step": 7910 + }, + { + "epoch": 1.1242015613910574, + "grad_norm": 
9.178197860717773, + "learning_rate": 9.887636621717531e-05, + "loss": 0.32107110023498536, + "step": 7920 + }, + { + "epoch": 1.1256210078069553, + "grad_norm": 8.382686614990234, + "learning_rate": 9.887494677075941e-05, + "loss": 0.26145339012145996, + "step": 7930 + }, + { + "epoch": 1.1270404542228531, + "grad_norm": 6.847768306732178, + "learning_rate": 9.88735273243435e-05, + "loss": 0.21859989166259766, + "step": 7940 + }, + { + "epoch": 1.1284599006387508, + "grad_norm": 3.770111560821533, + "learning_rate": 9.887210787792762e-05, + "loss": 0.13420095443725585, + "step": 7950 + }, + { + "epoch": 1.1298793470546487, + "grad_norm": 7.4002509117126465, + "learning_rate": 9.887068843151171e-05, + "loss": 0.18695064783096313, + "step": 7960 + }, + { + "epoch": 1.1312987934705465, + "grad_norm": 4.0712761878967285, + "learning_rate": 9.886926898509583e-05, + "loss": 0.20656538009643555, + "step": 7970 + }, + { + "epoch": 1.1327182398864444, + "grad_norm": 4.4091291427612305, + "learning_rate": 9.886784953867992e-05, + "loss": 0.28663394451141355, + "step": 7980 + }, + { + "epoch": 1.134137686302342, + "grad_norm": 10.553000450134277, + "learning_rate": 9.886643009226402e-05, + "loss": 0.319093132019043, + "step": 7990 + }, + { + "epoch": 1.1355571327182399, + "grad_norm": 6.1367597579956055, + "learning_rate": 9.886501064584812e-05, + "loss": 0.19342881441116333, + "step": 8000 + }, + { + "epoch": 1.1355571327182399, + "eval_accuracy": 0.9207731925987156, + "eval_loss": 0.24032267928123474, + "eval_runtime": 32.4949, + "eval_samples_per_second": 483.984, + "eval_steps_per_second": 15.141, + "step": 8000 + }, + { + "epoch": 1.1369765791341377, + "grad_norm": 2.405918598175049, + "learning_rate": 9.886359119943223e-05, + "loss": 0.22856481075286866, + "step": 8010 + }, + { + "epoch": 1.1383960255500356, + "grad_norm": 3.4976019859313965, + "learning_rate": 9.886217175301633e-05, + "loss": 0.18118438720703126, + "step": 8020 + }, + { + "epoch": 1.1398154719659332, 
+ "grad_norm": 6.432300567626953, + "learning_rate": 9.886075230660044e-05, + "loss": 0.21989898681640624, + "step": 8030 + }, + { + "epoch": 1.141234918381831, + "grad_norm": 8.299015045166016, + "learning_rate": 9.885933286018453e-05, + "loss": 0.18632423877716064, + "step": 8040 + }, + { + "epoch": 1.142654364797729, + "grad_norm": 4.741350173950195, + "learning_rate": 9.885791341376863e-05, + "loss": 0.3003889799118042, + "step": 8050 + }, + { + "epoch": 1.1440738112136266, + "grad_norm": 2.561021327972412, + "learning_rate": 9.885649396735274e-05, + "loss": 0.20989477634429932, + "step": 8060 + }, + { + "epoch": 1.1454932576295245, + "grad_norm": 4.419784069061279, + "learning_rate": 9.885507452093684e-05, + "loss": 0.20898723602294922, + "step": 8070 + }, + { + "epoch": 1.1469127040454223, + "grad_norm": 4.329728603363037, + "learning_rate": 9.885365507452095e-05, + "loss": 0.191938316822052, + "step": 8080 + }, + { + "epoch": 1.1483321504613202, + "grad_norm": 5.096283912658691, + "learning_rate": 9.885223562810503e-05, + "loss": 0.21612834930419922, + "step": 8090 + }, + { + "epoch": 1.1497515968772178, + "grad_norm": 7.623912811279297, + "learning_rate": 9.885081618168915e-05, + "loss": 0.2056267261505127, + "step": 8100 + }, + { + "epoch": 1.1511710432931157, + "grad_norm": 5.211782455444336, + "learning_rate": 9.884939673527324e-05, + "loss": 0.2458388090133667, + "step": 8110 + }, + { + "epoch": 1.1525904897090136, + "grad_norm": 4.73144006729126, + "learning_rate": 9.884797728885735e-05, + "loss": 0.2795632123947144, + "step": 8120 + }, + { + "epoch": 1.1540099361249112, + "grad_norm": 4.658935546875, + "learning_rate": 9.884655784244145e-05, + "loss": 0.19132717847824096, + "step": 8130 + }, + { + "epoch": 1.155429382540809, + "grad_norm": 2.4226841926574707, + "learning_rate": 9.884513839602555e-05, + "loss": 0.2345660448074341, + "step": 8140 + }, + { + "epoch": 1.156848828956707, + "grad_norm": 4.741151809692383, + "learning_rate": 
9.884371894960966e-05, + "loss": 0.16295211315155028, + "step": 8150 + }, + { + "epoch": 1.1582682753726048, + "grad_norm": 5.364559173583984, + "learning_rate": 9.884229950319376e-05, + "loss": 0.32001848220825196, + "step": 8160 + }, + { + "epoch": 1.1596877217885024, + "grad_norm": 5.700736045837402, + "learning_rate": 9.884088005677787e-05, + "loss": 0.2149799346923828, + "step": 8170 + }, + { + "epoch": 1.1611071682044003, + "grad_norm": 8.003674507141113, + "learning_rate": 9.883946061036197e-05, + "loss": 0.1882821202278137, + "step": 8180 + }, + { + "epoch": 1.1625266146202982, + "grad_norm": 4.5582122802734375, + "learning_rate": 9.883804116394608e-05, + "loss": 0.21344914436340331, + "step": 8190 + }, + { + "epoch": 1.1639460610361958, + "grad_norm": 7.819937229156494, + "learning_rate": 9.883662171753016e-05, + "loss": 0.20212192535400392, + "step": 8200 + }, + { + "epoch": 1.1653655074520937, + "grad_norm": 4.706314563751221, + "learning_rate": 9.883520227111427e-05, + "loss": 0.23133435249328613, + "step": 8210 + }, + { + "epoch": 1.1667849538679915, + "grad_norm": 6.7971343994140625, + "learning_rate": 9.883378282469837e-05, + "loss": 0.2259516477584839, + "step": 8220 + }, + { + "epoch": 1.1682044002838894, + "grad_norm": 6.324117183685303, + "learning_rate": 9.883236337828248e-05, + "loss": 0.2526458024978638, + "step": 8230 + }, + { + "epoch": 1.169623846699787, + "grad_norm": 11.824000358581543, + "learning_rate": 9.883094393186658e-05, + "loss": 0.28786749839782716, + "step": 8240 + }, + { + "epoch": 1.171043293115685, + "grad_norm": 6.5561089515686035, + "learning_rate": 9.882952448545067e-05, + "loss": 0.2411046028137207, + "step": 8250 + }, + { + "epoch": 1.1724627395315828, + "grad_norm": 9.257662773132324, + "learning_rate": 9.882810503903479e-05, + "loss": 0.2078631639480591, + "step": 8260 + }, + { + "epoch": 1.1738821859474804, + "grad_norm": 6.388674736022949, + "learning_rate": 9.882668559261888e-05, + "loss": 0.2299574851989746, + 
"step": 8270 + }, + { + "epoch": 1.1753016323633783, + "grad_norm": 5.7360992431640625, + "learning_rate": 9.8825266146203e-05, + "loss": 0.18881726264953613, + "step": 8280 + }, + { + "epoch": 1.1767210787792761, + "grad_norm": 6.240981578826904, + "learning_rate": 9.882384669978709e-05, + "loss": 0.1505158066749573, + "step": 8290 + }, + { + "epoch": 1.178140525195174, + "grad_norm": 5.832661151885986, + "learning_rate": 9.882242725337119e-05, + "loss": 0.22867400646209718, + "step": 8300 + }, + { + "epoch": 1.1795599716110716, + "grad_norm": 10.773929595947266, + "learning_rate": 9.882100780695529e-05, + "loss": 0.1888264536857605, + "step": 8310 + }, + { + "epoch": 1.1809794180269695, + "grad_norm": 3.489490509033203, + "learning_rate": 9.88195883605394e-05, + "loss": 0.1748473525047302, + "step": 8320 + }, + { + "epoch": 1.1823988644428673, + "grad_norm": 5.332619667053223, + "learning_rate": 9.88181689141235e-05, + "loss": 0.20995078086853028, + "step": 8330 + }, + { + "epoch": 1.183818310858765, + "grad_norm": 4.1643147468566895, + "learning_rate": 9.88167494677076e-05, + "loss": 0.17949424982070922, + "step": 8340 + }, + { + "epoch": 1.1852377572746629, + "grad_norm": 5.263898849487305, + "learning_rate": 9.88153300212917e-05, + "loss": 0.17099075317382811, + "step": 8350 + }, + { + "epoch": 1.1866572036905607, + "grad_norm": 10.222403526306152, + "learning_rate": 9.88139105748758e-05, + "loss": 0.163385272026062, + "step": 8360 + }, + { + "epoch": 1.1880766501064586, + "grad_norm": 4.657668113708496, + "learning_rate": 9.881249112845991e-05, + "loss": 0.2960475444793701, + "step": 8370 + }, + { + "epoch": 1.1894960965223562, + "grad_norm": 4.420619964599609, + "learning_rate": 9.881107168204401e-05, + "loss": 0.1871565818786621, + "step": 8380 + }, + { + "epoch": 1.190915542938254, + "grad_norm": 6.741722583770752, + "learning_rate": 9.880965223562812e-05, + "loss": 0.18152236938476562, + "step": 8390 + }, + { + "epoch": 1.192334989354152, + "grad_norm": 
7.203516483306885, + "learning_rate": 9.88082327892122e-05, + "loss": 0.21214077472686768, + "step": 8400 + }, + { + "epoch": 1.1937544357700496, + "grad_norm": 4.927282810211182, + "learning_rate": 9.880681334279631e-05, + "loss": 0.2104212999343872, + "step": 8410 + }, + { + "epoch": 1.1951738821859474, + "grad_norm": 5.8592023849487305, + "learning_rate": 9.880539389638041e-05, + "loss": 0.2139230728149414, + "step": 8420 + }, + { + "epoch": 1.1965933286018453, + "grad_norm": 7.09868860244751, + "learning_rate": 9.880397444996452e-05, + "loss": 0.1821369171142578, + "step": 8430 + }, + { + "epoch": 1.1980127750177432, + "grad_norm": 3.22680401802063, + "learning_rate": 9.880255500354862e-05, + "loss": 0.20524086952209472, + "step": 8440 + }, + { + "epoch": 1.1994322214336408, + "grad_norm": 6.953636169433594, + "learning_rate": 9.880113555713272e-05, + "loss": 0.12908190488815308, + "step": 8450 + }, + { + "epoch": 1.2008516678495387, + "grad_norm": 3.305361032485962, + "learning_rate": 9.879971611071683e-05, + "loss": 0.21676282882690429, + "step": 8460 + }, + { + "epoch": 1.2022711142654365, + "grad_norm": 5.03612756729126, + "learning_rate": 9.879829666430093e-05, + "loss": 0.21339573860168456, + "step": 8470 + }, + { + "epoch": 1.2036905606813342, + "grad_norm": 8.03529167175293, + "learning_rate": 9.879687721788504e-05, + "loss": 0.22714192867279054, + "step": 8480 + }, + { + "epoch": 1.205110007097232, + "grad_norm": 11.267200469970703, + "learning_rate": 9.879545777146913e-05, + "loss": 0.2318274736404419, + "step": 8490 + }, + { + "epoch": 1.20652945351313, + "grad_norm": 4.298351764678955, + "learning_rate": 9.879403832505323e-05, + "loss": 0.13804138898849488, + "step": 8500 + }, + { + "epoch": 1.20652945351313, + "eval_accuracy": 0.9154320595154829, + "eval_loss": 0.2389156073331833, + "eval_runtime": 32.8287, + "eval_samples_per_second": 479.062, + "eval_steps_per_second": 14.987, + "step": 8500 + }, + { + "epoch": 1.2079488999290278, + "grad_norm": 
7.828441619873047, + "learning_rate": 9.879261887863733e-05, + "loss": 0.22812976837158203, + "step": 8510 + }, + { + "epoch": 1.2093683463449254, + "grad_norm": 6.791322708129883, + "learning_rate": 9.879119943222144e-05, + "loss": 0.2314612865447998, + "step": 8520 + }, + { + "epoch": 1.2107877927608233, + "grad_norm": 2.5891473293304443, + "learning_rate": 9.878977998580554e-05, + "loss": 0.2156294107437134, + "step": 8530 + }, + { + "epoch": 1.2122072391767211, + "grad_norm": 8.005664825439453, + "learning_rate": 9.878836053938965e-05, + "loss": 0.2180927038192749, + "step": 8540 + }, + { + "epoch": 1.2136266855926188, + "grad_norm": 4.849853515625, + "learning_rate": 9.878694109297374e-05, + "loss": 0.2122575521469116, + "step": 8550 + }, + { + "epoch": 1.2150461320085166, + "grad_norm": 2.7616207599639893, + "learning_rate": 9.878552164655784e-05, + "loss": 0.17834146022796632, + "step": 8560 + }, + { + "epoch": 1.2164655784244145, + "grad_norm": 5.352903366088867, + "learning_rate": 9.878410220014195e-05, + "loss": 0.13497724533081054, + "step": 8570 + }, + { + "epoch": 1.2178850248403124, + "grad_norm": 8.255563735961914, + "learning_rate": 9.878268275372605e-05, + "loss": 0.19454526901245117, + "step": 8580 + }, + { + "epoch": 1.21930447125621, + "grad_norm": 3.5060651302337646, + "learning_rate": 9.878126330731016e-05, + "loss": 0.23703739643096924, + "step": 8590 + }, + { + "epoch": 1.2207239176721079, + "grad_norm": 5.917641639709473, + "learning_rate": 9.877984386089426e-05, + "loss": 0.1788935661315918, + "step": 8600 + }, + { + "epoch": 1.2221433640880057, + "grad_norm": 7.5726542472839355, + "learning_rate": 9.877842441447836e-05, + "loss": 0.1879301905632019, + "step": 8610 + }, + { + "epoch": 1.2235628105039034, + "grad_norm": 6.313500881195068, + "learning_rate": 9.877700496806245e-05, + "loss": 0.19519026279449464, + "step": 8620 + }, + { + "epoch": 1.2249822569198012, + "grad_norm": 6.073189735412598, + "learning_rate": 9.877558552164656e-05, + 
"loss": 0.16100149154663085, + "step": 8630 + }, + { + "epoch": 1.226401703335699, + "grad_norm": 9.31675910949707, + "learning_rate": 9.877416607523066e-05, + "loss": 0.24087250232696533, + "step": 8640 + }, + { + "epoch": 1.227821149751597, + "grad_norm": 6.469115734100342, + "learning_rate": 9.877274662881477e-05, + "loss": 0.15760414600372313, + "step": 8650 + }, + { + "epoch": 1.2292405961674946, + "grad_norm": 5.7666192054748535, + "learning_rate": 9.877132718239887e-05, + "loss": 0.2261284589767456, + "step": 8660 + }, + { + "epoch": 1.2306600425833925, + "grad_norm": 7.881688117980957, + "learning_rate": 9.876990773598297e-05, + "loss": 0.22792091369628906, + "step": 8670 + }, + { + "epoch": 1.2320794889992903, + "grad_norm": 4.771458625793457, + "learning_rate": 9.876848828956708e-05, + "loss": 0.21116392612457274, + "step": 8680 + }, + { + "epoch": 1.233498935415188, + "grad_norm": 9.804439544677734, + "learning_rate": 9.876706884315118e-05, + "loss": 0.25815906524658205, + "step": 8690 + }, + { + "epoch": 1.2349183818310858, + "grad_norm": 3.326082229614258, + "learning_rate": 9.876564939673529e-05, + "loss": 0.21468789577484132, + "step": 8700 + }, + { + "epoch": 1.2363378282469837, + "grad_norm": 3.82004714012146, + "learning_rate": 9.876422995031937e-05, + "loss": 0.17646214962005616, + "step": 8710 + }, + { + "epoch": 1.2377572746628815, + "grad_norm": 7.979610443115234, + "learning_rate": 9.876281050390348e-05, + "loss": 0.23217053413391114, + "step": 8720 + }, + { + "epoch": 1.2391767210787792, + "grad_norm": 6.828559398651123, + "learning_rate": 9.876139105748758e-05, + "loss": 0.226235294342041, + "step": 8730 + }, + { + "epoch": 1.240596167494677, + "grad_norm": 7.083154678344727, + "learning_rate": 9.875997161107169e-05, + "loss": 0.2136064052581787, + "step": 8740 + }, + { + "epoch": 1.242015613910575, + "grad_norm": 8.167536735534668, + "learning_rate": 9.875855216465579e-05, + "loss": 0.20408027172088622, + "step": 8750 + }, + { + "epoch": 
1.2434350603264726, + "grad_norm": 7.635597229003906, + "learning_rate": 9.875713271823988e-05, + "loss": 0.2205681324005127, + "step": 8760 + }, + { + "epoch": 1.2448545067423704, + "grad_norm": 6.944504737854004, + "learning_rate": 9.8755713271824e-05, + "loss": 0.14819756746292115, + "step": 8770 + }, + { + "epoch": 1.2462739531582683, + "grad_norm": 7.144880771636963, + "learning_rate": 9.875429382540809e-05, + "loss": 0.25865755081176756, + "step": 8780 + }, + { + "epoch": 1.2476933995741661, + "grad_norm": 4.50839900970459, + "learning_rate": 9.87528743789922e-05, + "loss": 0.19764204025268556, + "step": 8790 + }, + { + "epoch": 1.2491128459900638, + "grad_norm": 3.0644021034240723, + "learning_rate": 9.87514549325763e-05, + "loss": 0.23454864025115968, + "step": 8800 + }, + { + "epoch": 1.2505322924059616, + "grad_norm": 6.562272548675537, + "learning_rate": 9.87500354861604e-05, + "loss": 0.2683814525604248, + "step": 8810 + }, + { + "epoch": 1.2519517388218595, + "grad_norm": 4.825582027435303, + "learning_rate": 9.87486160397445e-05, + "loss": 0.2111285924911499, + "step": 8820 + }, + { + "epoch": 1.2533711852377571, + "grad_norm": 5.02101469039917, + "learning_rate": 9.87471965933286e-05, + "loss": 0.20650248527526854, + "step": 8830 + }, + { + "epoch": 1.254790631653655, + "grad_norm": 6.4850754737854, + "learning_rate": 9.87457771469127e-05, + "loss": 0.18662099838256835, + "step": 8840 + }, + { + "epoch": 1.2562100780695529, + "grad_norm": 6.745723724365234, + "learning_rate": 9.874435770049682e-05, + "loss": 0.12750645875930786, + "step": 8850 + }, + { + "epoch": 1.2576295244854507, + "grad_norm": 10.856019973754883, + "learning_rate": 9.874293825408091e-05, + "loss": 0.22051913738250734, + "step": 8860 + }, + { + "epoch": 1.2590489709013486, + "grad_norm": 7.022629737854004, + "learning_rate": 9.874151880766501e-05, + "loss": 0.2626792907714844, + "step": 8870 + }, + { + "epoch": 1.2604684173172462, + "grad_norm": 8.997479438781738, + 
"learning_rate": 9.874009936124912e-05, + "loss": 0.22494235038757324, + "step": 8880 + }, + { + "epoch": 1.261887863733144, + "grad_norm": 8.640801429748535, + "learning_rate": 9.873867991483322e-05, + "loss": 0.21826319694519042, + "step": 8890 + }, + { + "epoch": 1.2633073101490417, + "grad_norm": 4.579946517944336, + "learning_rate": 9.873726046841733e-05, + "loss": 0.18379125595092774, + "step": 8900 + }, + { + "epoch": 1.2647267565649396, + "grad_norm": 6.971579074859619, + "learning_rate": 9.873584102200143e-05, + "loss": 0.23222970962524414, + "step": 8910 + }, + { + "epoch": 1.2661462029808375, + "grad_norm": 6.197728633880615, + "learning_rate": 9.873442157558552e-05, + "loss": 0.23273870944976807, + "step": 8920 + }, + { + "epoch": 1.2675656493967353, + "grad_norm": 9.468696594238281, + "learning_rate": 9.873300212916962e-05, + "loss": 0.18107137680053711, + "step": 8930 + }, + { + "epoch": 1.2689850958126332, + "grad_norm": 3.7539901733398438, + "learning_rate": 9.873158268275373e-05, + "loss": 0.1382051467895508, + "step": 8940 + }, + { + "epoch": 1.2704045422285308, + "grad_norm": 7.013411521911621, + "learning_rate": 9.873016323633783e-05, + "loss": 0.13840343952178955, + "step": 8950 + }, + { + "epoch": 1.2718239886444287, + "grad_norm": 4.136613845825195, + "learning_rate": 9.872874378992194e-05, + "loss": 0.27057197093963625, + "step": 8960 + }, + { + "epoch": 1.2732434350603263, + "grad_norm": 7.147876262664795, + "learning_rate": 9.872732434350604e-05, + "loss": 0.19125341176986693, + "step": 8970 + }, + { + "epoch": 1.2746628814762242, + "grad_norm": 1.9221298694610596, + "learning_rate": 9.872590489709014e-05, + "loss": 0.22451837062835694, + "step": 8980 + }, + { + "epoch": 1.276082327892122, + "grad_norm": 10.765070915222168, + "learning_rate": 9.872448545067425e-05, + "loss": 0.2057518482208252, + "step": 8990 + }, + { + "epoch": 1.27750177430802, + "grad_norm": 3.960794448852539, + "learning_rate": 9.872306600425834e-05, + "loss": 
0.21558022499084473, + "step": 9000 + }, + { + "epoch": 1.27750177430802, + "eval_accuracy": 0.907420359890634, + "eval_loss": 0.2675907015800476, + "eval_runtime": 32.1907, + "eval_samples_per_second": 488.557, + "eval_steps_per_second": 15.284, + "step": 9000 + }, + { + "epoch": 1.2789212207239178, + "grad_norm": 6.640925884246826, + "learning_rate": 9.872164655784245e-05, + "loss": 0.21932268142700195, + "step": 9010 + }, + { + "epoch": 1.2803406671398154, + "grad_norm": 3.883657455444336, + "learning_rate": 9.872022711142654e-05, + "loss": 0.20566184520721437, + "step": 9020 + }, + { + "epoch": 1.2817601135557133, + "grad_norm": 8.243616104125977, + "learning_rate": 9.871880766501065e-05, + "loss": 0.1661081552505493, + "step": 9030 + }, + { + "epoch": 1.2831795599716112, + "grad_norm": 9.827435493469238, + "learning_rate": 9.871738821859475e-05, + "loss": 0.17904939651489257, + "step": 9040 + }, + { + "epoch": 1.2845990063875088, + "grad_norm": 7.80245304107666, + "learning_rate": 9.871596877217886e-05, + "loss": 0.15805249214172362, + "step": 9050 + }, + { + "epoch": 1.2860184528034067, + "grad_norm": 4.689866542816162, + "learning_rate": 9.871454932576297e-05, + "loss": 0.23644819259643554, + "step": 9060 + }, + { + "epoch": 1.2874378992193045, + "grad_norm": 6.257835865020752, + "learning_rate": 9.871312987934705e-05, + "loss": 0.2536448955535889, + "step": 9070 + }, + { + "epoch": 1.2888573456352024, + "grad_norm": 1.8020100593566895, + "learning_rate": 9.871185237757275e-05, + "loss": 0.1373010277748108, + "step": 9080 + }, + { + "epoch": 1.2902767920511, + "grad_norm": 4.135176658630371, + "learning_rate": 9.871043293115685e-05, + "loss": 0.1967120051383972, + "step": 9090 + }, + { + "epoch": 1.2916962384669979, + "grad_norm": 5.261960506439209, + "learning_rate": 9.870901348474096e-05, + "loss": 0.21039602756500245, + "step": 9100 + }, + { + "epoch": 1.2931156848828957, + "grad_norm": 6.985999584197998, + "learning_rate": 9.870759403832506e-05, + 
"loss": 0.22036538124084473, + "step": 9110 + }, + { + "epoch": 1.2945351312987934, + "grad_norm": 3.4260783195495605, + "learning_rate": 9.870617459190917e-05, + "loss": 0.2039936065673828, + "step": 9120 + }, + { + "epoch": 1.2959545777146912, + "grad_norm": 3.7384250164031982, + "learning_rate": 9.870475514549326e-05, + "loss": 0.20263819694519042, + "step": 9130 + }, + { + "epoch": 1.297374024130589, + "grad_norm": 3.172229528427124, + "learning_rate": 9.870333569907736e-05, + "loss": 0.13130682706832886, + "step": 9140 + }, + { + "epoch": 1.298793470546487, + "grad_norm": 12.370247840881348, + "learning_rate": 9.870191625266146e-05, + "loss": 0.20618796348571777, + "step": 9150 + }, + { + "epoch": 1.3002129169623846, + "grad_norm": 7.193541049957275, + "learning_rate": 9.870049680624557e-05, + "loss": 0.2788748264312744, + "step": 9160 + }, + { + "epoch": 1.3016323633782825, + "grad_norm": 4.76792573928833, + "learning_rate": 9.869907735982967e-05, + "loss": 0.18996012210845947, + "step": 9170 + }, + { + "epoch": 1.3030518097941803, + "grad_norm": 3.7090489864349365, + "learning_rate": 9.869765791341378e-05, + "loss": 0.18860991001129152, + "step": 9180 + }, + { + "epoch": 1.304471256210078, + "grad_norm": 6.190913677215576, + "learning_rate": 9.869623846699788e-05, + "loss": 0.224440860748291, + "step": 9190 + }, + { + "epoch": 1.3058907026259758, + "grad_norm": 3.286689281463623, + "learning_rate": 9.869481902058197e-05, + "loss": 0.20683689117431642, + "step": 9200 + }, + { + "epoch": 1.3073101490418737, + "grad_norm": 4.6291937828063965, + "learning_rate": 9.869339957416608e-05, + "loss": 0.19128093719482422, + "step": 9210 + }, + { + "epoch": 1.3087295954577716, + "grad_norm": 8.739839553833008, + "learning_rate": 9.869198012775018e-05, + "loss": 0.21355061531066893, + "step": 9220 + }, + { + "epoch": 1.3101490418736692, + "grad_norm": 4.578412055969238, + "learning_rate": 9.869056068133429e-05, + "loss": 0.1978748083114624, + "step": 9230 + }, + { + 
"epoch": 1.311568488289567, + "grad_norm": 5.891171932220459, + "learning_rate": 9.868914123491839e-05, + "loss": 0.21060125827789306, + "step": 9240 + }, + { + "epoch": 1.312987934705465, + "grad_norm": 8.383025169372559, + "learning_rate": 9.868772178850249e-05, + "loss": 0.29614646434783937, + "step": 9250 + }, + { + "epoch": 1.3144073811213626, + "grad_norm": 7.3245930671691895, + "learning_rate": 9.868630234208658e-05, + "loss": 0.22820439338684081, + "step": 9260 + }, + { + "epoch": 1.3158268275372604, + "grad_norm": 3.143709182739258, + "learning_rate": 9.86848828956707e-05, + "loss": 0.1735852003097534, + "step": 9270 + }, + { + "epoch": 1.3172462739531583, + "grad_norm": 8.565205574035645, + "learning_rate": 9.868346344925479e-05, + "loss": 0.175143563747406, + "step": 9280 + }, + { + "epoch": 1.3186657203690562, + "grad_norm": 5.662914752960205, + "learning_rate": 9.86820440028389e-05, + "loss": 0.19213972091674805, + "step": 9290 + }, + { + "epoch": 1.3200851667849538, + "grad_norm": 7.872828960418701, + "learning_rate": 9.8680624556423e-05, + "loss": 0.14704231023788453, + "step": 9300 + }, + { + "epoch": 1.3215046132008517, + "grad_norm": 11.20383071899414, + "learning_rate": 9.86792051100071e-05, + "loss": 0.24307498931884766, + "step": 9310 + }, + { + "epoch": 1.3229240596167495, + "grad_norm": 2.9435956478118896, + "learning_rate": 9.867778566359121e-05, + "loss": 0.23251771926879883, + "step": 9320 + }, + { + "epoch": 1.3243435060326472, + "grad_norm": 3.8682780265808105, + "learning_rate": 9.867636621717531e-05, + "loss": 0.21560065746307372, + "step": 9330 + }, + { + "epoch": 1.325762952448545, + "grad_norm": 7.9737420082092285, + "learning_rate": 9.867494677075942e-05, + "loss": 0.1927724599838257, + "step": 9340 + }, + { + "epoch": 1.327182398864443, + "grad_norm": 6.955791473388672, + "learning_rate": 9.86735273243435e-05, + "loss": 0.22344651222229003, + "step": 9350 + }, + { + "epoch": 1.3286018452803408, + "grad_norm": 9.098529815673828, + 
"learning_rate": 9.867210787792761e-05, + "loss": 0.2260176420211792, + "step": 9360 + }, + { + "epoch": 1.3300212916962384, + "grad_norm": 5.625829219818115, + "learning_rate": 9.867068843151171e-05, + "loss": 0.1760912299156189, + "step": 9370 + }, + { + "epoch": 1.3314407381121363, + "grad_norm": 2.4090805053710938, + "learning_rate": 9.866926898509582e-05, + "loss": 0.16904083490371705, + "step": 9380 + }, + { + "epoch": 1.3328601845280341, + "grad_norm": 4.635160446166992, + "learning_rate": 9.866784953867992e-05, + "loss": 0.21562621593475342, + "step": 9390 + }, + { + "epoch": 1.3342796309439318, + "grad_norm": 8.606550216674805, + "learning_rate": 9.866643009226402e-05, + "loss": 0.21092190742492675, + "step": 9400 + }, + { + "epoch": 1.3356990773598296, + "grad_norm": 5.678009033203125, + "learning_rate": 9.866501064584813e-05, + "loss": 0.19930131435394288, + "step": 9410 + }, + { + "epoch": 1.3371185237757275, + "grad_norm": 6.880139350891113, + "learning_rate": 9.866359119943222e-05, + "loss": 0.3152653217315674, + "step": 9420 + }, + { + "epoch": 1.3385379701916253, + "grad_norm": 5.563040733337402, + "learning_rate": 9.866217175301633e-05, + "loss": 0.18800781965255736, + "step": 9430 + }, + { + "epoch": 1.339957416607523, + "grad_norm": 2.5089986324310303, + "learning_rate": 9.866075230660043e-05, + "loss": 0.11295425891876221, + "step": 9440 + }, + { + "epoch": 1.3413768630234209, + "grad_norm": 4.770693302154541, + "learning_rate": 9.865933286018453e-05, + "loss": 0.18411701917648315, + "step": 9450 + }, + { + "epoch": 1.3427963094393187, + "grad_norm": 4.498220920562744, + "learning_rate": 9.865791341376863e-05, + "loss": 0.2168651342391968, + "step": 9460 + }, + { + "epoch": 1.3442157558552164, + "grad_norm": 3.5189125537872314, + "learning_rate": 9.865649396735274e-05, + "loss": 0.23824927806854249, + "step": 9470 + }, + { + "epoch": 1.3456352022711142, + "grad_norm": 5.034974098205566, + "learning_rate": 9.865507452093684e-05, + "loss": 
0.14622821807861328, + "step": 9480 + }, + { + "epoch": 1.347054648687012, + "grad_norm": 2.3215811252593994, + "learning_rate": 9.865365507452095e-05, + "loss": 0.11778559684753417, + "step": 9490 + }, + { + "epoch": 1.34847409510291, + "grad_norm": 4.806303977966309, + "learning_rate": 9.865223562810504e-05, + "loss": 0.12332210540771485, + "step": 9500 + }, + { + "epoch": 1.34847409510291, + "eval_accuracy": 0.9099637565969352, + "eval_loss": 0.2493496835231781, + "eval_runtime": 31.6926, + "eval_samples_per_second": 496.236, + "eval_steps_per_second": 15.524, + "step": 9500 + }, + { + "epoch": 1.3498935415188076, + "grad_norm": 6.961501598358154, + "learning_rate": 9.865081618168914e-05, + "loss": 0.2591987371444702, + "step": 9510 + }, + { + "epoch": 1.3513129879347054, + "grad_norm": 4.2426323890686035, + "learning_rate": 9.864939673527325e-05, + "loss": 0.17831168174743653, + "step": 9520 + }, + { + "epoch": 1.3527324343506033, + "grad_norm": 6.4358625411987305, + "learning_rate": 9.864797728885735e-05, + "loss": 0.2314450740814209, + "step": 9530 + }, + { + "epoch": 1.354151880766501, + "grad_norm": 5.79241943359375, + "learning_rate": 9.864655784244146e-05, + "loss": 0.18896229267120362, + "step": 9540 + }, + { + "epoch": 1.3555713271823988, + "grad_norm": 7.353359699249268, + "learning_rate": 9.864513839602554e-05, + "loss": 0.19705621004104615, + "step": 9550 + }, + { + "epoch": 1.3569907735982967, + "grad_norm": 6.934425354003906, + "learning_rate": 9.864371894960966e-05, + "loss": 0.17384577989578248, + "step": 9560 + }, + { + "epoch": 1.3584102200141945, + "grad_norm": 5.2685394287109375, + "learning_rate": 9.864229950319375e-05, + "loss": 0.2469557285308838, + "step": 9570 + }, + { + "epoch": 1.3598296664300924, + "grad_norm": 6.054180145263672, + "learning_rate": 9.864088005677786e-05, + "loss": 0.2497105598449707, + "step": 9580 + }, + { + "epoch": 1.36124911284599, + "grad_norm": 3.806577444076538, + "learning_rate": 9.863946061036196e-05, + 
"loss": 0.16005023717880248, + "step": 9590 + }, + { + "epoch": 1.362668559261888, + "grad_norm": 9.077430725097656, + "learning_rate": 9.863804116394607e-05, + "loss": 0.24311597347259523, + "step": 9600 + }, + { + "epoch": 1.3640880056777855, + "grad_norm": 5.967398166656494, + "learning_rate": 9.863662171753017e-05, + "loss": 0.2098919153213501, + "step": 9610 + }, + { + "epoch": 1.3655074520936834, + "grad_norm": 5.3782172203063965, + "learning_rate": 9.863520227111427e-05, + "loss": 0.22856371402740477, + "step": 9620 + }, + { + "epoch": 1.3669268985095813, + "grad_norm": 7.211184501647949, + "learning_rate": 9.863378282469838e-05, + "loss": 0.19752051830291747, + "step": 9630 + }, + { + "epoch": 1.3683463449254791, + "grad_norm": 2.611245632171631, + "learning_rate": 9.863236337828247e-05, + "loss": 0.20763750076293946, + "step": 9640 + }, + { + "epoch": 1.369765791341377, + "grad_norm": 7.055820465087891, + "learning_rate": 9.863094393186659e-05, + "loss": 0.18712767362594604, + "step": 9650 + }, + { + "epoch": 1.3711852377572746, + "grad_norm": 7.2558112144470215, + "learning_rate": 9.862952448545067e-05, + "loss": 0.24251337051391603, + "step": 9660 + }, + { + "epoch": 1.3726046841731725, + "grad_norm": 6.948854446411133, + "learning_rate": 9.862810503903478e-05, + "loss": 0.1610349178314209, + "step": 9670 + }, + { + "epoch": 1.3740241305890701, + "grad_norm": 6.58130407333374, + "learning_rate": 9.862668559261888e-05, + "loss": 0.1934449315071106, + "step": 9680 + }, + { + "epoch": 1.375443577004968, + "grad_norm": 3.3496904373168945, + "learning_rate": 9.862526614620299e-05, + "loss": 0.17610930204391478, + "step": 9690 + }, + { + "epoch": 1.3768630234208659, + "grad_norm": 9.198835372924805, + "learning_rate": 9.862384669978709e-05, + "loss": 0.17025632858276368, + "step": 9700 + }, + { + "epoch": 1.3782824698367637, + "grad_norm": 1.7735481262207031, + "learning_rate": 9.862242725337118e-05, + "loss": 0.20825440883636476, + "step": 9710 + }, + { + 
"epoch": 1.3797019162526616, + "grad_norm": 6.809709548950195, + "learning_rate": 9.86210078069553e-05, + "loss": 0.18874866962432862, + "step": 9720 + }, + { + "epoch": 1.3811213626685592, + "grad_norm": 8.268877029418945, + "learning_rate": 9.861958836053939e-05, + "loss": 0.26922762393951416, + "step": 9730 + }, + { + "epoch": 1.382540809084457, + "grad_norm": 2.897256851196289, + "learning_rate": 9.86181689141235e-05, + "loss": 0.24385275840759277, + "step": 9740 + }, + { + "epoch": 1.3839602555003547, + "grad_norm": 3.334864616394043, + "learning_rate": 9.86167494677076e-05, + "loss": 0.16869350671768188, + "step": 9750 + }, + { + "epoch": 1.3853797019162526, + "grad_norm": 7.382256984710693, + "learning_rate": 9.86153300212917e-05, + "loss": 0.18727898597717285, + "step": 9760 + }, + { + "epoch": 1.3867991483321505, + "grad_norm": 3.0756566524505615, + "learning_rate": 9.86139105748758e-05, + "loss": 0.1948513627052307, + "step": 9770 + }, + { + "epoch": 1.3882185947480483, + "grad_norm": 7.820052146911621, + "learning_rate": 9.86124911284599e-05, + "loss": 0.1906062364578247, + "step": 9780 + }, + { + "epoch": 1.3896380411639462, + "grad_norm": 5.2213263511657715, + "learning_rate": 9.8611071682044e-05, + "loss": 0.19792075157165528, + "step": 9790 + }, + { + "epoch": 1.3910574875798438, + "grad_norm": 9.714534759521484, + "learning_rate": 9.860965223562811e-05, + "loss": 0.17712973356246947, + "step": 9800 + }, + { + "epoch": 1.3924769339957417, + "grad_norm": 4.078144073486328, + "learning_rate": 9.860823278921221e-05, + "loss": 0.18135050535202027, + "step": 9810 + }, + { + "epoch": 1.3938963804116393, + "grad_norm": 5.219580173492432, + "learning_rate": 9.860681334279631e-05, + "loss": 0.227278733253479, + "step": 9820 + }, + { + "epoch": 1.3953158268275372, + "grad_norm": 6.879891395568848, + "learning_rate": 9.860539389638042e-05, + "loss": 0.215889835357666, + "step": 9830 + }, + { + "epoch": 1.396735273243435, + "grad_norm": 9.455697059631348, + 
"learning_rate": 9.860397444996452e-05, + "loss": 0.16740819215774536, + "step": 9840 + }, + { + "epoch": 1.398154719659333, + "grad_norm": 4.630984306335449, + "learning_rate": 9.860255500354863e-05, + "loss": 0.22700212001800538, + "step": 9850 + }, + { + "epoch": 1.3995741660752308, + "grad_norm": 6.121819972991943, + "learning_rate": 9.860113555713271e-05, + "loss": 0.220161509513855, + "step": 9860 + }, + { + "epoch": 1.4009936124911284, + "grad_norm": 2.6966371536254883, + "learning_rate": 9.859971611071682e-05, + "loss": 0.18548699617385864, + "step": 9870 + }, + { + "epoch": 1.4024130589070263, + "grad_norm": 4.1472554206848145, + "learning_rate": 9.859829666430092e-05, + "loss": 0.18523939847946166, + "step": 9880 + }, + { + "epoch": 1.4038325053229241, + "grad_norm": 7.051137924194336, + "learning_rate": 9.859687721788503e-05, + "loss": 0.1325202226638794, + "step": 9890 + }, + { + "epoch": 1.4052519517388218, + "grad_norm": 5.540129661560059, + "learning_rate": 9.859545777146913e-05, + "loss": 0.16468173265457153, + "step": 9900 + }, + { + "epoch": 1.4066713981547196, + "grad_norm": 6.817564487457275, + "learning_rate": 9.859403832505323e-05, + "loss": 0.12863141298294067, + "step": 9910 + }, + { + "epoch": 1.4080908445706175, + "grad_norm": 2.415663719177246, + "learning_rate": 9.859261887863734e-05, + "loss": 0.1454537630081177, + "step": 9920 + }, + { + "epoch": 1.4095102909865154, + "grad_norm": 5.63126277923584, + "learning_rate": 9.859119943222143e-05, + "loss": 0.20712642669677733, + "step": 9930 + }, + { + "epoch": 1.410929737402413, + "grad_norm": 3.990525484085083, + "learning_rate": 9.858977998580555e-05, + "loss": 0.14999470710754395, + "step": 9940 + }, + { + "epoch": 1.4123491838183109, + "grad_norm": 4.665277004241943, + "learning_rate": 9.858836053938964e-05, + "loss": 0.1735332727432251, + "step": 9950 + }, + { + "epoch": 1.4137686302342087, + "grad_norm": 6.532275676727295, + "learning_rate": 9.858694109297375e-05, + "loss": 
0.18187229633331298, + "step": 9960 + }, + { + "epoch": 1.4151880766501064, + "grad_norm": 10.086085319519043, + "learning_rate": 9.858552164655784e-05, + "loss": 0.25496907234191896, + "step": 9970 + }, + { + "epoch": 1.4166075230660042, + "grad_norm": 8.85912036895752, + "learning_rate": 9.858410220014195e-05, + "loss": 0.21260628700256348, + "step": 9980 + }, + { + "epoch": 1.418026969481902, + "grad_norm": 3.1774983406066895, + "learning_rate": 9.858268275372605e-05, + "loss": 0.16666808128356933, + "step": 9990 + }, + { + "epoch": 1.4194464158978, + "grad_norm": 8.12264633178711, + "learning_rate": 9.858126330731016e-05, + "loss": 0.13021547794342042, + "step": 10000 + }, + { + "epoch": 1.4194464158978, + "eval_accuracy": 0.9303745151650029, + "eval_loss": 0.2065460979938507, + "eval_runtime": 32.8099, + "eval_samples_per_second": 479.338, + "eval_steps_per_second": 14.995, + "step": 10000 + }, + { + "epoch": 1.4208658623136976, + "grad_norm": 3.760587453842163, + "learning_rate": 9.857984386089427e-05, + "loss": 0.21676597595214844, + "step": 10010 + }, + { + "epoch": 1.4222853087295955, + "grad_norm": 6.741761207580566, + "learning_rate": 9.857842441447835e-05, + "loss": 0.22888615131378173, + "step": 10020 + }, + { + "epoch": 1.4237047551454933, + "grad_norm": 4.405668258666992, + "learning_rate": 9.857700496806246e-05, + "loss": 0.13688948154449462, + "step": 10030 + }, + { + "epoch": 1.425124201561391, + "grad_norm": 5.534117698669434, + "learning_rate": 9.857558552164656e-05, + "loss": 0.14423273801803588, + "step": 10040 + }, + { + "epoch": 1.4265436479772888, + "grad_norm": 5.10047721862793, + "learning_rate": 9.857416607523067e-05, + "loss": 0.2310737133026123, + "step": 10050 + }, + { + "epoch": 1.4279630943931867, + "grad_norm": 3.052246570587158, + "learning_rate": 9.857274662881477e-05, + "loss": 0.20977180004119872, + "step": 10060 + }, + { + "epoch": 1.4293825408090846, + "grad_norm": 9.701653480529785, + "learning_rate": 9.857132718239887e-05, 
+ "loss": 0.22714948654174805, + "step": 10070 + }, + { + "epoch": 1.4308019872249822, + "grad_norm": 2.72581148147583, + "learning_rate": 9.856990773598296e-05, + "loss": 0.2333024263381958, + "step": 10080 + }, + { + "epoch": 1.43222143364088, + "grad_norm": 8.234984397888184, + "learning_rate": 9.856848828956707e-05, + "loss": 0.21033647060394287, + "step": 10090 + }, + { + "epoch": 1.433640880056778, + "grad_norm": 4.618515491485596, + "learning_rate": 9.856706884315118e-05, + "loss": 0.2534619331359863, + "step": 10100 + }, + { + "epoch": 1.4350603264726756, + "grad_norm": 3.2053143978118896, + "learning_rate": 9.856564939673528e-05, + "loss": 0.18584598302841188, + "step": 10110 + }, + { + "epoch": 1.4364797728885734, + "grad_norm": 5.643956661224365, + "learning_rate": 9.856422995031938e-05, + "loss": 0.16008204221725464, + "step": 10120 + }, + { + "epoch": 1.4378992193044713, + "grad_norm": 7.6051201820373535, + "learning_rate": 9.856281050390348e-05, + "loss": 0.19140913486480712, + "step": 10130 + }, + { + "epoch": 1.4393186657203692, + "grad_norm": 8.58385181427002, + "learning_rate": 9.856139105748759e-05, + "loss": 0.22861852645874023, + "step": 10140 + }, + { + "epoch": 1.4407381121362668, + "grad_norm": 3.0554444789886475, + "learning_rate": 9.855997161107168e-05, + "loss": 0.14198927879333495, + "step": 10150 + }, + { + "epoch": 1.4421575585521647, + "grad_norm": 3.255782127380371, + "learning_rate": 9.85585521646558e-05, + "loss": 0.17290072441101073, + "step": 10160 + }, + { + "epoch": 1.4435770049680625, + "grad_norm": 4.403168678283691, + "learning_rate": 9.855713271823988e-05, + "loss": 0.19940041303634642, + "step": 10170 + }, + { + "epoch": 1.4449964513839602, + "grad_norm": 8.145320892333984, + "learning_rate": 9.855571327182399e-05, + "loss": 0.21902050971984863, + "step": 10180 + }, + { + "epoch": 1.446415897799858, + "grad_norm": 5.803956508636475, + "learning_rate": 9.85542938254081e-05, + "loss": 0.21828086376190187, + "step": 10190 + 
}, + { + "epoch": 1.4478353442157559, + "grad_norm": 8.805460929870605, + "learning_rate": 9.85528743789922e-05, + "loss": 0.23348815441131593, + "step": 10200 + }, + { + "epoch": 1.4492547906316537, + "grad_norm": 7.180856704711914, + "learning_rate": 9.855145493257631e-05, + "loss": 0.18313560485839844, + "step": 10210 + }, + { + "epoch": 1.4506742370475514, + "grad_norm": 7.773831844329834, + "learning_rate": 9.85500354861604e-05, + "loss": 0.18291949033737182, + "step": 10220 + }, + { + "epoch": 1.4520936834634492, + "grad_norm": 1.713024616241455, + "learning_rate": 9.85486160397445e-05, + "loss": 0.11751105785369872, + "step": 10230 + }, + { + "epoch": 1.453513129879347, + "grad_norm": 2.2637596130371094, + "learning_rate": 9.85471965933286e-05, + "loss": 0.14805399179458617, + "step": 10240 + }, + { + "epoch": 1.4549325762952448, + "grad_norm": 8.369937896728516, + "learning_rate": 9.854577714691271e-05, + "loss": 0.2501375198364258, + "step": 10250 + }, + { + "epoch": 1.4563520227111426, + "grad_norm": 9.403657913208008, + "learning_rate": 9.854435770049681e-05, + "loss": 0.1835735559463501, + "step": 10260 + }, + { + "epoch": 1.4577714691270405, + "grad_norm": 7.980884075164795, + "learning_rate": 9.854293825408091e-05, + "loss": 0.2255629301071167, + "step": 10270 + }, + { + "epoch": 1.4591909155429383, + "grad_norm": 13.038922309875488, + "learning_rate": 9.854151880766502e-05, + "loss": 0.1810195565223694, + "step": 10280 + }, + { + "epoch": 1.460610361958836, + "grad_norm": 6.806441783905029, + "learning_rate": 9.854009936124912e-05, + "loss": 0.20559656620025635, + "step": 10290 + }, + { + "epoch": 1.4620298083747338, + "grad_norm": 1.5737494230270386, + "learning_rate": 9.853867991483323e-05, + "loss": 0.17797669172286987, + "step": 10300 + }, + { + "epoch": 1.4634492547906317, + "grad_norm": 10.547101020812988, + "learning_rate": 9.853726046841732e-05, + "loss": 0.14445135593414307, + "step": 10310 + }, + { + "epoch": 1.4648687012065293, + 
"grad_norm": 7.028156757354736, + "learning_rate": 9.853584102200144e-05, + "loss": 0.19645894765853883, + "step": 10320 + }, + { + "epoch": 1.4662881476224272, + "grad_norm": 8.557269096374512, + "learning_rate": 9.853442157558552e-05, + "loss": 0.14470189809799194, + "step": 10330 + }, + { + "epoch": 1.467707594038325, + "grad_norm": 3.8612992763519287, + "learning_rate": 9.853300212916963e-05, + "loss": 0.18914811611175536, + "step": 10340 + }, + { + "epoch": 1.469127040454223, + "grad_norm": 1.5628553628921509, + "learning_rate": 9.853158268275373e-05, + "loss": 0.15799893140792848, + "step": 10350 + }, + { + "epoch": 1.4705464868701206, + "grad_norm": 1.3893674612045288, + "learning_rate": 9.853016323633784e-05, + "loss": 0.20945143699645996, + "step": 10360 + }, + { + "epoch": 1.4719659332860184, + "grad_norm": 5.654598712921143, + "learning_rate": 9.852874378992194e-05, + "loss": 0.18789818286895751, + "step": 10370 + }, + { + "epoch": 1.4733853797019163, + "grad_norm": 2.126235008239746, + "learning_rate": 9.852732434350603e-05, + "loss": 0.18574261665344238, + "step": 10380 + }, + { + "epoch": 1.474804826117814, + "grad_norm": 6.465456008911133, + "learning_rate": 9.852590489709014e-05, + "loss": 0.2622290849685669, + "step": 10390 + }, + { + "epoch": 1.4762242725337118, + "grad_norm": 0.5080237984657288, + "learning_rate": 9.852448545067424e-05, + "loss": 0.1537003517150879, + "step": 10400 + }, + { + "epoch": 1.4776437189496097, + "grad_norm": 1.72958505153656, + "learning_rate": 9.852306600425835e-05, + "loss": 0.15624310970306396, + "step": 10410 + }, + { + "epoch": 1.4790631653655075, + "grad_norm": 4.848511695861816, + "learning_rate": 9.852164655784245e-05, + "loss": 0.12883809804916382, + "step": 10420 + }, + { + "epoch": 1.4804826117814054, + "grad_norm": 5.730294227600098, + "learning_rate": 9.852022711142655e-05, + "loss": 0.14428837299346925, + "step": 10430 + }, + { + "epoch": 1.481902058197303, + "grad_norm": 4.0559539794921875, + 
"learning_rate": 9.851880766501064e-05, + "loss": 0.1629919409751892, + "step": 10440 + }, + { + "epoch": 1.483321504613201, + "grad_norm": 4.338459014892578, + "learning_rate": 9.851738821859476e-05, + "loss": 0.17030248641967774, + "step": 10450 + }, + { + "epoch": 1.4847409510290985, + "grad_norm": 10.856430053710938, + "learning_rate": 9.851596877217885e-05, + "loss": 0.23294711112976074, + "step": 10460 + }, + { + "epoch": 1.4861603974449964, + "grad_norm": 5.3764729499816895, + "learning_rate": 9.851454932576296e-05, + "loss": 0.1908231258392334, + "step": 10470 + }, + { + "epoch": 1.4875798438608943, + "grad_norm": 7.5525736808776855, + "learning_rate": 9.851312987934706e-05, + "loss": 0.1458095669746399, + "step": 10480 + }, + { + "epoch": 1.4889992902767921, + "grad_norm": 4.017747402191162, + "learning_rate": 9.851171043293116e-05, + "loss": 0.09822410345077515, + "step": 10490 + }, + { + "epoch": 1.49041873669269, + "grad_norm": 3.671755075454712, + "learning_rate": 9.851029098651527e-05, + "loss": 0.2174128770828247, + "step": 10500 + }, + { + "epoch": 1.49041873669269, + "eval_accuracy": 0.9363514974248108, + "eval_loss": 0.18055449426174164, + "eval_runtime": 32.7495, + "eval_samples_per_second": 480.221, + "eval_steps_per_second": 15.023, + "step": 10500 + }, + { + "epoch": 1.4918381831085876, + "grad_norm": 5.814731597900391, + "learning_rate": 9.850887154009937e-05, + "loss": 0.20221278667449952, + "step": 10510 + }, + { + "epoch": 1.4932576295244855, + "grad_norm": 4.894477367401123, + "learning_rate": 9.850745209368348e-05, + "loss": 0.1364034056663513, + "step": 10520 + }, + { + "epoch": 1.4946770759403831, + "grad_norm": 9.05544662475586, + "learning_rate": 9.850603264726756e-05, + "loss": 0.2525052785873413, + "step": 10530 + }, + { + "epoch": 1.496096522356281, + "grad_norm": 4.482929706573486, + "learning_rate": 9.850461320085167e-05, + "loss": 0.16218397617340088, + "step": 10540 + }, + { + "epoch": 1.4975159687721789, + "grad_norm": 
6.634395599365234, + "learning_rate": 9.850319375443577e-05, + "loss": 0.14512306451797485, + "step": 10550 + }, + { + "epoch": 1.4989354151880767, + "grad_norm": 8.131645202636719, + "learning_rate": 9.850177430801988e-05, + "loss": 0.1850733518600464, + "step": 10560 + }, + { + "epoch": 1.5003548616039746, + "grad_norm": 7.16902494430542, + "learning_rate": 9.850035486160398e-05, + "loss": 0.232697057723999, + "step": 10570 + }, + { + "epoch": 1.5017743080198722, + "grad_norm": 9.409531593322754, + "learning_rate": 9.849893541518808e-05, + "loss": 0.13974694013595582, + "step": 10580 + }, + { + "epoch": 1.50319375443577, + "grad_norm": 6.473144054412842, + "learning_rate": 9.849751596877219e-05, + "loss": 0.1807733178138733, + "step": 10590 + }, + { + "epoch": 1.5046132008516677, + "grad_norm": 2.1681149005889893, + "learning_rate": 9.849609652235628e-05, + "loss": 0.12265112400054931, + "step": 10600 + }, + { + "epoch": 1.5060326472675656, + "grad_norm": 5.138197898864746, + "learning_rate": 9.84946770759404e-05, + "loss": 0.14840331077575683, + "step": 10610 + }, + { + "epoch": 1.5074520936834634, + "grad_norm": 7.284664630889893, + "learning_rate": 9.849325762952449e-05, + "loss": 0.14850282669067383, + "step": 10620 + }, + { + "epoch": 1.5088715400993613, + "grad_norm": 3.7971346378326416, + "learning_rate": 9.84918381831086e-05, + "loss": 0.1547774314880371, + "step": 10630 + }, + { + "epoch": 1.5102909865152592, + "grad_norm": 6.039275169372559, + "learning_rate": 9.849041873669269e-05, + "loss": 0.197337806224823, + "step": 10640 + }, + { + "epoch": 1.5117104329311568, + "grad_norm": 3.9703164100646973, + "learning_rate": 9.84889992902768e-05, + "loss": 0.2073758363723755, + "step": 10650 + }, + { + "epoch": 1.5131298793470547, + "grad_norm": 9.968624114990234, + "learning_rate": 9.84875798438609e-05, + "loss": 0.1673255443572998, + "step": 10660 + }, + { + "epoch": 1.5145493257629523, + "grad_norm": 5.294106483459473, + "learning_rate": 
9.8486160397445e-05, + "loss": 0.1461545467376709, + "step": 10670 + }, + { + "epoch": 1.5159687721788502, + "grad_norm": 10.589927673339844, + "learning_rate": 9.84847409510291e-05, + "loss": 0.1678829312324524, + "step": 10680 + }, + { + "epoch": 1.517388218594748, + "grad_norm": 8.75311279296875, + "learning_rate": 9.84833215046132e-05, + "loss": 0.1493905782699585, + "step": 10690 + }, + { + "epoch": 1.518807665010646, + "grad_norm": 5.052854061126709, + "learning_rate": 9.848190205819731e-05, + "loss": 0.16829880475997924, + "step": 10700 + }, + { + "epoch": 1.5202271114265438, + "grad_norm": 10.165739059448242, + "learning_rate": 9.848048261178141e-05, + "loss": 0.1630192756652832, + "step": 10710 + }, + { + "epoch": 1.5216465578424414, + "grad_norm": 4.576249599456787, + "learning_rate": 9.847906316536552e-05, + "loss": 0.18904685974121094, + "step": 10720 + }, + { + "epoch": 1.5230660042583393, + "grad_norm": 6.297980308532715, + "learning_rate": 9.847764371894962e-05, + "loss": 0.20620598793029785, + "step": 10730 + }, + { + "epoch": 1.524485450674237, + "grad_norm": 6.77498197555542, + "learning_rate": 9.847622427253371e-05, + "loss": 0.16875416040420532, + "step": 10740 + }, + { + "epoch": 1.5259048970901348, + "grad_norm": 3.679386854171753, + "learning_rate": 9.847480482611781e-05, + "loss": 0.17838630676269532, + "step": 10750 + }, + { + "epoch": 1.5273243435060326, + "grad_norm": 9.312896728515625, + "learning_rate": 9.847338537970192e-05, + "loss": 0.21157798767089844, + "step": 10760 + }, + { + "epoch": 1.5287437899219305, + "grad_norm": 7.985523223876953, + "learning_rate": 9.847196593328602e-05, + "loss": 0.18047035932540895, + "step": 10770 + }, + { + "epoch": 1.5301632363378284, + "grad_norm": 6.29368257522583, + "learning_rate": 9.847054648687013e-05, + "loss": 0.1568093180656433, + "step": 10780 + }, + { + "epoch": 1.531582682753726, + "grad_norm": 5.2899322509765625, + "learning_rate": 9.846912704045423e-05, + "loss": 0.14504846334457397, + 
"step": 10790 + }, + { + "epoch": 1.5330021291696239, + "grad_norm": 1.8608068227767944, + "learning_rate": 9.846770759403833e-05, + "loss": 0.10261296033859253, + "step": 10800 + }, + { + "epoch": 1.5344215755855215, + "grad_norm": 7.755560398101807, + "learning_rate": 9.846628814762244e-05, + "loss": 0.20737462043762206, + "step": 10810 + }, + { + "epoch": 1.5358410220014194, + "grad_norm": 5.849984645843506, + "learning_rate": 9.846486870120653e-05, + "loss": 0.13056904077529907, + "step": 10820 + }, + { + "epoch": 1.5372604684173172, + "grad_norm": 12.66482162475586, + "learning_rate": 9.846344925479065e-05, + "loss": 0.18910495042800904, + "step": 10830 + }, + { + "epoch": 1.538679914833215, + "grad_norm": 5.568217754364014, + "learning_rate": 9.846202980837473e-05, + "loss": 0.21616907119750978, + "step": 10840 + }, + { + "epoch": 1.540099361249113, + "grad_norm": 7.100687503814697, + "learning_rate": 9.846061036195884e-05, + "loss": 0.2003716230392456, + "step": 10850 + }, + { + "epoch": 1.5415188076650106, + "grad_norm": 5.5214009284973145, + "learning_rate": 9.845919091554294e-05, + "loss": 0.17750124931335448, + "step": 10860 + }, + { + "epoch": 1.5429382540809085, + "grad_norm": 7.188937664031982, + "learning_rate": 9.845777146912705e-05, + "loss": 0.18738465309143065, + "step": 10870 + }, + { + "epoch": 1.544357700496806, + "grad_norm": 6.263291358947754, + "learning_rate": 9.845635202271115e-05, + "loss": 0.14714010953903198, + "step": 10880 + }, + { + "epoch": 1.545777146912704, + "grad_norm": 1.6037124395370483, + "learning_rate": 9.845493257629524e-05, + "loss": 0.16528385877609253, + "step": 10890 + }, + { + "epoch": 1.5471965933286018, + "grad_norm": 6.341423034667969, + "learning_rate": 9.845351312987935e-05, + "loss": 0.16852269172668458, + "step": 10900 + }, + { + "epoch": 1.5486160397444997, + "grad_norm": 1.0601999759674072, + "learning_rate": 9.845209368346345e-05, + "loss": 0.165651535987854, + "step": 10910 + }, + { + "epoch": 
1.5500354861603975, + "grad_norm": 6.944467544555664, + "learning_rate": 9.845067423704756e-05, + "loss": 0.21995656490325927, + "step": 10920 + }, + { + "epoch": 1.5514549325762954, + "grad_norm": 6.1232380867004395, + "learning_rate": 9.844925479063166e-05, + "loss": 0.23545873165130615, + "step": 10930 + }, + { + "epoch": 1.552874378992193, + "grad_norm": 5.78615665435791, + "learning_rate": 9.844783534421576e-05, + "loss": 0.20628550052642822, + "step": 10940 + }, + { + "epoch": 1.5542938254080907, + "grad_norm": 2.3399593830108643, + "learning_rate": 9.844641589779985e-05, + "loss": 0.1314982771873474, + "step": 10950 + }, + { + "epoch": 1.5557132718239886, + "grad_norm": 8.838848114013672, + "learning_rate": 9.844499645138397e-05, + "loss": 0.17209669351577758, + "step": 10960 + }, + { + "epoch": 1.5571327182398864, + "grad_norm": 6.756653308868408, + "learning_rate": 9.844357700496806e-05, + "loss": 0.2233790397644043, + "step": 10970 + }, + { + "epoch": 1.5585521646557843, + "grad_norm": 3.664095163345337, + "learning_rate": 9.844215755855217e-05, + "loss": 0.14182189702987671, + "step": 10980 + }, + { + "epoch": 1.5599716110716821, + "grad_norm": 6.118113040924072, + "learning_rate": 9.844073811213627e-05, + "loss": 0.1605884075164795, + "step": 10990 + }, + { + "epoch": 1.56139105748758, + "grad_norm": 3.3329458236694336, + "learning_rate": 9.843931866572037e-05, + "loss": 0.15648469924926758, + "step": 11000 + }, + { + "epoch": 1.56139105748758, + "eval_accuracy": 0.9343803649774274, + "eval_loss": 0.18083110451698303, + "eval_runtime": 31.9521, + "eval_samples_per_second": 492.205, + "eval_steps_per_second": 15.398, + "step": 11000 + }, + { + "epoch": 1.5628105039034776, + "grad_norm": 2.8265178203582764, + "learning_rate": 9.843789921930448e-05, + "loss": 0.1055110216140747, + "step": 11010 + }, + { + "epoch": 1.5642299503193753, + "grad_norm": 7.40562105178833, + "learning_rate": 9.843647977288858e-05, + "loss": 0.1931678533554077, + "step": 11020 + 
}, + { + "epoch": 1.5656493967352731, + "grad_norm": 5.846470355987549, + "learning_rate": 9.843506032647269e-05, + "loss": 0.16744234561920165, + "step": 11030 + }, + { + "epoch": 1.567068843151171, + "grad_norm": 10.13637924194336, + "learning_rate": 9.843364088005678e-05, + "loss": 0.16841363906860352, + "step": 11040 + }, + { + "epoch": 1.5684882895670689, + "grad_norm": 8.881434440612793, + "learning_rate": 9.843222143364088e-05, + "loss": 0.11868530511856079, + "step": 11050 + }, + { + "epoch": 1.5699077359829667, + "grad_norm": 3.2120912075042725, + "learning_rate": 9.843080198722498e-05, + "loss": 0.25566916465759276, + "step": 11060 + }, + { + "epoch": 1.5713271823988646, + "grad_norm": 8.856307983398438, + "learning_rate": 9.842938254080909e-05, + "loss": 0.16841399669647217, + "step": 11070 + }, + { + "epoch": 1.5727466288147622, + "grad_norm": 5.458991050720215, + "learning_rate": 9.842796309439319e-05, + "loss": 0.1553714632987976, + "step": 11080 + }, + { + "epoch": 1.5741660752306599, + "grad_norm": 7.29731559753418, + "learning_rate": 9.84265436479773e-05, + "loss": 0.12889499664306642, + "step": 11090 + }, + { + "epoch": 1.5755855216465577, + "grad_norm": 4.352165699005127, + "learning_rate": 9.84251242015614e-05, + "loss": 0.17049648761749267, + "step": 11100 + }, + { + "epoch": 1.5770049680624556, + "grad_norm": 3.659630060195923, + "learning_rate": 9.84237047551455e-05, + "loss": 0.11960989236831665, + "step": 11110 + }, + { + "epoch": 1.5784244144783535, + "grad_norm": 9.198236465454102, + "learning_rate": 9.84222853087296e-05, + "loss": 0.13858609199523925, + "step": 11120 + }, + { + "epoch": 1.5798438608942513, + "grad_norm": 4.7100510597229, + "learning_rate": 9.84208658623137e-05, + "loss": 0.15008503198623657, + "step": 11130 + }, + { + "epoch": 1.5812633073101492, + "grad_norm": 7.331428050994873, + "learning_rate": 9.841944641589781e-05, + "loss": 0.1811345934867859, + "step": 11140 + }, + { + "epoch": 1.5826827537260468, + "grad_norm": 
7.792325019836426, + "learning_rate": 9.84180269694819e-05, + "loss": 0.22963361740112304, + "step": 11150 + }, + { + "epoch": 1.5841022001419447, + "grad_norm": 1.6901665925979614, + "learning_rate": 9.841660752306601e-05, + "loss": 0.12061529159545899, + "step": 11160 + }, + { + "epoch": 1.5855216465578423, + "grad_norm": 6.294560432434082, + "learning_rate": 9.84151880766501e-05, + "loss": 0.1813538670539856, + "step": 11170 + }, + { + "epoch": 1.5869410929737402, + "grad_norm": 5.661618232727051, + "learning_rate": 9.841376863023422e-05, + "loss": 0.13598719835281373, + "step": 11180 + }, + { + "epoch": 1.588360539389638, + "grad_norm": 4.586926460266113, + "learning_rate": 9.841234918381831e-05, + "loss": 0.151306414604187, + "step": 11190 + }, + { + "epoch": 1.589779985805536, + "grad_norm": 3.2611052989959717, + "learning_rate": 9.841092973740241e-05, + "loss": 0.202089524269104, + "step": 11200 + }, + { + "epoch": 1.5911994322214338, + "grad_norm": 5.5583109855651855, + "learning_rate": 9.840951029098652e-05, + "loss": 0.13323140144348145, + "step": 11210 + }, + { + "epoch": 1.5926188786373314, + "grad_norm": 2.7712435722351074, + "learning_rate": 9.840809084457062e-05, + "loss": 0.2039250135421753, + "step": 11220 + }, + { + "epoch": 1.5940383250532293, + "grad_norm": 5.573919773101807, + "learning_rate": 9.840667139815473e-05, + "loss": 0.22665846347808838, + "step": 11230 + }, + { + "epoch": 1.595457771469127, + "grad_norm": 4.785495758056641, + "learning_rate": 9.840525195173883e-05, + "loss": 0.13016164302825928, + "step": 11240 + }, + { + "epoch": 1.5968772178850248, + "grad_norm": 5.181567668914795, + "learning_rate": 9.840383250532292e-05, + "loss": 0.1920285105705261, + "step": 11250 + }, + { + "epoch": 1.5982966643009227, + "grad_norm": 6.854187488555908, + "learning_rate": 9.840255500354862e-05, + "loss": 0.17289340496063232, + "step": 11260 + }, + { + "epoch": 1.5997161107168205, + "grad_norm": 5.818141937255859, + "learning_rate": 
9.840113555713272e-05, + "loss": 0.1366284132003784, + "step": 11270 + }, + { + "epoch": 1.6011355571327184, + "grad_norm": 5.610560417175293, + "learning_rate": 9.839971611071682e-05, + "loss": 0.15053837299346923, + "step": 11280 + }, + { + "epoch": 1.602555003548616, + "grad_norm": 3.7539663314819336, + "learning_rate": 9.839829666430093e-05, + "loss": 0.14345501661300658, + "step": 11290 + }, + { + "epoch": 1.6039744499645139, + "grad_norm": 7.876579284667969, + "learning_rate": 9.839687721788503e-05, + "loss": 0.13623604774475098, + "step": 11300 + }, + { + "epoch": 1.6053938963804115, + "grad_norm": 7.193563461303711, + "learning_rate": 9.839545777146914e-05, + "loss": 0.21021018028259278, + "step": 11310 + }, + { + "epoch": 1.6068133427963094, + "grad_norm": 3.236804485321045, + "learning_rate": 9.839403832505323e-05, + "loss": 0.1547287106513977, + "step": 11320 + }, + { + "epoch": 1.6082327892122072, + "grad_norm": 5.831701278686523, + "learning_rate": 9.839261887863733e-05, + "loss": 0.2037062644958496, + "step": 11330 + }, + { + "epoch": 1.609652235628105, + "grad_norm": 11.167473793029785, + "learning_rate": 9.839119943222144e-05, + "loss": 0.23104898929595946, + "step": 11340 + }, + { + "epoch": 1.611071682044003, + "grad_norm": 8.400900840759277, + "learning_rate": 9.838977998580554e-05, + "loss": 0.18747899532318116, + "step": 11350 + }, + { + "epoch": 1.6124911284599006, + "grad_norm": 5.5414042472839355, + "learning_rate": 9.838836053938965e-05, + "loss": 0.20507404804229737, + "step": 11360 + }, + { + "epoch": 1.6139105748757985, + "grad_norm": 5.533061504364014, + "learning_rate": 9.838694109297375e-05, + "loss": 0.17890411615371704, + "step": 11370 + }, + { + "epoch": 1.6153300212916961, + "grad_norm": 2.9510483741760254, + "learning_rate": 9.838552164655785e-05, + "loss": 0.16628677845001222, + "step": 11380 + }, + { + "epoch": 1.616749467707594, + "grad_norm": 5.596954822540283, + "learning_rate": 9.838410220014194e-05, + "loss": 
0.14340368509292603, + "step": 11390 + }, + { + "epoch": 1.6181689141234918, + "grad_norm": 1.025497555732727, + "learning_rate": 9.838268275372605e-05, + "loss": 0.1132912278175354, + "step": 11400 + }, + { + "epoch": 1.6195883605393897, + "grad_norm": 8.293600082397461, + "learning_rate": 9.838126330731015e-05, + "loss": 0.15983034372329713, + "step": 11410 + }, + { + "epoch": 1.6210078069552876, + "grad_norm": 6.942419052124023, + "learning_rate": 9.837984386089426e-05, + "loss": 0.18471511602401733, + "step": 11420 + }, + { + "epoch": 1.6224272533711852, + "grad_norm": 7.051154613494873, + "learning_rate": 9.837842441447836e-05, + "loss": 0.17162368297576905, + "step": 11430 + }, + { + "epoch": 1.623846699787083, + "grad_norm": 4.608026504516602, + "learning_rate": 9.837700496806246e-05, + "loss": 0.17447967529296876, + "step": 11440 + }, + { + "epoch": 1.6252661462029807, + "grad_norm": 2.5280375480651855, + "learning_rate": 9.837558552164657e-05, + "loss": 0.13198750019073485, + "step": 11450 + }, + { + "epoch": 1.6266855926188786, + "grad_norm": 5.921835422515869, + "learning_rate": 9.837416607523067e-05, + "loss": 0.19506406784057617, + "step": 11460 + }, + { + "epoch": 1.6281050390347764, + "grad_norm": 1.4568758010864258, + "learning_rate": 9.837274662881478e-05, + "loss": 0.12564977407455444, + "step": 11470 + }, + { + "epoch": 1.6295244854506743, + "grad_norm": 4.619745254516602, + "learning_rate": 9.837132718239886e-05, + "loss": 0.1366949200630188, + "step": 11480 + }, + { + "epoch": 1.6309439318665722, + "grad_norm": 13.973068237304688, + "learning_rate": 9.836990773598297e-05, + "loss": 0.2520665168762207, + "step": 11490 + }, + { + "epoch": 1.6323633782824698, + "grad_norm": 5.616090297698975, + "learning_rate": 9.836848828956707e-05, + "loss": 0.24036917686462403, + "step": 11500 + }, + { + "epoch": 1.6323633782824698, + "eval_accuracy": 0.938894894131112, + "eval_loss": 0.17282415926456451, + "eval_runtime": 32.6586, + "eval_samples_per_second": 
481.558, + "eval_steps_per_second": 15.065, + "step": 11500 + }, + { + "epoch": 1.6337828246983677, + "grad_norm": 2.5921289920806885, + "learning_rate": 9.836706884315118e-05, + "loss": 0.1288065195083618, + "step": 11510 + }, + { + "epoch": 1.6352022711142653, + "grad_norm": 3.20184326171875, + "learning_rate": 9.836564939673528e-05, + "loss": 0.14583102464675904, + "step": 11520 + }, + { + "epoch": 1.6366217175301632, + "grad_norm": 5.127830505371094, + "learning_rate": 9.836422995031937e-05, + "loss": 0.18197163343429565, + "step": 11530 + }, + { + "epoch": 1.638041163946061, + "grad_norm": 7.125634670257568, + "learning_rate": 9.836281050390349e-05, + "loss": 0.1912643551826477, + "step": 11540 + }, + { + "epoch": 1.639460610361959, + "grad_norm": 2.9785008430480957, + "learning_rate": 9.836139105748758e-05, + "loss": 0.13757799863815307, + "step": 11550 + }, + { + "epoch": 1.6408800567778568, + "grad_norm": 1.8115347623825073, + "learning_rate": 9.83599716110717e-05, + "loss": 0.1510754942893982, + "step": 11560 + }, + { + "epoch": 1.6422995031937544, + "grad_norm": 3.6485488414764404, + "learning_rate": 9.835855216465579e-05, + "loss": 0.17528530359268188, + "step": 11570 + }, + { + "epoch": 1.6437189496096523, + "grad_norm": 5.931766510009766, + "learning_rate": 9.835713271823989e-05, + "loss": 0.20811958312988282, + "step": 11580 + }, + { + "epoch": 1.64513839602555, + "grad_norm": 7.735183238983154, + "learning_rate": 9.835571327182399e-05, + "loss": 0.1395600199699402, + "step": 11590 + }, + { + "epoch": 1.6465578424414478, + "grad_norm": 5.529693603515625, + "learning_rate": 9.83542938254081e-05, + "loss": 0.14511030912399292, + "step": 11600 + }, + { + "epoch": 1.6479772888573456, + "grad_norm": 4.704524993896484, + "learning_rate": 9.83528743789922e-05, + "loss": 0.1279573082923889, + "step": 11610 + }, + { + "epoch": 1.6493967352732435, + "grad_norm": 11.802435874938965, + "learning_rate": 9.83514549325763e-05, + "loss": 0.14364974498748778, + 
"step": 11620 + }, + { + "epoch": 1.6508161816891413, + "grad_norm": 7.839514255523682, + "learning_rate": 9.83500354861604e-05, + "loss": 0.17981865406036376, + "step": 11630 + }, + { + "epoch": 1.652235628105039, + "grad_norm": 6.616874694824219, + "learning_rate": 9.83486160397445e-05, + "loss": 0.2129373550415039, + "step": 11640 + }, + { + "epoch": 1.6536550745209369, + "grad_norm": 2.111496925354004, + "learning_rate": 9.834719659332861e-05, + "loss": 0.21924855709075927, + "step": 11650 + }, + { + "epoch": 1.6550745209368345, + "grad_norm": 10.006966590881348, + "learning_rate": 9.834577714691271e-05, + "loss": 0.17941123247146606, + "step": 11660 + }, + { + "epoch": 1.6564939673527324, + "grad_norm": 5.636976718902588, + "learning_rate": 9.834435770049682e-05, + "loss": 0.166895854473114, + "step": 11670 + }, + { + "epoch": 1.6579134137686302, + "grad_norm": 1.7106539011001587, + "learning_rate": 9.834293825408092e-05, + "loss": 0.16953905820846557, + "step": 11680 + }, + { + "epoch": 1.659332860184528, + "grad_norm": 5.924720764160156, + "learning_rate": 9.834151880766501e-05, + "loss": 0.12511081695556642, + "step": 11690 + }, + { + "epoch": 1.660752306600426, + "grad_norm": 8.140963554382324, + "learning_rate": 9.834009936124911e-05, + "loss": 0.15308539867401122, + "step": 11700 + }, + { + "epoch": 1.6621717530163236, + "grad_norm": 2.5716195106506348, + "learning_rate": 9.833867991483322e-05, + "loss": 0.1372369647026062, + "step": 11710 + }, + { + "epoch": 1.6635911994322214, + "grad_norm": 7.952601909637451, + "learning_rate": 9.833726046841732e-05, + "loss": 0.14670779705047607, + "step": 11720 + }, + { + "epoch": 1.665010645848119, + "grad_norm": 1.4507794380187988, + "learning_rate": 9.833584102200143e-05, + "loss": 0.1868760108947754, + "step": 11730 + }, + { + "epoch": 1.666430092264017, + "grad_norm": 7.695814609527588, + "learning_rate": 9.833442157558553e-05, + "loss": 0.24691624641418458, + "step": 11740 + }, + { + "epoch": 
1.6678495386799148, + "grad_norm": 10.15262508392334, + "learning_rate": 9.833300212916962e-05, + "loss": 0.2450582504272461, + "step": 11750 + }, + { + "epoch": 1.6692689850958127, + "grad_norm": 5.300413131713867, + "learning_rate": 9.833158268275374e-05, + "loss": 0.17981985807418824, + "step": 11760 + }, + { + "epoch": 1.6706884315117105, + "grad_norm": 10.736809730529785, + "learning_rate": 9.833016323633783e-05, + "loss": 0.12192434072494507, + "step": 11770 + }, + { + "epoch": 1.6721078779276084, + "grad_norm": 2.6130592823028564, + "learning_rate": 9.832874378992194e-05, + "loss": 0.1472996473312378, + "step": 11780 + }, + { + "epoch": 1.673527324343506, + "grad_norm": 6.176468849182129, + "learning_rate": 9.832732434350603e-05, + "loss": 0.12378195524215699, + "step": 11790 + }, + { + "epoch": 1.6749467707594037, + "grad_norm": 12.4953031539917, + "learning_rate": 9.832590489709014e-05, + "loss": 0.18659558296203613, + "step": 11800 + }, + { + "epoch": 1.6763662171753015, + "grad_norm": 6.664957046508789, + "learning_rate": 9.832448545067424e-05, + "loss": 0.17845855951309203, + "step": 11810 + }, + { + "epoch": 1.6777856635911994, + "grad_norm": 4.767297267913818, + "learning_rate": 9.832306600425835e-05, + "loss": 0.20129690170288086, + "step": 11820 + }, + { + "epoch": 1.6792051100070973, + "grad_norm": 8.662429809570312, + "learning_rate": 9.832164655784244e-05, + "loss": 0.19204812049865722, + "step": 11830 + }, + { + "epoch": 1.6806245564229951, + "grad_norm": 4.443410873413086, + "learning_rate": 9.832022711142654e-05, + "loss": 0.17241191864013672, + "step": 11840 + }, + { + "epoch": 1.682044002838893, + "grad_norm": 6.706130027770996, + "learning_rate": 9.831880766501065e-05, + "loss": 0.14194031953811645, + "step": 11850 + }, + { + "epoch": 1.6834634492547906, + "grad_norm": 4.810044288635254, + "learning_rate": 9.831738821859475e-05, + "loss": 0.1292971134185791, + "step": 11860 + }, + { + "epoch": 1.6848828956706883, + "grad_norm": 
4.945130348205566, + "learning_rate": 9.831596877217886e-05, + "loss": 0.13104760646820068, + "step": 11870 + }, + { + "epoch": 1.6863023420865861, + "grad_norm": 7.412860870361328, + "learning_rate": 9.831454932576296e-05, + "loss": 0.18914194107055665, + "step": 11880 + }, + { + "epoch": 1.687721788502484, + "grad_norm": 1.9591195583343506, + "learning_rate": 9.831312987934706e-05, + "loss": 0.1756757378578186, + "step": 11890 + }, + { + "epoch": 1.6891412349183819, + "grad_norm": 2.857415199279785, + "learning_rate": 9.831171043293115e-05, + "loss": 0.10278797149658203, + "step": 11900 + }, + { + "epoch": 1.6905606813342797, + "grad_norm": 2.342369556427002, + "learning_rate": 9.831029098651526e-05, + "loss": 0.12141529321670533, + "step": 11910 + }, + { + "epoch": 1.6919801277501776, + "grad_norm": 5.84676456451416, + "learning_rate": 9.830887154009936e-05, + "loss": 0.20085587501525878, + "step": 11920 + }, + { + "epoch": 1.6933995741660752, + "grad_norm": 3.6309845447540283, + "learning_rate": 9.830745209368347e-05, + "loss": 0.15413752794265748, + "step": 11930 + }, + { + "epoch": 1.6948190205819729, + "grad_norm": 2.3892900943756104, + "learning_rate": 9.830603264726757e-05, + "loss": 0.15552257299423217, + "step": 11940 + }, + { + "epoch": 1.6962384669978707, + "grad_norm": 0.9857825636863708, + "learning_rate": 9.830461320085167e-05, + "loss": 0.15181114673614501, + "step": 11950 + }, + { + "epoch": 1.6976579134137686, + "grad_norm": 6.49855375289917, + "learning_rate": 9.830319375443578e-05, + "loss": 0.17083282470703126, + "step": 11960 + }, + { + "epoch": 1.6990773598296665, + "grad_norm": 1.0913960933685303, + "learning_rate": 9.830177430801988e-05, + "loss": 0.2133202314376831, + "step": 11970 + }, + { + "epoch": 1.7004968062455643, + "grad_norm": 4.437821388244629, + "learning_rate": 9.830035486160399e-05, + "loss": 0.0879701018333435, + "step": 11980 + }, + { + "epoch": 1.7019162526614622, + "grad_norm": 4.715758800506592, + "learning_rate": 
9.829893541518807e-05, + "loss": 0.15447641611099244, + "step": 11990 + }, + { + "epoch": 1.7033356990773598, + "grad_norm": 8.367589950561523, + "learning_rate": 9.829751596877218e-05, + "loss": 0.17715357542037963, + "step": 12000 + }, + { + "epoch": 1.7033356990773598, + "eval_accuracy": 0.9378775354485916, + "eval_loss": 0.17906926572322845, + "eval_runtime": 33.4925, + "eval_samples_per_second": 469.568, + "eval_steps_per_second": 14.69, + "step": 12000 + }, + { + "epoch": 1.7047551454932577, + "grad_norm": 8.013254165649414, + "learning_rate": 9.829609652235628e-05, + "loss": 0.1866832494735718, + "step": 12010 + }, + { + "epoch": 1.7061745919091553, + "grad_norm": 7.372905731201172, + "learning_rate": 9.829467707594039e-05, + "loss": 0.124139404296875, + "step": 12020 + }, + { + "epoch": 1.7075940383250532, + "grad_norm": 6.6865739822387695, + "learning_rate": 9.829325762952449e-05, + "loss": 0.12705342769622802, + "step": 12030 + }, + { + "epoch": 1.709013484740951, + "grad_norm": 4.504441738128662, + "learning_rate": 9.82918381831086e-05, + "loss": 0.1867109179496765, + "step": 12040 + }, + { + "epoch": 1.710432931156849, + "grad_norm": 1.8893638849258423, + "learning_rate": 9.82904187366927e-05, + "loss": 0.14493658542633056, + "step": 12050 + }, + { + "epoch": 1.7118523775727468, + "grad_norm": 1.72226083278656, + "learning_rate": 9.828899929027679e-05, + "loss": 0.1554844617843628, + "step": 12060 + }, + { + "epoch": 1.7132718239886444, + "grad_norm": 5.362784385681152, + "learning_rate": 9.82875798438609e-05, + "loss": 0.18286285400390626, + "step": 12070 + }, + { + "epoch": 1.7146912704045423, + "grad_norm": 9.535138130187988, + "learning_rate": 9.8286160397445e-05, + "loss": 0.1454553484916687, + "step": 12080 + }, + { + "epoch": 1.71611071682044, + "grad_norm": 5.757817268371582, + "learning_rate": 9.828474095102911e-05, + "loss": 0.14671599864959717, + "step": 12090 + }, + { + "epoch": 1.7175301632363378, + "grad_norm": 5.000237464904785, + 
"learning_rate": 9.82833215046132e-05, + "loss": 0.21178703308105468, + "step": 12100 + }, + { + "epoch": 1.7189496096522356, + "grad_norm": 5.827192306518555, + "learning_rate": 9.82819020581973e-05, + "loss": 0.21477718353271485, + "step": 12110 + }, + { + "epoch": 1.7203690560681335, + "grad_norm": 3.8673248291015625, + "learning_rate": 9.82804826117814e-05, + "loss": 0.2367461919784546, + "step": 12120 + }, + { + "epoch": 1.7217885024840314, + "grad_norm": 4.519773006439209, + "learning_rate": 9.827906316536551e-05, + "loss": 0.12398046255111694, + "step": 12130 + }, + { + "epoch": 1.723207948899929, + "grad_norm": 7.634313583374023, + "learning_rate": 9.827764371894961e-05, + "loss": 0.12134796380996704, + "step": 12140 + }, + { + "epoch": 1.7246273953158269, + "grad_norm": 7.9592766761779785, + "learning_rate": 9.827622427253371e-05, + "loss": 0.18058866262435913, + "step": 12150 + }, + { + "epoch": 1.7260468417317245, + "grad_norm": 6.438409805297852, + "learning_rate": 9.827480482611782e-05, + "loss": 0.17642263174057007, + "step": 12160 + }, + { + "epoch": 1.7274662881476224, + "grad_norm": 5.818785667419434, + "learning_rate": 9.827338537970192e-05, + "loss": 0.13319342136383056, + "step": 12170 + }, + { + "epoch": 1.7288857345635202, + "grad_norm": 5.536925315856934, + "learning_rate": 9.827196593328603e-05, + "loss": 0.13135639429092408, + "step": 12180 + }, + { + "epoch": 1.730305180979418, + "grad_norm": 5.665536403656006, + "learning_rate": 9.827054648687013e-05, + "loss": 0.12864874601364135, + "step": 12190 + }, + { + "epoch": 1.731724627395316, + "grad_norm": 5.198805809020996, + "learning_rate": 9.826912704045422e-05, + "loss": 0.09919618964195251, + "step": 12200 + }, + { + "epoch": 1.7331440738112136, + "grad_norm": 3.8186886310577393, + "learning_rate": 9.826770759403832e-05, + "loss": 0.15075846910476684, + "step": 12210 + }, + { + "epoch": 1.7345635202271115, + "grad_norm": 4.91066837310791, + "learning_rate": 9.826628814762243e-05, + 
"loss": 0.1283166766166687, + "step": 12220 + }, + { + "epoch": 1.735982966643009, + "grad_norm": 4.604067802429199, + "learning_rate": 9.826486870120653e-05, + "loss": 0.1516009211540222, + "step": 12230 + }, + { + "epoch": 1.737402413058907, + "grad_norm": 0.4906020164489746, + "learning_rate": 9.826344925479064e-05, + "loss": 0.1481213688850403, + "step": 12240 + }, + { + "epoch": 1.7388218594748048, + "grad_norm": 2.69415283203125, + "learning_rate": 9.826202980837474e-05, + "loss": 0.14420045614242555, + "step": 12250 + }, + { + "epoch": 1.7402413058907027, + "grad_norm": 10.119294166564941, + "learning_rate": 9.826061036195884e-05, + "loss": 0.1346837282180786, + "step": 12260 + }, + { + "epoch": 1.7416607523066006, + "grad_norm": 5.118008613586426, + "learning_rate": 9.825919091554295e-05, + "loss": 0.10409802198410034, + "step": 12270 + }, + { + "epoch": 1.7430801987224982, + "grad_norm": 9.627950668334961, + "learning_rate": 9.825777146912704e-05, + "loss": 0.12958219051361083, + "step": 12280 + }, + { + "epoch": 1.744499645138396, + "grad_norm": 7.486164093017578, + "learning_rate": 9.825635202271115e-05, + "loss": 0.15439097881317138, + "step": 12290 + }, + { + "epoch": 1.7459190915542937, + "grad_norm": 4.496451377868652, + "learning_rate": 9.825493257629524e-05, + "loss": 0.14370408058166503, + "step": 12300 + }, + { + "epoch": 1.7473385379701916, + "grad_norm": 1.7741354703903198, + "learning_rate": 9.825351312987935e-05, + "loss": 0.14793674945831298, + "step": 12310 + }, + { + "epoch": 1.7487579843860894, + "grad_norm": 6.230805397033691, + "learning_rate": 9.825209368346345e-05, + "loss": 0.12588064670562743, + "step": 12320 + }, + { + "epoch": 1.7501774308019873, + "grad_norm": 7.041757106781006, + "learning_rate": 9.825067423704756e-05, + "loss": 0.2671244144439697, + "step": 12330 + }, + { + "epoch": 1.7515968772178852, + "grad_norm": 8.067173957824707, + "learning_rate": 9.824925479063167e-05, + "loss": 0.18581972122192383, + "step": 12340 + }, 
+ { + "epoch": 1.7530163236337828, + "grad_norm": 6.106922626495361, + "learning_rate": 9.824783534421575e-05, + "loss": 0.16915748119354249, + "step": 12350 + }, + { + "epoch": 1.7544357700496807, + "grad_norm": 6.7981743812561035, + "learning_rate": 9.824641589779986e-05, + "loss": 0.12603729963302612, + "step": 12360 + }, + { + "epoch": 1.7558552164655783, + "grad_norm": 5.5388360023498535, + "learning_rate": 9.824499645138396e-05, + "loss": 0.1549227714538574, + "step": 12370 + }, + { + "epoch": 1.7572746628814762, + "grad_norm": 6.960907459259033, + "learning_rate": 9.824357700496807e-05, + "loss": 0.18172571659088135, + "step": 12380 + }, + { + "epoch": 1.758694109297374, + "grad_norm": 4.753782272338867, + "learning_rate": 9.824215755855217e-05, + "loss": 0.14021997451782225, + "step": 12390 + }, + { + "epoch": 1.7601135557132719, + "grad_norm": 3.4172661304473877, + "learning_rate": 9.824073811213628e-05, + "loss": 0.13940014839172363, + "step": 12400 + }, + { + "epoch": 1.7615330021291697, + "grad_norm": 2.0530076026916504, + "learning_rate": 9.823931866572036e-05, + "loss": 0.16023153066635132, + "step": 12410 + }, + { + "epoch": 1.7629524485450674, + "grad_norm": 9.870774269104004, + "learning_rate": 9.823789921930447e-05, + "loss": 0.1769045352935791, + "step": 12420 + }, + { + "epoch": 1.7643718949609652, + "grad_norm": 2.381181001663208, + "learning_rate": 9.823647977288859e-05, + "loss": 0.10290155410766602, + "step": 12430 + }, + { + "epoch": 1.7657913413768629, + "grad_norm": 0.6588567495346069, + "learning_rate": 9.823506032647268e-05, + "loss": 0.07668265104293823, + "step": 12440 + }, + { + "epoch": 1.7672107877927608, + "grad_norm": 8.259925842285156, + "learning_rate": 9.82336408800568e-05, + "loss": 0.10816916227340698, + "step": 12450 + }, + { + "epoch": 1.7686302342086586, + "grad_norm": 10.110259056091309, + "learning_rate": 9.823222143364088e-05, + "loss": 0.1543756604194641, + "step": 12460 + }, + { + "epoch": 1.7700496806245565, + 
"grad_norm": 1.5917772054672241, + "learning_rate": 9.823080198722499e-05, + "loss": 0.1755792737007141, + "step": 12470 + }, + { + "epoch": 1.7714691270404543, + "grad_norm": 4.567733287811279, + "learning_rate": 9.822938254080909e-05, + "loss": 0.09556171298027039, + "step": 12480 + }, + { + "epoch": 1.772888573456352, + "grad_norm": 4.524011611938477, + "learning_rate": 9.82279630943932e-05, + "loss": 0.11977797746658325, + "step": 12490 + }, + { + "epoch": 1.7743080198722498, + "grad_norm": 3.390681266784668, + "learning_rate": 9.82265436479773e-05, + "loss": 0.20999493598937988, + "step": 12500 + }, + { + "epoch": 1.7743080198722498, + "eval_accuracy": 0.9404845170725504, + "eval_loss": 0.17900405824184418, + "eval_runtime": 33.0963, + "eval_samples_per_second": 475.19, + "eval_steps_per_second": 14.866, + "step": 12500 + }, + { + "epoch": 1.7757274662881475, + "grad_norm": 6.486291885375977, + "learning_rate": 9.822512420156139e-05, + "loss": 0.15412837266921997, + "step": 12510 + }, + { + "epoch": 1.7771469127040453, + "grad_norm": 8.4727201461792, + "learning_rate": 9.82237047551455e-05, + "loss": 0.1553104877471924, + "step": 12520 + }, + { + "epoch": 1.7785663591199432, + "grad_norm": 7.080015182495117, + "learning_rate": 9.82222853087296e-05, + "loss": 0.17961130142211915, + "step": 12530 + }, + { + "epoch": 1.779985805535841, + "grad_norm": 3.5858380794525146, + "learning_rate": 9.822086586231371e-05, + "loss": 0.16834441423416138, + "step": 12540 + }, + { + "epoch": 1.781405251951739, + "grad_norm": 1.947180986404419, + "learning_rate": 9.821944641589781e-05, + "loss": 0.16140348911285402, + "step": 12550 + }, + { + "epoch": 1.7828246983676366, + "grad_norm": 4.678013801574707, + "learning_rate": 9.82180269694819e-05, + "loss": 0.17220114469528197, + "step": 12560 + }, + { + "epoch": 1.7842441447835344, + "grad_norm": 1.8858182430267334, + "learning_rate": 9.8216607523066e-05, + "loss": 0.11123390197753906, + "step": 12570 + }, + { + "epoch": 
1.785663591199432, + "grad_norm": 8.490455627441406, + "learning_rate": 9.821518807665011e-05, + "loss": 0.21482553482055664, + "step": 12580 + }, + { + "epoch": 1.78708303761533, + "grad_norm": 6.9470415115356445, + "learning_rate": 9.821376863023421e-05, + "loss": 0.22754549980163574, + "step": 12590 + }, + { + "epoch": 1.7885024840312278, + "grad_norm": 7.122620105743408, + "learning_rate": 9.821234918381832e-05, + "loss": 0.2618594169616699, + "step": 12600 + }, + { + "epoch": 1.7899219304471257, + "grad_norm": 4.771125316619873, + "learning_rate": 9.821092973740242e-05, + "loss": 0.1289076805114746, + "step": 12610 + }, + { + "epoch": 1.7913413768630235, + "grad_norm": 1.8268935680389404, + "learning_rate": 9.820951029098652e-05, + "loss": 0.18204834461212158, + "step": 12620 + }, + { + "epoch": 1.7927608232789212, + "grad_norm": 5.549787521362305, + "learning_rate": 9.820809084457063e-05, + "loss": 0.14632033109664916, + "step": 12630 + }, + { + "epoch": 1.794180269694819, + "grad_norm": 4.965446949005127, + "learning_rate": 9.820667139815473e-05, + "loss": 0.14237403869628906, + "step": 12640 + }, + { + "epoch": 1.7955997161107167, + "grad_norm": 3.6704654693603516, + "learning_rate": 9.820525195173884e-05, + "loss": 0.14324573278427125, + "step": 12650 + }, + { + "epoch": 1.7970191625266145, + "grad_norm": 2.443148612976074, + "learning_rate": 9.820383250532292e-05, + "loss": 0.1546507477760315, + "step": 12660 + }, + { + "epoch": 1.7984386089425124, + "grad_norm": 8.586228370666504, + "learning_rate": 9.820241305890703e-05, + "loss": 0.17691378593444823, + "step": 12670 + }, + { + "epoch": 1.7998580553584103, + "grad_norm": 3.938798666000366, + "learning_rate": 9.820099361249113e-05, + "loss": 0.11685086488723755, + "step": 12680 + }, + { + "epoch": 1.8012775017743081, + "grad_norm": 10.324106216430664, + "learning_rate": 9.819957416607524e-05, + "loss": 0.1108386754989624, + "step": 12690 + }, + { + "epoch": 1.802696948190206, + "grad_norm": 
5.7965087890625, + "learning_rate": 9.819815471965934e-05, + "loss": 0.173872172832489, + "step": 12700 + }, + { + "epoch": 1.8041163946061036, + "grad_norm": 6.263943195343018, + "learning_rate": 9.819673527324343e-05, + "loss": 0.12461161613464355, + "step": 12710 + }, + { + "epoch": 1.8055358410220013, + "grad_norm": 3.52416729927063, + "learning_rate": 9.819531582682754e-05, + "loss": 0.1361951231956482, + "step": 12720 + }, + { + "epoch": 1.8069552874378991, + "grad_norm": 3.2541964054107666, + "learning_rate": 9.819389638041164e-05, + "loss": 0.13711843490600586, + "step": 12730 + }, + { + "epoch": 1.808374733853797, + "grad_norm": 2.708355188369751, + "learning_rate": 9.819247693399575e-05, + "loss": 0.16509486436843873, + "step": 12740 + }, + { + "epoch": 1.8097941802696949, + "grad_norm": 8.279736518859863, + "learning_rate": 9.819105748757985e-05, + "loss": 0.15762121677398683, + "step": 12750 + }, + { + "epoch": 1.8112136266855927, + "grad_norm": 4.580092906951904, + "learning_rate": 9.818963804116396e-05, + "loss": 0.1657193422317505, + "step": 12760 + }, + { + "epoch": 1.8126330731014906, + "grad_norm": 6.182056903839111, + "learning_rate": 9.818821859474805e-05, + "loss": 0.09075002670288086, + "step": 12770 + }, + { + "epoch": 1.8140525195173882, + "grad_norm": 2.8882968425750732, + "learning_rate": 9.818679914833216e-05, + "loss": 0.11564161777496337, + "step": 12780 + }, + { + "epoch": 1.8154719659332859, + "grad_norm": 1.9291869401931763, + "learning_rate": 9.818537970191625e-05, + "loss": 0.1788640022277832, + "step": 12790 + }, + { + "epoch": 1.8168914123491837, + "grad_norm": 1.8585617542266846, + "learning_rate": 9.818396025550036e-05, + "loss": 0.14054034948348998, + "step": 12800 + }, + { + "epoch": 1.8183108587650816, + "grad_norm": 3.6257970333099365, + "learning_rate": 9.818254080908446e-05, + "loss": 0.1330336332321167, + "step": 12810 + }, + { + "epoch": 1.8197303051809794, + "grad_norm": 6.263546943664551, + "learning_rate": 
9.818112136266856e-05, + "loss": 0.1774816632270813, + "step": 12820 + }, + { + "epoch": 1.8211497515968773, + "grad_norm": 10.41680908203125, + "learning_rate": 9.817970191625267e-05, + "loss": 0.1763577938079834, + "step": 12830 + }, + { + "epoch": 1.8225691980127752, + "grad_norm": 9.07449722290039, + "learning_rate": 9.817828246983677e-05, + "loss": 0.1599531054496765, + "step": 12840 + }, + { + "epoch": 1.8239886444286728, + "grad_norm": 7.387566089630127, + "learning_rate": 9.817686302342088e-05, + "loss": 0.14263440370559693, + "step": 12850 + }, + { + "epoch": 1.8254080908445705, + "grad_norm": 5.237459659576416, + "learning_rate": 9.817544357700498e-05, + "loss": 0.22102766036987304, + "step": 12860 + }, + { + "epoch": 1.8268275372604683, + "grad_norm": 2.364966630935669, + "learning_rate": 9.817402413058907e-05, + "loss": 0.10828995704650879, + "step": 12870 + }, + { + "epoch": 1.8282469836763662, + "grad_norm": 4.197632789611816, + "learning_rate": 9.817260468417317e-05, + "loss": 0.11210172176361084, + "step": 12880 + }, + { + "epoch": 1.829666430092264, + "grad_norm": 9.747461318969727, + "learning_rate": 9.817118523775728e-05, + "loss": 0.20235188007354737, + "step": 12890 + }, + { + "epoch": 1.831085876508162, + "grad_norm": 1.4320733547210693, + "learning_rate": 9.816976579134138e-05, + "loss": 0.11145485639572143, + "step": 12900 + }, + { + "epoch": 1.8325053229240598, + "grad_norm": 4.429521560668945, + "learning_rate": 9.816834634492549e-05, + "loss": 0.10955873727798462, + "step": 12910 + }, + { + "epoch": 1.8339247693399574, + "grad_norm": 6.954484462738037, + "learning_rate": 9.816692689850959e-05, + "loss": 0.15254650115966797, + "step": 12920 + }, + { + "epoch": 1.8353442157558553, + "grad_norm": 5.583377361297607, + "learning_rate": 9.816550745209368e-05, + "loss": 0.17690763473510743, + "step": 12930 + }, + { + "epoch": 1.836763662171753, + "grad_norm": 5.169642925262451, + "learning_rate": 9.81640880056778e-05, + "loss": 
0.1680360794067383, + "step": 12940 + }, + { + "epoch": 1.8381831085876508, + "grad_norm": 10.711297988891602, + "learning_rate": 9.816266855926189e-05, + "loss": 0.20626237392425537, + "step": 12950 + }, + { + "epoch": 1.8396025550035486, + "grad_norm": 6.396773338317871, + "learning_rate": 9.8161249112846e-05, + "loss": 0.12390644550323486, + "step": 12960 + }, + { + "epoch": 1.8410220014194465, + "grad_norm": 6.008213996887207, + "learning_rate": 9.815982966643009e-05, + "loss": 0.16526665687561035, + "step": 12970 + }, + { + "epoch": 1.8424414478353444, + "grad_norm": 2.8224973678588867, + "learning_rate": 9.81584102200142e-05, + "loss": 0.15004030466079712, + "step": 12980 + }, + { + "epoch": 1.843860894251242, + "grad_norm": 3.8376224040985107, + "learning_rate": 9.81569907735983e-05, + "loss": 0.12394638061523437, + "step": 12990 + }, + { + "epoch": 1.8452803406671399, + "grad_norm": 4.487581253051758, + "learning_rate": 9.81555713271824e-05, + "loss": 0.12469573020935058, + "step": 13000 + }, + { + "epoch": 1.8452803406671399, + "eval_accuracy": 0.9322184777770712, + "eval_loss": 0.20526456832885742, + "eval_runtime": 32.1483, + "eval_samples_per_second": 489.202, + "eval_steps_per_second": 15.304, + "step": 13000 + }, + { + "epoch": 1.8466997870830375, + "grad_norm": 7.591648101806641, + "learning_rate": 9.81541518807665e-05, + "loss": 0.1824552297592163, + "step": 13010 + }, + { + "epoch": 1.8481192334989354, + "grad_norm": 2.9393680095672607, + "learning_rate": 9.81527324343506e-05, + "loss": 0.18779258728027343, + "step": 13020 + }, + { + "epoch": 1.8495386799148332, + "grad_norm": 4.982316493988037, + "learning_rate": 9.815131298793471e-05, + "loss": 0.1856153726577759, + "step": 13030 + }, + { + "epoch": 1.850958126330731, + "grad_norm": 4.3030242919921875, + "learning_rate": 9.814989354151881e-05, + "loss": 0.13149327039718628, + "step": 13040 + }, + { + "epoch": 1.852377572746629, + "grad_norm": 3.1720340251922607, + "learning_rate": 
9.814847409510292e-05, + "loss": 0.17401224374771118, + "step": 13050 + }, + { + "epoch": 1.8537970191625266, + "grad_norm": 5.330498218536377, + "learning_rate": 9.814705464868702e-05, + "loss": 0.18381781578063966, + "step": 13060 + }, + { + "epoch": 1.8552164655784245, + "grad_norm": 3.171062469482422, + "learning_rate": 9.814563520227113e-05, + "loss": 0.09782277941703796, + "step": 13070 + }, + { + "epoch": 1.856635911994322, + "grad_norm": 3.653743267059326, + "learning_rate": 9.814421575585521e-05, + "loss": 0.14549950361251832, + "step": 13080 + }, + { + "epoch": 1.85805535841022, + "grad_norm": 2.782893180847168, + "learning_rate": 9.814279630943932e-05, + "loss": 0.1609262704849243, + "step": 13090 + }, + { + "epoch": 1.8594748048261178, + "grad_norm": 7.247891426086426, + "learning_rate": 9.814137686302342e-05, + "loss": 0.14557520151138306, + "step": 13100 + }, + { + "epoch": 1.8608942512420157, + "grad_norm": 4.025136947631836, + "learning_rate": 9.813995741660753e-05, + "loss": 0.06900943517684936, + "step": 13110 + }, + { + "epoch": 1.8623136976579135, + "grad_norm": 2.248847007751465, + "learning_rate": 9.813853797019163e-05, + "loss": 0.12486515045166016, + "step": 13120 + }, + { + "epoch": 1.8637331440738112, + "grad_norm": 9.784401893615723, + "learning_rate": 9.813711852377573e-05, + "loss": 0.12270998954772949, + "step": 13130 + }, + { + "epoch": 1.865152590489709, + "grad_norm": 4.735940456390381, + "learning_rate": 9.813569907735984e-05, + "loss": 0.2059864282608032, + "step": 13140 + }, + { + "epoch": 1.8665720369056067, + "grad_norm": 5.477226257324219, + "learning_rate": 9.813427963094394e-05, + "loss": 0.10135586261749267, + "step": 13150 + }, + { + "epoch": 1.8679914833215046, + "grad_norm": 5.485146522521973, + "learning_rate": 9.813286018452805e-05, + "loss": 0.18213980197906493, + "step": 13160 + }, + { + "epoch": 1.8694109297374024, + "grad_norm": 4.844747543334961, + "learning_rate": 9.813144073811214e-05, + "loss": 
0.10833338499069214, + "step": 13170 + }, + { + "epoch": 1.8708303761533003, + "grad_norm": 12.112831115722656, + "learning_rate": 9.813002129169624e-05, + "loss": 0.1866260290145874, + "step": 13180 + }, + { + "epoch": 1.8722498225691981, + "grad_norm": 1.797105073928833, + "learning_rate": 9.812860184528034e-05, + "loss": 0.1560835361480713, + "step": 13190 + }, + { + "epoch": 1.8736692689850958, + "grad_norm": 8.335697174072266, + "learning_rate": 9.812718239886445e-05, + "loss": 0.11914796829223633, + "step": 13200 + }, + { + "epoch": 1.8750887154009936, + "grad_norm": 4.479477405548096, + "learning_rate": 9.812576295244855e-05, + "loss": 0.18317773342132568, + "step": 13210 + }, + { + "epoch": 1.8765081618168913, + "grad_norm": 1.5853248834609985, + "learning_rate": 9.812434350603266e-05, + "loss": 0.09048664569854736, + "step": 13220 + }, + { + "epoch": 1.8779276082327891, + "grad_norm": 4.840945243835449, + "learning_rate": 9.812292405961675e-05, + "loss": 0.13578274250030517, + "step": 13230 + }, + { + "epoch": 1.879347054648687, + "grad_norm": 11.123950958251953, + "learning_rate": 9.812150461320085e-05, + "loss": 0.17634526491165162, + "step": 13240 + }, + { + "epoch": 1.8807665010645849, + "grad_norm": 4.322571754455566, + "learning_rate": 9.812008516678496e-05, + "loss": 0.10883429050445556, + "step": 13250 + }, + { + "epoch": 1.8821859474804827, + "grad_norm": 4.164629936218262, + "learning_rate": 9.811866572036906e-05, + "loss": 0.15946507453918457, + "step": 13260 + }, + { + "epoch": 1.8836053938963804, + "grad_norm": 4.701801300048828, + "learning_rate": 9.811724627395317e-05, + "loss": 0.15585731267929076, + "step": 13270 + }, + { + "epoch": 1.8850248403122782, + "grad_norm": 6.6244916915893555, + "learning_rate": 9.811582682753726e-05, + "loss": 0.1586725354194641, + "step": 13280 + }, + { + "epoch": 1.8864442867281759, + "grad_norm": 5.30622673034668, + "learning_rate": 9.811440738112137e-05, + "loss": 0.16929301023483276, + "step": 13290 + }, + 
{ + "epoch": 1.8878637331440737, + "grad_norm": 7.866292476654053, + "learning_rate": 9.811298793470546e-05, + "loss": 0.1626114845275879, + "step": 13300 + }, + { + "epoch": 1.8892831795599716, + "grad_norm": 3.1928579807281494, + "learning_rate": 9.811156848828957e-05, + "loss": 0.11974685192108155, + "step": 13310 + }, + { + "epoch": 1.8907026259758695, + "grad_norm": 3.165278196334839, + "learning_rate": 9.811014904187367e-05, + "loss": 0.17966209650039672, + "step": 13320 + }, + { + "epoch": 1.8921220723917673, + "grad_norm": 7.965559959411621, + "learning_rate": 9.810872959545777e-05, + "loss": 0.1445131778717041, + "step": 13330 + }, + { + "epoch": 1.893541518807665, + "grad_norm": 7.0571722984313965, + "learning_rate": 9.810745209368347e-05, + "loss": 0.1508271336555481, + "step": 13340 + }, + { + "epoch": 1.8949609652235628, + "grad_norm": 6.5066351890563965, + "learning_rate": 9.810603264726757e-05, + "loss": 0.2136392116546631, + "step": 13350 + }, + { + "epoch": 1.8963804116394605, + "grad_norm": 5.8861517906188965, + "learning_rate": 9.810461320085168e-05, + "loss": 0.11962813138961792, + "step": 13360 + }, + { + "epoch": 1.8977998580553583, + "grad_norm": 12.299768447875977, + "learning_rate": 9.810319375443577e-05, + "loss": 0.14256292581558228, + "step": 13370 + }, + { + "epoch": 1.8992193044712562, + "grad_norm": 10.79692554473877, + "learning_rate": 9.810177430801988e-05, + "loss": 0.16675705909729005, + "step": 13380 + }, + { + "epoch": 1.900638750887154, + "grad_norm": 4.968460559844971, + "learning_rate": 9.810035486160398e-05, + "loss": 0.18271161317825318, + "step": 13390 + }, + { + "epoch": 1.902058197303052, + "grad_norm": 6.083104133605957, + "learning_rate": 9.809893541518809e-05, + "loss": 0.19613151550292968, + "step": 13400 + }, + { + "epoch": 1.9034776437189496, + "grad_norm": 7.929781913757324, + "learning_rate": 9.809751596877218e-05, + "loss": 0.12828643321990968, + "step": 13410 + }, + { + "epoch": 1.9048970901348474, + 
"grad_norm": 10.386966705322266, + "learning_rate": 9.809609652235629e-05, + "loss": 0.1059008240699768, + "step": 13420 + }, + { + "epoch": 1.906316536550745, + "grad_norm": 9.958741188049316, + "learning_rate": 9.809467707594038e-05, + "loss": 0.17238779067993165, + "step": 13430 + }, + { + "epoch": 1.907735982966643, + "grad_norm": 7.629611492156982, + "learning_rate": 9.80932576295245e-05, + "loss": 0.11009730100631714, + "step": 13440 + }, + { + "epoch": 1.9091554293825408, + "grad_norm": 4.110402584075928, + "learning_rate": 9.809183818310859e-05, + "loss": 0.15767955780029297, + "step": 13450 + }, + { + "epoch": 1.9105748757984387, + "grad_norm": 5.907031059265137, + "learning_rate": 9.809041873669269e-05, + "loss": 0.11883927583694458, + "step": 13460 + }, + { + "epoch": 1.9119943222143365, + "grad_norm": 6.367669105529785, + "learning_rate": 9.80889992902768e-05, + "loss": 0.15383024215698243, + "step": 13470 + }, + { + "epoch": 1.9134137686302342, + "grad_norm": 11.253113746643066, + "learning_rate": 9.80875798438609e-05, + "loss": 0.18761264085769652, + "step": 13480 + }, + { + "epoch": 1.914833215046132, + "grad_norm": 8.148927688598633, + "learning_rate": 9.808616039744501e-05, + "loss": 0.1913072109222412, + "step": 13490 + }, + { + "epoch": 1.9162526614620297, + "grad_norm": 5.086034774780273, + "learning_rate": 9.808474095102911e-05, + "loss": 0.1331562876701355, + "step": 13500 + }, + { + "epoch": 1.9162526614620297, + "eval_accuracy": 0.9270680994468112, + "eval_loss": 0.20431001484394073, + "eval_runtime": 33.1047, + "eval_samples_per_second": 475.068, + "eval_steps_per_second": 14.862, + "step": 13500 + }, + { + "epoch": 1.9176721078779275, + "grad_norm": 8.143988609313965, + "learning_rate": 9.80833215046132e-05, + "loss": 0.16751954555511475, + "step": 13510 + }, + { + "epoch": 1.9190915542938254, + "grad_norm": 8.666000366210938, + "learning_rate": 9.80819020581973e-05, + "loss": 0.10578331947326661, + "step": 13520 + }, + { + "epoch": 
1.9205110007097232, + "grad_norm": 2.205212116241455, + "learning_rate": 9.808048261178141e-05, + "loss": 0.16295469999313356, + "step": 13530 + }, + { + "epoch": 1.921930447125621, + "grad_norm": 3.5031938552856445, + "learning_rate": 9.807906316536551e-05, + "loss": 0.19274975061416627, + "step": 13540 + }, + { + "epoch": 1.923349893541519, + "grad_norm": 6.0588884353637695, + "learning_rate": 9.807764371894962e-05, + "loss": 0.1572549819946289, + "step": 13550 + }, + { + "epoch": 1.9247693399574166, + "grad_norm": 5.022733688354492, + "learning_rate": 9.807622427253372e-05, + "loss": 0.1502652645111084, + "step": 13560 + }, + { + "epoch": 1.9261887863733143, + "grad_norm": 6.909353733062744, + "learning_rate": 9.807480482611782e-05, + "loss": 0.19446460008621216, + "step": 13570 + }, + { + "epoch": 1.9276082327892121, + "grad_norm": 4.539268970489502, + "learning_rate": 9.807338537970193e-05, + "loss": 0.1496061086654663, + "step": 13580 + }, + { + "epoch": 1.92902767920511, + "grad_norm": 5.273926258087158, + "learning_rate": 9.807196593328602e-05, + "loss": 0.1780215859413147, + "step": 13590 + }, + { + "epoch": 1.9304471256210078, + "grad_norm": 4.610520362854004, + "learning_rate": 9.807054648687014e-05, + "loss": 0.12462868690490722, + "step": 13600 + }, + { + "epoch": 1.9318665720369057, + "grad_norm": 7.675487041473389, + "learning_rate": 9.806912704045422e-05, + "loss": 0.17334070205688476, + "step": 13610 + }, + { + "epoch": 1.9332860184528036, + "grad_norm": 7.004896640777588, + "learning_rate": 9.806770759403833e-05, + "loss": 0.15332577228546143, + "step": 13620 + }, + { + "epoch": 1.9347054648687012, + "grad_norm": 2.8662800788879395, + "learning_rate": 9.806628814762243e-05, + "loss": 0.12613468170166015, + "step": 13630 + }, + { + "epoch": 1.9361249112845988, + "grad_norm": 3.3417696952819824, + "learning_rate": 9.806486870120654e-05, + "loss": 0.11488528251647949, + "step": 13640 + }, + { + "epoch": 1.9375443577004967, + "grad_norm": 
8.002215385437012, + "learning_rate": 9.806344925479064e-05, + "loss": 0.12292193174362183, + "step": 13650 + }, + { + "epoch": 1.9389638041163946, + "grad_norm": 3.650278091430664, + "learning_rate": 9.806202980837473e-05, + "loss": 0.15752785205841063, + "step": 13660 + }, + { + "epoch": 1.9403832505322924, + "grad_norm": 3.4982657432556152, + "learning_rate": 9.806061036195884e-05, + "loss": 0.13047711849212645, + "step": 13670 + }, + { + "epoch": 1.9418026969481903, + "grad_norm": 7.711712837219238, + "learning_rate": 9.805919091554294e-05, + "loss": 0.144749915599823, + "step": 13680 + }, + { + "epoch": 1.9432221433640882, + "grad_norm": 5.939789772033691, + "learning_rate": 9.805777146912705e-05, + "loss": 0.13807902336120606, + "step": 13690 + }, + { + "epoch": 1.9446415897799858, + "grad_norm": 3.993557929992676, + "learning_rate": 9.805635202271115e-05, + "loss": 0.1018330454826355, + "step": 13700 + }, + { + "epoch": 1.9460610361958834, + "grad_norm": 6.909927845001221, + "learning_rate": 9.805493257629525e-05, + "loss": 0.1758143424987793, + "step": 13710 + }, + { + "epoch": 1.9474804826117813, + "grad_norm": 4.5612993240356445, + "learning_rate": 9.805351312987934e-05, + "loss": 0.13746780157089233, + "step": 13720 + }, + { + "epoch": 1.9488999290276792, + "grad_norm": 0.8813110589981079, + "learning_rate": 9.805209368346346e-05, + "loss": 0.13282377719879152, + "step": 13730 + }, + { + "epoch": 1.950319375443577, + "grad_norm": 4.4625630378723145, + "learning_rate": 9.805067423704755e-05, + "loss": 0.19286319017410278, + "step": 13740 + }, + { + "epoch": 1.951738821859475, + "grad_norm": 6.587796688079834, + "learning_rate": 9.804925479063166e-05, + "loss": 0.1381397008895874, + "step": 13750 + }, + { + "epoch": 1.9531582682753728, + "grad_norm": 7.006091594696045, + "learning_rate": 9.804783534421576e-05, + "loss": 0.10776946544647217, + "step": 13760 + }, + { + "epoch": 1.9545777146912704, + "grad_norm": 6.6057257652282715, + "learning_rate": 
9.804641589779986e-05, + "loss": 0.1551327109336853, + "step": 13770 + }, + { + "epoch": 1.9559971611071683, + "grad_norm": 2.855726480484009, + "learning_rate": 9.804499645138397e-05, + "loss": 0.14515860080718995, + "step": 13780 + }, + { + "epoch": 1.957416607523066, + "grad_norm": 4.859558582305908, + "learning_rate": 9.804357700496807e-05, + "loss": 0.13317285776138305, + "step": 13790 + }, + { + "epoch": 1.9588360539389638, + "grad_norm": 4.010891437530518, + "learning_rate": 9.804215755855218e-05, + "loss": 0.21571955680847169, + "step": 13800 + }, + { + "epoch": 1.9602555003548616, + "grad_norm": 1.5958309173583984, + "learning_rate": 9.804073811213627e-05, + "loss": 0.11179524660110474, + "step": 13810 + }, + { + "epoch": 1.9616749467707595, + "grad_norm": 4.728942394256592, + "learning_rate": 9.803931866572037e-05, + "loss": 0.12224637269973755, + "step": 13820 + }, + { + "epoch": 1.9630943931866573, + "grad_norm": 5.639578342437744, + "learning_rate": 9.803789921930447e-05, + "loss": 0.10692014694213867, + "step": 13830 + }, + { + "epoch": 1.964513839602555, + "grad_norm": 3.7262027263641357, + "learning_rate": 9.803647977288858e-05, + "loss": 0.1023218035697937, + "step": 13840 + }, + { + "epoch": 1.9659332860184529, + "grad_norm": 6.50256872177124, + "learning_rate": 9.803506032647268e-05, + "loss": 0.12723206281661986, + "step": 13850 + }, + { + "epoch": 1.9673527324343505, + "grad_norm": 2.4793450832366943, + "learning_rate": 9.803364088005679e-05, + "loss": 0.18150064945220948, + "step": 13860 + }, + { + "epoch": 1.9687721788502484, + "grad_norm": 8.015069961547852, + "learning_rate": 9.803222143364089e-05, + "loss": 0.13160840272903443, + "step": 13870 + }, + { + "epoch": 1.9701916252661462, + "grad_norm": 2.3164284229278564, + "learning_rate": 9.803080198722498e-05, + "loss": 0.13569587469100952, + "step": 13880 + }, + { + "epoch": 1.971611071682044, + "grad_norm": 5.398233413696289, + "learning_rate": 9.80293825408091e-05, + "loss": 
0.10830456018447876, + "step": 13890 + }, + { + "epoch": 1.973030518097942, + "grad_norm": 4.58472204208374, + "learning_rate": 9.802796309439319e-05, + "loss": 0.12152203321456909, + "step": 13900 + }, + { + "epoch": 1.9744499645138396, + "grad_norm": 3.399158239364624, + "learning_rate": 9.80265436479773e-05, + "loss": 0.09602898955345154, + "step": 13910 + }, + { + "epoch": 1.9758694109297374, + "grad_norm": 5.37898063659668, + "learning_rate": 9.802512420156139e-05, + "loss": 0.16220704317092896, + "step": 13920 + }, + { + "epoch": 1.977288857345635, + "grad_norm": 8.282011985778809, + "learning_rate": 9.80237047551455e-05, + "loss": 0.1817216157913208, + "step": 13930 + }, + { + "epoch": 1.978708303761533, + "grad_norm": 8.454946517944336, + "learning_rate": 9.80222853087296e-05, + "loss": 0.10207384824752808, + "step": 13940 + }, + { + "epoch": 1.9801277501774308, + "grad_norm": 5.604420185089111, + "learning_rate": 9.80208658623137e-05, + "loss": 0.13896651268005372, + "step": 13950 + }, + { + "epoch": 1.9815471965933287, + "grad_norm": 5.782528400421143, + "learning_rate": 9.80194464158978e-05, + "loss": 0.16523996591567994, + "step": 13960 + }, + { + "epoch": 1.9829666430092265, + "grad_norm": 7.257541656494141, + "learning_rate": 9.80180269694819e-05, + "loss": 0.1670131802558899, + "step": 13970 + }, + { + "epoch": 1.9843860894251242, + "grad_norm": 1.4823135137557983, + "learning_rate": 9.801660752306601e-05, + "loss": 0.09150451421737671, + "step": 13980 + }, + { + "epoch": 1.985805535841022, + "grad_norm": 11.689827919006348, + "learning_rate": 9.801518807665011e-05, + "loss": 0.12286759614944458, + "step": 13990 + }, + { + "epoch": 1.9872249822569197, + "grad_norm": 2.379868268966675, + "learning_rate": 9.801376863023422e-05, + "loss": 0.08730307221412659, + "step": 14000 + }, + { + "epoch": 1.9872249822569197, + "eval_accuracy": 0.9489413111210021, + "eval_loss": 0.14637306332588196, + "eval_runtime": 33.0818, + "eval_samples_per_second": 475.397, + 
"eval_steps_per_second": 14.872, + "step": 14000 + }, + { + "epoch": 1.9886444286728175, + "grad_norm": 3.562831163406372, + "learning_rate": 9.801234918381832e-05, + "loss": 0.10573784112930298, + "step": 14010 + }, + { + "epoch": 1.9900638750887154, + "grad_norm": 1.7032339572906494, + "learning_rate": 9.801092973740241e-05, + "loss": 0.1144748330116272, + "step": 14020 + }, + { + "epoch": 1.9914833215046133, + "grad_norm": 9.984017372131348, + "learning_rate": 9.800951029098651e-05, + "loss": 0.2368067979812622, + "step": 14030 + }, + { + "epoch": 1.9929027679205111, + "grad_norm": 4.510107517242432, + "learning_rate": 9.800809084457062e-05, + "loss": 0.11444370746612549, + "step": 14040 + }, + { + "epoch": 1.9943222143364088, + "grad_norm": 2.9397714138031006, + "learning_rate": 9.800667139815472e-05, + "loss": 0.08882022500038148, + "step": 14050 + }, + { + "epoch": 1.9957416607523066, + "grad_norm": 5.492639064788818, + "learning_rate": 9.800525195173883e-05, + "loss": 0.13332669734954833, + "step": 14060 + }, + { + "epoch": 1.9971611071682043, + "grad_norm": 6.94230318069458, + "learning_rate": 9.800383250532293e-05, + "loss": 0.1107181191444397, + "step": 14070 + }, + { + "epoch": 1.9985805535841021, + "grad_norm": 1.4583178758621216, + "learning_rate": 9.800241305890703e-05, + "loss": 0.1853145956993103, + "step": 14080 + }, + { + "epoch": 2.0, + "grad_norm": 3.6740102767944336, + "learning_rate": 9.800099361249114e-05, + "loss": 0.1035921812057495, + "step": 14090 + }, + { + "epoch": 2.001419446415898, + "grad_norm": 7.763698101043701, + "learning_rate": 9.799957416607523e-05, + "loss": 0.11998735666275025, + "step": 14100 + }, + { + "epoch": 2.0028388928317957, + "grad_norm": 9.761672019958496, + "learning_rate": 9.799815471965935e-05, + "loss": 0.130437171459198, + "step": 14110 + }, + { + "epoch": 2.0042583392476936, + "grad_norm": 6.725173473358154, + "learning_rate": 9.799673527324344e-05, + "loss": 0.1438794732093811, + "step": 14120 + }, + { + 
"epoch": 2.005677785663591, + "grad_norm": 2.627002477645874, + "learning_rate": 9.799531582682754e-05, + "loss": 0.13035544157028198, + "step": 14130 + }, + { + "epoch": 2.007097232079489, + "grad_norm": 1.8587443828582764, + "learning_rate": 9.799389638041164e-05, + "loss": 0.10760440826416015, + "step": 14140 + }, + { + "epoch": 2.0085166784953867, + "grad_norm": 5.432860851287842, + "learning_rate": 9.799247693399575e-05, + "loss": 0.08797118067741394, + "step": 14150 + }, + { + "epoch": 2.0099361249112846, + "grad_norm": 8.000253677368164, + "learning_rate": 9.799105748757985e-05, + "loss": 0.13834741115570068, + "step": 14160 + }, + { + "epoch": 2.0113555713271825, + "grad_norm": 4.846225738525391, + "learning_rate": 9.798963804116396e-05, + "loss": 0.1457647442817688, + "step": 14170 + }, + { + "epoch": 2.0127750177430803, + "grad_norm": 11.00196361541748, + "learning_rate": 9.798821859474805e-05, + "loss": 0.1213072657585144, + "step": 14180 + }, + { + "epoch": 2.014194464158978, + "grad_norm": 10.398648262023926, + "learning_rate": 9.798679914833215e-05, + "loss": 0.13774160146713257, + "step": 14190 + }, + { + "epoch": 2.0156139105748756, + "grad_norm": 2.693225145339966, + "learning_rate": 9.798537970191626e-05, + "loss": 0.1436489462852478, + "step": 14200 + }, + { + "epoch": 2.0170333569907735, + "grad_norm": 2.0098676681518555, + "learning_rate": 9.798396025550036e-05, + "loss": 0.11806844472885132, + "step": 14210 + }, + { + "epoch": 2.0184528034066713, + "grad_norm": 3.5687620639801025, + "learning_rate": 9.798254080908447e-05, + "loss": 0.11548566818237305, + "step": 14220 + }, + { + "epoch": 2.019872249822569, + "grad_norm": 4.691004276275635, + "learning_rate": 9.798112136266855e-05, + "loss": 0.11631312370300292, + "step": 14230 + }, + { + "epoch": 2.021291696238467, + "grad_norm": 5.144685745239258, + "learning_rate": 9.797970191625267e-05, + "loss": 0.08912101984024048, + "step": 14240 + }, + { + "epoch": 2.022711142654365, + "grad_norm": 
10.743430137634277, + "learning_rate": 9.797828246983676e-05, + "loss": 0.10923216342926026, + "step": 14250 + }, + { + "epoch": 2.0241305890702628, + "grad_norm": 1.788232445716858, + "learning_rate": 9.797686302342087e-05, + "loss": 0.10165914297103881, + "step": 14260 + }, + { + "epoch": 2.02555003548616, + "grad_norm": 1.6243984699249268, + "learning_rate": 9.797544357700497e-05, + "loss": 0.07863327860832214, + "step": 14270 + }, + { + "epoch": 2.026969481902058, + "grad_norm": 4.447552680969238, + "learning_rate": 9.797402413058907e-05, + "loss": 0.09306793808937072, + "step": 14280 + }, + { + "epoch": 2.028388928317956, + "grad_norm": 6.648647308349609, + "learning_rate": 9.797260468417318e-05, + "loss": 0.13603001832962036, + "step": 14290 + }, + { + "epoch": 2.029808374733854, + "grad_norm": 6.4532952308654785, + "learning_rate": 9.797118523775728e-05, + "loss": 0.13374946117401124, + "step": 14300 + }, + { + "epoch": 2.0312278211497516, + "grad_norm": 3.549644708633423, + "learning_rate": 9.796976579134139e-05, + "loss": 0.11156256198883056, + "step": 14310 + }, + { + "epoch": 2.0326472675656495, + "grad_norm": 5.188971042633057, + "learning_rate": 9.796834634492548e-05, + "loss": 0.1163739800453186, + "step": 14320 + }, + { + "epoch": 2.0340667139815474, + "grad_norm": 2.5170130729675293, + "learning_rate": 9.796692689850958e-05, + "loss": 0.17147536277770997, + "step": 14330 + }, + { + "epoch": 2.035486160397445, + "grad_norm": 1.3498976230621338, + "learning_rate": 9.796550745209368e-05, + "loss": 0.10244355201721192, + "step": 14340 + }, + { + "epoch": 2.0369056068133427, + "grad_norm": 1.6554956436157227, + "learning_rate": 9.796408800567779e-05, + "loss": 0.10223543643951416, + "step": 14350 + }, + { + "epoch": 2.0383250532292405, + "grad_norm": 7.838418006896973, + "learning_rate": 9.796266855926189e-05, + "loss": 0.11812844276428222, + "step": 14360 + }, + { + "epoch": 2.0397444996451384, + "grad_norm": 1.8078879117965698, + "learning_rate": 
9.7961249112846e-05, + "loss": 0.1252034544944763, + "step": 14370 + }, + { + "epoch": 2.0411639460610362, + "grad_norm": 3.4205777645111084, + "learning_rate": 9.79598296664301e-05, + "loss": 0.10178905725479126, + "step": 14380 + }, + { + "epoch": 2.042583392476934, + "grad_norm": 6.722558498382568, + "learning_rate": 9.79584102200142e-05, + "loss": 0.13192167282104492, + "step": 14390 + }, + { + "epoch": 2.044002838892832, + "grad_norm": 3.837047576904297, + "learning_rate": 9.79569907735983e-05, + "loss": 0.12296985387802124, + "step": 14400 + }, + { + "epoch": 2.0454222853087294, + "grad_norm": 2.1457889080047607, + "learning_rate": 9.79555713271824e-05, + "loss": 0.16315003633499145, + "step": 14410 + }, + { + "epoch": 2.0468417317246272, + "grad_norm": 6.29680871963501, + "learning_rate": 9.795415188076651e-05, + "loss": 0.12061352729797363, + "step": 14420 + }, + { + "epoch": 2.048261178140525, + "grad_norm": 6.541940689086914, + "learning_rate": 9.79527324343506e-05, + "loss": 0.2011786699295044, + "step": 14430 + }, + { + "epoch": 2.049680624556423, + "grad_norm": 4.376636505126953, + "learning_rate": 9.795131298793471e-05, + "loss": 0.10220627784729004, + "step": 14440 + }, + { + "epoch": 2.051100070972321, + "grad_norm": 3.3631985187530518, + "learning_rate": 9.79498935415188e-05, + "loss": 0.12176470756530762, + "step": 14450 + }, + { + "epoch": 2.0525195173882187, + "grad_norm": 3.7540066242218018, + "learning_rate": 9.794847409510292e-05, + "loss": 0.13856956958770753, + "step": 14460 + }, + { + "epoch": 2.0539389638041166, + "grad_norm": 4.199720859527588, + "learning_rate": 9.794705464868701e-05, + "loss": 0.11578547954559326, + "step": 14470 + }, + { + "epoch": 2.055358410220014, + "grad_norm": 2.478891134262085, + "learning_rate": 9.794563520227112e-05, + "loss": 0.11448420286178589, + "step": 14480 + }, + { + "epoch": 2.056777856635912, + "grad_norm": 10.809943199157715, + "learning_rate": 9.794421575585522e-05, + "loss": 0.11974853277206421, + 
"step": 14490 + }, + { + "epoch": 2.0581973030518097, + "grad_norm": 3.9403326511383057, + "learning_rate": 9.794279630943932e-05, + "loss": 0.09595261812210083, + "step": 14500 + }, + { + "epoch": 2.0581973030518097, + "eval_accuracy": 0.9520569720862212, + "eval_loss": 0.1421024352312088, + "eval_runtime": 32.3117, + "eval_samples_per_second": 486.728, + "eval_steps_per_second": 15.227, + "step": 14500 + }, + { + "epoch": 2.0596167494677076, + "grad_norm": 9.631017684936523, + "learning_rate": 9.794137686302343e-05, + "loss": 0.15254437923431396, + "step": 14510 + }, + { + "epoch": 2.0610361958836054, + "grad_norm": 4.611459255218506, + "learning_rate": 9.793995741660753e-05, + "loss": 0.09197093248367309, + "step": 14520 + }, + { + "epoch": 2.0624556422995033, + "grad_norm": 5.0104756355285645, + "learning_rate": 9.793853797019164e-05, + "loss": 0.17470468282699586, + "step": 14530 + }, + { + "epoch": 2.063875088715401, + "grad_norm": 6.290011882781982, + "learning_rate": 9.793711852377572e-05, + "loss": 0.13710517883300782, + "step": 14540 + }, + { + "epoch": 2.065294535131299, + "grad_norm": 5.759206771850586, + "learning_rate": 9.793569907735983e-05, + "loss": 0.08785209059715271, + "step": 14550 + }, + { + "epoch": 2.0667139815471964, + "grad_norm": 3.606126308441162, + "learning_rate": 9.793427963094393e-05, + "loss": 0.1606206178665161, + "step": 14560 + }, + { + "epoch": 2.0681334279630943, + "grad_norm": 1.4751636981964111, + "learning_rate": 9.793286018452804e-05, + "loss": 0.09843673706054687, + "step": 14570 + }, + { + "epoch": 2.069552874378992, + "grad_norm": 6.7842864990234375, + "learning_rate": 9.793144073811215e-05, + "loss": 0.12192797660827637, + "step": 14580 + }, + { + "epoch": 2.07097232079489, + "grad_norm": 0.8541110754013062, + "learning_rate": 9.793002129169624e-05, + "loss": 0.16259843111038208, + "step": 14590 + }, + { + "epoch": 2.072391767210788, + "grad_norm": 1.672593116760254, + "learning_rate": 9.792860184528035e-05, + "loss": 
0.09362624883651734, + "step": 14600 + }, + { + "epoch": 2.0738112136266857, + "grad_norm": 1.834715485572815, + "learning_rate": 9.792718239886444e-05, + "loss": 0.09099584221839904, + "step": 14610 + }, + { + "epoch": 2.0752306600425836, + "grad_norm": 2.21016001701355, + "learning_rate": 9.792576295244856e-05, + "loss": 0.12747323513031006, + "step": 14620 + }, + { + "epoch": 2.076650106458481, + "grad_norm": 2.8152081966400146, + "learning_rate": 9.792434350603265e-05, + "loss": 0.08871068954467773, + "step": 14630 + }, + { + "epoch": 2.078069552874379, + "grad_norm": 10.869599342346191, + "learning_rate": 9.792292405961675e-05, + "loss": 0.09311275482177735, + "step": 14640 + }, + { + "epoch": 2.0794889992902768, + "grad_norm": 7.580860614776611, + "learning_rate": 9.792150461320085e-05, + "loss": 0.10084123611450195, + "step": 14650 + }, + { + "epoch": 2.0809084457061746, + "grad_norm": 4.795779228210449, + "learning_rate": 9.792008516678496e-05, + "loss": 0.11776796579360962, + "step": 14660 + }, + { + "epoch": 2.0823278921220725, + "grad_norm": 8.302618980407715, + "learning_rate": 9.791866572036907e-05, + "loss": 0.1491849184036255, + "step": 14670 + }, + { + "epoch": 2.0837473385379703, + "grad_norm": 0.23616167902946472, + "learning_rate": 9.791724627395317e-05, + "loss": 0.09274361729621887, + "step": 14680 + }, + { + "epoch": 2.085166784953868, + "grad_norm": 4.930098056793213, + "learning_rate": 9.791582682753726e-05, + "loss": 0.10362660884857178, + "step": 14690 + }, + { + "epoch": 2.0865862313697656, + "grad_norm": 5.442007064819336, + "learning_rate": 9.791440738112136e-05, + "loss": 0.16730997562408448, + "step": 14700 + }, + { + "epoch": 2.0880056777856635, + "grad_norm": 2.312178134918213, + "learning_rate": 9.791298793470547e-05, + "loss": 0.09510490894317628, + "step": 14710 + }, + { + "epoch": 2.0894251242015613, + "grad_norm": 4.624721527099609, + "learning_rate": 9.791156848828957e-05, + "loss": 0.11144552230834961, + "step": 14720 + }, + 
{ + "epoch": 2.090844570617459, + "grad_norm": 4.009274482727051, + "learning_rate": 9.791014904187368e-05, + "loss": 0.05063519477844238, + "step": 14730 + }, + { + "epoch": 2.092264017033357, + "grad_norm": 3.2653450965881348, + "learning_rate": 9.790872959545776e-05, + "loss": 0.08952829837799073, + "step": 14740 + }, + { + "epoch": 2.093683463449255, + "grad_norm": 5.824209690093994, + "learning_rate": 9.790731014904188e-05, + "loss": 0.15206855535507202, + "step": 14750 + }, + { + "epoch": 2.095102909865153, + "grad_norm": 9.619600296020508, + "learning_rate": 9.790589070262599e-05, + "loss": 0.09403921961784363, + "step": 14760 + }, + { + "epoch": 2.09652235628105, + "grad_norm": 9.709185600280762, + "learning_rate": 9.790447125621008e-05, + "loss": 0.14637627601623535, + "step": 14770 + }, + { + "epoch": 2.097941802696948, + "grad_norm": 5.918253421783447, + "learning_rate": 9.79030518097942e-05, + "loss": 0.1368915319442749, + "step": 14780 + }, + { + "epoch": 2.099361249112846, + "grad_norm": 4.801339626312256, + "learning_rate": 9.790163236337828e-05, + "loss": 0.12445158958435058, + "step": 14790 + }, + { + "epoch": 2.100780695528744, + "grad_norm": 4.204085826873779, + "learning_rate": 9.790021291696239e-05, + "loss": 0.10883952379226684, + "step": 14800 + }, + { + "epoch": 2.1022001419446417, + "grad_norm": 2.81545352935791, + "learning_rate": 9.789879347054649e-05, + "loss": 0.14513410329818727, + "step": 14810 + }, + { + "epoch": 2.1036195883605395, + "grad_norm": 10.400982856750488, + "learning_rate": 9.78973740241306e-05, + "loss": 0.1663369655609131, + "step": 14820 + }, + { + "epoch": 2.1050390347764374, + "grad_norm": 4.7983078956604, + "learning_rate": 9.78959545777147e-05, + "loss": 0.10346471071243286, + "step": 14830 + }, + { + "epoch": 2.106458481192335, + "grad_norm": 6.536756992340088, + "learning_rate": 9.78945351312988e-05, + "loss": 0.12118889093399048, + "step": 14840 + }, + { + "epoch": 2.1078779276082327, + "grad_norm": 
4.13341760635376, + "learning_rate": 9.78931156848829e-05, + "loss": 0.09681417346000672, + "step": 14850 + }, + { + "epoch": 2.1092973740241305, + "grad_norm": 6.235330581665039, + "learning_rate": 9.7891696238467e-05, + "loss": 0.11153937578201294, + "step": 14860 + }, + { + "epoch": 2.1107168204400284, + "grad_norm": 4.928127765655518, + "learning_rate": 9.789027679205111e-05, + "loss": 0.07672246694564819, + "step": 14870 + }, + { + "epoch": 2.1121362668559263, + "grad_norm": 4.837932109832764, + "learning_rate": 9.788885734563521e-05, + "loss": 0.07635858654975891, + "step": 14880 + }, + { + "epoch": 2.113555713271824, + "grad_norm": 7.02380895614624, + "learning_rate": 9.788743789921932e-05, + "loss": 0.07125227451324463, + "step": 14890 + }, + { + "epoch": 2.114975159687722, + "grad_norm": 5.700672149658203, + "learning_rate": 9.78860184528034e-05, + "loss": 0.19001219272613526, + "step": 14900 + }, + { + "epoch": 2.1163946061036194, + "grad_norm": 8.149482727050781, + "learning_rate": 9.788459900638751e-05, + "loss": 0.13992477655410768, + "step": 14910 + }, + { + "epoch": 2.1178140525195173, + "grad_norm": 2.9586234092712402, + "learning_rate": 9.788317955997161e-05, + "loss": 0.12763415575027465, + "step": 14920 + }, + { + "epoch": 2.119233498935415, + "grad_norm": 8.272931098937988, + "learning_rate": 9.788176011355572e-05, + "loss": 0.14072943925857545, + "step": 14930 + }, + { + "epoch": 2.120652945351313, + "grad_norm": 10.288031578063965, + "learning_rate": 9.788034066713982e-05, + "loss": 0.12365868091583251, + "step": 14940 + }, + { + "epoch": 2.122072391767211, + "grad_norm": 3.203730821609497, + "learning_rate": 9.787892122072392e-05, + "loss": 0.16196365356445314, + "step": 14950 + }, + { + "epoch": 2.1234918381831087, + "grad_norm": 1.575235366821289, + "learning_rate": 9.787750177430803e-05, + "loss": 0.10702955722808838, + "step": 14960 + }, + { + "epoch": 2.1249112845990066, + "grad_norm": 3.2818377017974854, + "learning_rate": 
9.787608232789213e-05, + "loss": 0.109703528881073, + "step": 14970 + }, + { + "epoch": 2.126330731014904, + "grad_norm": 2.6222288608551025, + "learning_rate": 9.787466288147624e-05, + "loss": 0.13249775171279907, + "step": 14980 + }, + { + "epoch": 2.127750177430802, + "grad_norm": 2.1232478618621826, + "learning_rate": 9.787324343506033e-05, + "loss": 0.07887126207351684, + "step": 14990 + }, + { + "epoch": 2.1291696238466997, + "grad_norm": 2.6810293197631836, + "learning_rate": 9.787182398864443e-05, + "loss": 0.07232893705368042, + "step": 15000 + }, + { + "epoch": 2.1291696238466997, + "eval_accuracy": 0.9323456476123864, + "eval_loss": 0.19697453081607819, + "eval_runtime": 33.1486, + "eval_samples_per_second": 474.44, + "eval_steps_per_second": 14.842, + "step": 15000 + }, + { + "epoch": 2.1305890702625976, + "grad_norm": 1.1261463165283203, + "learning_rate": 9.787040454222853e-05, + "loss": 0.15110697746276855, + "step": 15010 + }, + { + "epoch": 2.1320085166784954, + "grad_norm": 7.047489166259766, + "learning_rate": 9.786898509581264e-05, + "loss": 0.12342967987060546, + "step": 15020 + }, + { + "epoch": 2.1334279630943933, + "grad_norm": 2.4421699047088623, + "learning_rate": 9.786756564939674e-05, + "loss": 0.10898158550262452, + "step": 15030 + }, + { + "epoch": 2.134847409510291, + "grad_norm": 13.27920913696289, + "learning_rate": 9.786614620298085e-05, + "loss": 0.17320735454559327, + "step": 15040 + }, + { + "epoch": 2.1362668559261886, + "grad_norm": 2.1594645977020264, + "learning_rate": 9.786472675656495e-05, + "loss": 0.1370407223701477, + "step": 15050 + }, + { + "epoch": 2.1376863023420865, + "grad_norm": 3.3465182781219482, + "learning_rate": 9.786330731014904e-05, + "loss": 0.0927284300327301, + "step": 15060 + }, + { + "epoch": 2.1391057487579843, + "grad_norm": 4.845798015594482, + "learning_rate": 9.786188786373315e-05, + "loss": 0.09592834115028381, + "step": 15070 + }, + { + "epoch": 2.140525195173882, + "grad_norm": 
5.797274112701416, + "learning_rate": 9.786046841731725e-05, + "loss": 0.09021830558776855, + "step": 15080 + }, + { + "epoch": 2.14194464158978, + "grad_norm": 6.726304054260254, + "learning_rate": 9.785904897090136e-05, + "loss": 0.08812606334686279, + "step": 15090 + }, + { + "epoch": 2.143364088005678, + "grad_norm": 11.3377046585083, + "learning_rate": 9.785762952448545e-05, + "loss": 0.17364519834518433, + "step": 15100 + }, + { + "epoch": 2.1447835344215758, + "grad_norm": 3.504915237426758, + "learning_rate": 9.785621007806956e-05, + "loss": 0.11160609722137452, + "step": 15110 + }, + { + "epoch": 2.146202980837473, + "grad_norm": 8.797595024108887, + "learning_rate": 9.785479063165365e-05, + "loss": 0.19877324104309083, + "step": 15120 + }, + { + "epoch": 2.147622427253371, + "grad_norm": 3.8671157360076904, + "learning_rate": 9.785337118523777e-05, + "loss": 0.1070638656616211, + "step": 15130 + }, + { + "epoch": 2.149041873669269, + "grad_norm": 1.9480023384094238, + "learning_rate": 9.785195173882186e-05, + "loss": 0.08838028907775879, + "step": 15140 + }, + { + "epoch": 2.1504613200851668, + "grad_norm": 0.8382003903388977, + "learning_rate": 9.785053229240596e-05, + "loss": 0.13476892709732055, + "step": 15150 + }, + { + "epoch": 2.1518807665010646, + "grad_norm": 1.5311458110809326, + "learning_rate": 9.784911284599007e-05, + "loss": 0.1371008038520813, + "step": 15160 + }, + { + "epoch": 2.1533002129169625, + "grad_norm": 4.248318672180176, + "learning_rate": 9.784769339957417e-05, + "loss": 0.142839252948761, + "step": 15170 + }, + { + "epoch": 2.1547196593328604, + "grad_norm": 5.336694717407227, + "learning_rate": 9.784627395315828e-05, + "loss": 0.15205401182174683, + "step": 15180 + }, + { + "epoch": 2.156139105748758, + "grad_norm": 1.6950732469558716, + "learning_rate": 9.784485450674238e-05, + "loss": 0.09157877564430236, + "step": 15190 + }, + { + "epoch": 2.1575585521646556, + "grad_norm": 0.8742321133613586, + "learning_rate": 
9.784343506032649e-05, + "loss": 0.07795000672340394, + "step": 15200 + }, + { + "epoch": 2.1589779985805535, + "grad_norm": 9.622370719909668, + "learning_rate": 9.784201561391057e-05, + "loss": 0.12661195993423463, + "step": 15210 + }, + { + "epoch": 2.1603974449964514, + "grad_norm": 2.450603723526001, + "learning_rate": 9.784059616749468e-05, + "loss": 0.07968658804893494, + "step": 15220 + }, + { + "epoch": 2.1618168914123492, + "grad_norm": 6.467986583709717, + "learning_rate": 9.783917672107878e-05, + "loss": 0.09993529319763184, + "step": 15230 + }, + { + "epoch": 2.163236337828247, + "grad_norm": 4.023931980133057, + "learning_rate": 9.783775727466289e-05, + "loss": 0.13655495643615723, + "step": 15240 + }, + { + "epoch": 2.164655784244145, + "grad_norm": 6.877175807952881, + "learning_rate": 9.783633782824699e-05, + "loss": 0.11687321662902832, + "step": 15250 + }, + { + "epoch": 2.1660752306600424, + "grad_norm": 6.720952033996582, + "learning_rate": 9.783491838183109e-05, + "loss": 0.1210485816001892, + "step": 15260 + }, + { + "epoch": 2.1674946770759402, + "grad_norm": 3.8507208824157715, + "learning_rate": 9.78334989354152e-05, + "loss": 0.132388699054718, + "step": 15270 + }, + { + "epoch": 2.168914123491838, + "grad_norm": 1.8653970956802368, + "learning_rate": 9.78320794889993e-05, + "loss": 0.08510831594467164, + "step": 15280 + }, + { + "epoch": 2.170333569907736, + "grad_norm": 2.0540809631347656, + "learning_rate": 9.78306600425834e-05, + "loss": 0.07614290714263916, + "step": 15290 + }, + { + "epoch": 2.171753016323634, + "grad_norm": 3.400786876678467, + "learning_rate": 9.78292405961675e-05, + "loss": 0.1373605966567993, + "step": 15300 + }, + { + "epoch": 2.1731724627395317, + "grad_norm": 4.475280284881592, + "learning_rate": 9.78278211497516e-05, + "loss": 0.170183527469635, + "step": 15310 + }, + { + "epoch": 2.1745919091554295, + "grad_norm": 1.2852575778961182, + "learning_rate": 9.78264017033357e-05, + "loss": 0.09261202812194824, + 
"step": 15320 + }, + { + "epoch": 2.176011355571327, + "grad_norm": 2.492828369140625, + "learning_rate": 9.782498225691981e-05, + "loss": 0.1506461977958679, + "step": 15330 + }, + { + "epoch": 2.177430801987225, + "grad_norm": 1.1873884201049805, + "learning_rate": 9.78235628105039e-05, + "loss": 0.1407165050506592, + "step": 15340 + }, + { + "epoch": 2.1788502484031227, + "grad_norm": 6.442225933074951, + "learning_rate": 9.782214336408802e-05, + "loss": 0.10227712392807006, + "step": 15350 + }, + { + "epoch": 2.1802696948190206, + "grad_norm": 4.296558856964111, + "learning_rate": 9.782072391767211e-05, + "loss": 0.1007123589515686, + "step": 15360 + }, + { + "epoch": 2.1816891412349184, + "grad_norm": 5.814218044281006, + "learning_rate": 9.781944641589781e-05, + "loss": 0.19718022346496583, + "step": 15370 + }, + { + "epoch": 2.1831085876508163, + "grad_norm": 4.71889066696167, + "learning_rate": 9.78180269694819e-05, + "loss": 0.18047010898590088, + "step": 15380 + }, + { + "epoch": 2.184528034066714, + "grad_norm": 4.318767070770264, + "learning_rate": 9.7816607523066e-05, + "loss": 0.15934972763061522, + "step": 15390 + }, + { + "epoch": 2.1859474804826116, + "grad_norm": 5.206693172454834, + "learning_rate": 9.78151880766501e-05, + "loss": 0.16389219760894774, + "step": 15400 + }, + { + "epoch": 2.1873669268985094, + "grad_norm": 5.830376148223877, + "learning_rate": 9.781376863023421e-05, + "loss": 0.09744818210601806, + "step": 15410 + }, + { + "epoch": 2.1887863733144073, + "grad_norm": 3.7071948051452637, + "learning_rate": 9.781234918381833e-05, + "loss": 0.06997872591018676, + "step": 15420 + }, + { + "epoch": 2.190205819730305, + "grad_norm": 1.3492387533187866, + "learning_rate": 9.781092973740241e-05, + "loss": 0.12530778646469115, + "step": 15430 + }, + { + "epoch": 2.191625266146203, + "grad_norm": 4.588033199310303, + "learning_rate": 9.780951029098652e-05, + "loss": 0.09968525767326356, + "step": 15440 + }, + { + "epoch": 2.193044712562101, + 
"grad_norm": 7.795054912567139, + "learning_rate": 9.780809084457062e-05, + "loss": 0.14231202602386475, + "step": 15450 + }, + { + "epoch": 2.1944641589779987, + "grad_norm": 3.2043259143829346, + "learning_rate": 9.780667139815473e-05, + "loss": 0.13562475442886351, + "step": 15460 + }, + { + "epoch": 2.195883605393896, + "grad_norm": 4.458872318267822, + "learning_rate": 9.780525195173883e-05, + "loss": 0.12291073799133301, + "step": 15470 + }, + { + "epoch": 2.197303051809794, + "grad_norm": 0.49556025862693787, + "learning_rate": 9.780383250532294e-05, + "loss": 0.030131521821022033, + "step": 15480 + }, + { + "epoch": 2.198722498225692, + "grad_norm": 10.009795188903809, + "learning_rate": 9.780241305890702e-05, + "loss": 0.13439586162567138, + "step": 15490 + }, + { + "epoch": 2.2001419446415897, + "grad_norm": 9.650060653686523, + "learning_rate": 9.780099361249113e-05, + "loss": 0.15003018379211425, + "step": 15500 + }, + { + "epoch": 2.2001419446415897, + "eval_accuracy": 0.9476696127678514, + "eval_loss": 0.15076443552970886, + "eval_runtime": 33.5101, + "eval_samples_per_second": 469.322, + "eval_steps_per_second": 14.682, + "step": 15500 + }, + { + "epoch": 2.2015613910574876, + "grad_norm": 3.4228737354278564, + "learning_rate": 9.779957416607524e-05, + "loss": 0.08043778538703919, + "step": 15510 + }, + { + "epoch": 2.2029808374733855, + "grad_norm": 7.456453800201416, + "learning_rate": 9.779815471965934e-05, + "loss": 0.08067357540130615, + "step": 15520 + }, + { + "epoch": 2.2044002838892833, + "grad_norm": 7.92563533782959, + "learning_rate": 9.779673527324345e-05, + "loss": 0.15267107486724854, + "step": 15530 + }, + { + "epoch": 2.2058197303051807, + "grad_norm": 7.132428169250488, + "learning_rate": 9.779531582682753e-05, + "loss": 0.20551769733428954, + "step": 15540 + }, + { + "epoch": 2.2072391767210786, + "grad_norm": 5.588425636291504, + "learning_rate": 9.779389638041165e-05, + "loss": 0.0594519853591919, + "step": 15550 + }, + { + 
"epoch": 2.2086586231369765, + "grad_norm": 0.8327229619026184, + "learning_rate": 9.779247693399574e-05, + "loss": 0.09828418493270874, + "step": 15560 + }, + { + "epoch": 2.2100780695528743, + "grad_norm": 4.466777324676514, + "learning_rate": 9.779105748757985e-05, + "loss": 0.0886389136314392, + "step": 15570 + }, + { + "epoch": 2.211497515968772, + "grad_norm": 6.381712913513184, + "learning_rate": 9.778963804116395e-05, + "loss": 0.11927787065505982, + "step": 15580 + }, + { + "epoch": 2.21291696238467, + "grad_norm": 6.469443321228027, + "learning_rate": 9.778821859474805e-05, + "loss": 0.17326163053512572, + "step": 15590 + }, + { + "epoch": 2.214336408800568, + "grad_norm": 6.632884502410889, + "learning_rate": 9.778679914833216e-05, + "loss": 0.11724759340286255, + "step": 15600 + }, + { + "epoch": 2.215755855216466, + "grad_norm": 3.7693932056427, + "learning_rate": 9.778537970191626e-05, + "loss": 0.12318531274795533, + "step": 15610 + }, + { + "epoch": 2.217175301632363, + "grad_norm": 11.708182334899902, + "learning_rate": 9.778396025550037e-05, + "loss": 0.1665675401687622, + "step": 15620 + }, + { + "epoch": 2.218594748048261, + "grad_norm": 6.708708763122559, + "learning_rate": 9.778254080908447e-05, + "loss": 0.09552123546600341, + "step": 15630 + }, + { + "epoch": 2.220014194464159, + "grad_norm": 3.537140130996704, + "learning_rate": 9.778112136266856e-05, + "loss": 0.17162953615188598, + "step": 15640 + }, + { + "epoch": 2.221433640880057, + "grad_norm": 3.47255802154541, + "learning_rate": 9.777970191625266e-05, + "loss": 0.11431492567062378, + "step": 15650 + }, + { + "epoch": 2.2228530872959547, + "grad_norm": 2.390170097351074, + "learning_rate": 9.777828246983677e-05, + "loss": 0.1374788761138916, + "step": 15660 + }, + { + "epoch": 2.2242725337118525, + "grad_norm": 8.488000869750977, + "learning_rate": 9.777686302342087e-05, + "loss": 0.075135737657547, + "step": 15670 + }, + { + "epoch": 2.2256919801277504, + "grad_norm": 
5.250071048736572, + "learning_rate": 9.777544357700498e-05, + "loss": 0.15566228628158568, + "step": 15680 + }, + { + "epoch": 2.227111426543648, + "grad_norm": 1.0439021587371826, + "learning_rate": 9.777402413058908e-05, + "loss": 0.08581479787826538, + "step": 15690 + }, + { + "epoch": 2.2285308729595457, + "grad_norm": 5.081490993499756, + "learning_rate": 9.777260468417317e-05, + "loss": 0.0691333532333374, + "step": 15700 + }, + { + "epoch": 2.2299503193754435, + "grad_norm": 4.931427478790283, + "learning_rate": 9.777118523775729e-05, + "loss": 0.08706582188606263, + "step": 15710 + }, + { + "epoch": 2.2313697657913414, + "grad_norm": 2.0620617866516113, + "learning_rate": 9.776976579134138e-05, + "loss": 0.09351248145103455, + "step": 15720 + }, + { + "epoch": 2.2327892122072392, + "grad_norm": 11.9086275100708, + "learning_rate": 9.77683463449255e-05, + "loss": 0.167766273021698, + "step": 15730 + }, + { + "epoch": 2.234208658623137, + "grad_norm": 7.802628993988037, + "learning_rate": 9.776692689850958e-05, + "loss": 0.08956191539764405, + "step": 15740 + }, + { + "epoch": 2.235628105039035, + "grad_norm": 6.4769134521484375, + "learning_rate": 9.776550745209369e-05, + "loss": 0.0949668049812317, + "step": 15750 + }, + { + "epoch": 2.2370475514549324, + "grad_norm": 5.48812198638916, + "learning_rate": 9.776408800567779e-05, + "loss": 0.10781463384628295, + "step": 15760 + }, + { + "epoch": 2.2384669978708303, + "grad_norm": 4.095717430114746, + "learning_rate": 9.77626685592619e-05, + "loss": 0.10710879564285278, + "step": 15770 + }, + { + "epoch": 2.239886444286728, + "grad_norm": 7.886163234710693, + "learning_rate": 9.7761249112846e-05, + "loss": 0.16387512683868408, + "step": 15780 + }, + { + "epoch": 2.241305890702626, + "grad_norm": 5.275144577026367, + "learning_rate": 9.775982966643009e-05, + "loss": 0.13074166774749757, + "step": 15790 + }, + { + "epoch": 2.242725337118524, + "grad_norm": 6.263736248016357, + "learning_rate": 
9.77584102200142e-05, + "loss": 0.1308918595314026, + "step": 15800 + }, + { + "epoch": 2.2441447835344217, + "grad_norm": 8.084881782531738, + "learning_rate": 9.77569907735983e-05, + "loss": 0.15410442352294923, + "step": 15810 + }, + { + "epoch": 2.2455642299503196, + "grad_norm": 5.83068323135376, + "learning_rate": 9.775557132718241e-05, + "loss": 0.15612525939941407, + "step": 15820 + }, + { + "epoch": 2.246983676366217, + "grad_norm": 0.40490075945854187, + "learning_rate": 9.775415188076651e-05, + "loss": 0.05670689940452576, + "step": 15830 + }, + { + "epoch": 2.248403122782115, + "grad_norm": 9.664972305297852, + "learning_rate": 9.775273243435062e-05, + "loss": 0.08322632312774658, + "step": 15840 + }, + { + "epoch": 2.2498225691980127, + "grad_norm": 5.599974632263184, + "learning_rate": 9.77513129879347e-05, + "loss": 0.0942413330078125, + "step": 15850 + }, + { + "epoch": 2.2512420156139106, + "grad_norm": 4.52598762512207, + "learning_rate": 9.774989354151881e-05, + "loss": 0.1192929744720459, + "step": 15860 + }, + { + "epoch": 2.2526614620298084, + "grad_norm": 8.435208320617676, + "learning_rate": 9.774847409510291e-05, + "loss": 0.10933787822723388, + "step": 15870 + }, + { + "epoch": 2.2540809084457063, + "grad_norm": 6.769467353820801, + "learning_rate": 9.774705464868702e-05, + "loss": 0.08203907608985901, + "step": 15880 + }, + { + "epoch": 2.255500354861604, + "grad_norm": 7.499700546264648, + "learning_rate": 9.774563520227112e-05, + "loss": 0.1266704320907593, + "step": 15890 + }, + { + "epoch": 2.2569198012775016, + "grad_norm": 4.320639133453369, + "learning_rate": 9.774421575585522e-05, + "loss": 0.13925156593322754, + "step": 15900 + }, + { + "epoch": 2.2583392476933994, + "grad_norm": 5.2828168869018555, + "learning_rate": 9.774279630943933e-05, + "loss": 0.12411700487136841, + "step": 15910 + }, + { + "epoch": 2.2597586941092973, + "grad_norm": 7.704649448394775, + "learning_rate": 9.774137686302343e-05, + "loss": 
0.17451765537261962, + "step": 15920 + }, + { + "epoch": 2.261178140525195, + "grad_norm": 10.570831298828125, + "learning_rate": 9.773995741660754e-05, + "loss": 0.14159404039382933, + "step": 15930 + }, + { + "epoch": 2.262597586941093, + "grad_norm": 1.9156538248062134, + "learning_rate": 9.773853797019163e-05, + "loss": 0.10246649980545045, + "step": 15940 + }, + { + "epoch": 2.264017033356991, + "grad_norm": 10.271675109863281, + "learning_rate": 9.773711852377573e-05, + "loss": 0.1498422145843506, + "step": 15950 + }, + { + "epoch": 2.2654364797728888, + "grad_norm": 2.2951345443725586, + "learning_rate": 9.773569907735983e-05, + "loss": 0.1181708812713623, + "step": 15960 + }, + { + "epoch": 2.2668559261887866, + "grad_norm": 7.073802471160889, + "learning_rate": 9.773427963094394e-05, + "loss": 0.13307657241821289, + "step": 15970 + }, + { + "epoch": 2.268275372604684, + "grad_norm": 3.94195556640625, + "learning_rate": 9.773286018452804e-05, + "loss": 0.06159374713897705, + "step": 15980 + }, + { + "epoch": 2.269694819020582, + "grad_norm": 0.3510136902332306, + "learning_rate": 9.773144073811215e-05, + "loss": 0.05166938900947571, + "step": 15990 + }, + { + "epoch": 2.2711142654364798, + "grad_norm": 2.7349507808685303, + "learning_rate": 9.773002129169624e-05, + "loss": 0.12719658613204957, + "step": 16000 + }, + { + "epoch": 2.2711142654364798, + "eval_accuracy": 0.9534558402746869, + "eval_loss": 0.12974673509597778, + "eval_runtime": 32.9436, + "eval_samples_per_second": 477.391, + "eval_steps_per_second": 14.935, + "step": 16000 + }, + { + "epoch": 2.2725337118523776, + "grad_norm": 3.8580965995788574, + "learning_rate": 9.772860184528034e-05, + "loss": 0.07134815454483032, + "step": 16010 + }, + { + "epoch": 2.2739531582682755, + "grad_norm": 10.57183837890625, + "learning_rate": 9.772718239886445e-05, + "loss": 0.11877801418304443, + "step": 16020 + }, + { + "epoch": 2.2753726046841733, + "grad_norm": 8.526998519897461, + "learning_rate": 
9.772576295244855e-05, + "loss": 0.11683057546615601, + "step": 16030 + }, + { + "epoch": 2.276792051100071, + "grad_norm": 2.470162868499756, + "learning_rate": 9.772434350603266e-05, + "loss": 0.0911303460597992, + "step": 16040 + }, + { + "epoch": 2.2782114975159686, + "grad_norm": 8.097274780273438, + "learning_rate": 9.772292405961675e-05, + "loss": 0.1780623197555542, + "step": 16050 + }, + { + "epoch": 2.2796309439318665, + "grad_norm": 4.431247234344482, + "learning_rate": 9.772150461320086e-05, + "loss": 0.13148776292800904, + "step": 16060 + }, + { + "epoch": 2.2810503903477644, + "grad_norm": 7.473452568054199, + "learning_rate": 9.772008516678495e-05, + "loss": 0.09967323541641235, + "step": 16070 + }, + { + "epoch": 2.282469836763662, + "grad_norm": 2.283681869506836, + "learning_rate": 9.771866572036906e-05, + "loss": 0.06625600457191468, + "step": 16080 + }, + { + "epoch": 2.28388928317956, + "grad_norm": 0.9107749462127686, + "learning_rate": 9.771724627395316e-05, + "loss": 0.07753741145133972, + "step": 16090 + }, + { + "epoch": 2.285308729595458, + "grad_norm": 2.082306146621704, + "learning_rate": 9.771582682753726e-05, + "loss": 0.06911807656288146, + "step": 16100 + }, + { + "epoch": 2.286728176011356, + "grad_norm": 8.424261093139648, + "learning_rate": 9.771440738112137e-05, + "loss": 0.06900658011436463, + "step": 16110 + }, + { + "epoch": 2.2881476224272532, + "grad_norm": 2.821417808532715, + "learning_rate": 9.771298793470547e-05, + "loss": 0.10042606592178345, + "step": 16120 + }, + { + "epoch": 2.289567068843151, + "grad_norm": 4.486814975738525, + "learning_rate": 9.771156848828958e-05, + "loss": 0.1290997862815857, + "step": 16130 + }, + { + "epoch": 2.290986515259049, + "grad_norm": 8.3433198928833, + "learning_rate": 9.771014904187368e-05, + "loss": 0.14453980922698975, + "step": 16140 + }, + { + "epoch": 2.292405961674947, + "grad_norm": 9.422966003417969, + "learning_rate": 9.770872959545777e-05, + "loss": 0.13661658763885498, + 
"step": 16150 + }, + { + "epoch": 2.2938254080908447, + "grad_norm": 6.411171913146973, + "learning_rate": 9.770731014904187e-05, + "loss": 0.09912009239196777, + "step": 16160 + }, + { + "epoch": 2.2952448545067425, + "grad_norm": 4.763072490692139, + "learning_rate": 9.770589070262598e-05, + "loss": 0.10291681289672852, + "step": 16170 + }, + { + "epoch": 2.2966643009226404, + "grad_norm": 5.987633228302002, + "learning_rate": 9.770447125621008e-05, + "loss": 0.15251626968383789, + "step": 16180 + }, + { + "epoch": 2.298083747338538, + "grad_norm": 10.529451370239258, + "learning_rate": 9.770305180979419e-05, + "loss": 0.17285287380218506, + "step": 16190 + }, + { + "epoch": 2.2995031937544357, + "grad_norm": 1.2355297803878784, + "learning_rate": 9.770163236337829e-05, + "loss": 0.16878000497817994, + "step": 16200 + }, + { + "epoch": 2.3009226401703335, + "grad_norm": 2.409059762954712, + "learning_rate": 9.770021291696238e-05, + "loss": 0.08963816165924073, + "step": 16210 + }, + { + "epoch": 2.3023420865862314, + "grad_norm": 0.2473367154598236, + "learning_rate": 9.76987934705465e-05, + "loss": 0.07898592352867126, + "step": 16220 + }, + { + "epoch": 2.3037615330021293, + "grad_norm": 3.4052321910858154, + "learning_rate": 9.769737402413059e-05, + "loss": 0.13420867919921875, + "step": 16230 + }, + { + "epoch": 2.305180979418027, + "grad_norm": 2.8136518001556396, + "learning_rate": 9.76959545777147e-05, + "loss": 0.08897106051445007, + "step": 16240 + }, + { + "epoch": 2.306600425833925, + "grad_norm": 4.1067094802856445, + "learning_rate": 9.76945351312988e-05, + "loss": 0.11498106718063354, + "step": 16250 + }, + { + "epoch": 2.3080198722498224, + "grad_norm": 3.161066770553589, + "learning_rate": 9.76931156848829e-05, + "loss": 0.12238447666168213, + "step": 16260 + }, + { + "epoch": 2.3094393186657203, + "grad_norm": 8.762333869934082, + "learning_rate": 9.7691696238467e-05, + "loss": 0.07079674601554871, + "step": 16270 + }, + { + "epoch": 
2.310858765081618, + "grad_norm": 2.2034451961517334, + "learning_rate": 9.76902767920511e-05, + "loss": 0.1289450168609619, + "step": 16280 + }, + { + "epoch": 2.312278211497516, + "grad_norm": 3.3836324214935303, + "learning_rate": 9.76888573456352e-05, + "loss": 0.08217411041259766, + "step": 16290 + }, + { + "epoch": 2.313697657913414, + "grad_norm": 2.8655858039855957, + "learning_rate": 9.768743789921932e-05, + "loss": 0.09491733908653259, + "step": 16300 + }, + { + "epoch": 2.3151171043293117, + "grad_norm": 4.423978805541992, + "learning_rate": 9.768601845280341e-05, + "loss": 0.13365116119384765, + "step": 16310 + }, + { + "epoch": 2.3165365507452096, + "grad_norm": 8.303816795349121, + "learning_rate": 9.768459900638751e-05, + "loss": 0.15843117237091064, + "step": 16320 + }, + { + "epoch": 2.317955997161107, + "grad_norm": 0.4200175404548645, + "learning_rate": 9.768317955997162e-05, + "loss": 0.12860283851623536, + "step": 16330 + }, + { + "epoch": 2.319375443577005, + "grad_norm": 0.9817140698432922, + "learning_rate": 9.768176011355572e-05, + "loss": 0.0771494209766388, + "step": 16340 + }, + { + "epoch": 2.3207948899929027, + "grad_norm": 5.904425144195557, + "learning_rate": 9.768034066713983e-05, + "loss": 0.09748343229293824, + "step": 16350 + }, + { + "epoch": 2.3222143364088006, + "grad_norm": 11.307563781738281, + "learning_rate": 9.767892122072391e-05, + "loss": 0.16953353881835936, + "step": 16360 + }, + { + "epoch": 2.3236337828246985, + "grad_norm": 4.251320838928223, + "learning_rate": 9.767750177430802e-05, + "loss": 0.10789685249328614, + "step": 16370 + }, + { + "epoch": 2.3250532292405963, + "grad_norm": 3.149813175201416, + "learning_rate": 9.767608232789212e-05, + "loss": 0.09740127325057983, + "step": 16380 + }, + { + "epoch": 2.326472675656494, + "grad_norm": 9.757298469543457, + "learning_rate": 9.767466288147623e-05, + "loss": 0.16251888275146484, + "step": 16390 + }, + { + "epoch": 2.3278921220723916, + "grad_norm": 
3.574176073074341, + "learning_rate": 9.767324343506033e-05, + "loss": 0.08429834246635437, + "step": 16400 + }, + { + "epoch": 2.3293115684882895, + "grad_norm": 3.4276225566864014, + "learning_rate": 9.767182398864443e-05, + "loss": 0.08981868624687195, + "step": 16410 + }, + { + "epoch": 2.3307310149041873, + "grad_norm": 7.491410732269287, + "learning_rate": 9.767040454222854e-05, + "loss": 0.1534734010696411, + "step": 16420 + }, + { + "epoch": 2.332150461320085, + "grad_norm": 7.178809642791748, + "learning_rate": 9.766898509581264e-05, + "loss": 0.13512442111968995, + "step": 16430 + }, + { + "epoch": 2.333569907735983, + "grad_norm": 7.452297687530518, + "learning_rate": 9.766756564939675e-05, + "loss": 0.15903291702270508, + "step": 16440 + }, + { + "epoch": 2.334989354151881, + "grad_norm": 4.820403575897217, + "learning_rate": 9.766614620298084e-05, + "loss": 0.1309017300605774, + "step": 16450 + }, + { + "epoch": 2.3364088005677788, + "grad_norm": 7.638652801513672, + "learning_rate": 9.766472675656494e-05, + "loss": 0.12363828420639038, + "step": 16460 + }, + { + "epoch": 2.337828246983676, + "grad_norm": 9.250051498413086, + "learning_rate": 9.766330731014904e-05, + "loss": 0.15233538150787354, + "step": 16470 + }, + { + "epoch": 2.339247693399574, + "grad_norm": 4.459556579589844, + "learning_rate": 9.766188786373315e-05, + "loss": 0.16799700260162354, + "step": 16480 + }, + { + "epoch": 2.340667139815472, + "grad_norm": 5.2020955085754395, + "learning_rate": 9.766046841731725e-05, + "loss": 0.0790201485157013, + "step": 16490 + }, + { + "epoch": 2.34208658623137, + "grad_norm": 1.893151879310608, + "learning_rate": 9.765904897090136e-05, + "loss": 0.07257702350616455, + "step": 16500 + }, + { + "epoch": 2.34208658623137, + "eval_accuracy": 0.952883576015769, + "eval_loss": 0.14282935857772827, + "eval_runtime": 32.8134, + "eval_samples_per_second": 479.287, + "eval_steps_per_second": 14.994, + "step": 16500 + }, + { + "epoch": 2.3435060326472676, + 
"grad_norm": 2.2250635623931885, + "learning_rate": 9.765762952448545e-05, + "loss": 0.11261917352676391, + "step": 16510 + }, + { + "epoch": 2.3449254790631655, + "grad_norm": 1.5180538892745972, + "learning_rate": 9.765621007806955e-05, + "loss": 0.08184219598770141, + "step": 16520 + }, + { + "epoch": 2.3463449254790634, + "grad_norm": 3.8298745155334473, + "learning_rate": 9.765479063165366e-05, + "loss": 0.08645458817481995, + "step": 16530 + }, + { + "epoch": 2.347764371894961, + "grad_norm": 3.3084588050842285, + "learning_rate": 9.765337118523776e-05, + "loss": 0.06830872893333435, + "step": 16540 + }, + { + "epoch": 2.3491838183108587, + "grad_norm": 0.7720867395401001, + "learning_rate": 9.765195173882187e-05, + "loss": 0.1192325472831726, + "step": 16550 + }, + { + "epoch": 2.3506032647267565, + "grad_norm": 7.036698341369629, + "learning_rate": 9.765053229240597e-05, + "loss": 0.09893574118614197, + "step": 16560 + }, + { + "epoch": 2.3520227111426544, + "grad_norm": 7.439764499664307, + "learning_rate": 9.764911284599007e-05, + "loss": 0.09484468102455139, + "step": 16570 + }, + { + "epoch": 2.3534421575585522, + "grad_norm": 4.2301435470581055, + "learning_rate": 9.764769339957416e-05, + "loss": 0.11805753707885742, + "step": 16580 + }, + { + "epoch": 2.35486160397445, + "grad_norm": 6.39113712310791, + "learning_rate": 9.764627395315827e-05, + "loss": 0.09725428223609925, + "step": 16590 + }, + { + "epoch": 2.356281050390348, + "grad_norm": 6.1582841873168945, + "learning_rate": 9.764485450674237e-05, + "loss": 0.10667927265167236, + "step": 16600 + }, + { + "epoch": 2.3577004968062454, + "grad_norm": 3.7757277488708496, + "learning_rate": 9.764343506032648e-05, + "loss": 0.12746351957321167, + "step": 16610 + }, + { + "epoch": 2.3591199432221432, + "grad_norm": 5.895532131195068, + "learning_rate": 9.764201561391058e-05, + "loss": 0.13624510765075684, + "step": 16620 + }, + { + "epoch": 2.360539389638041, + "grad_norm": 3.6180717945098877, + 
"learning_rate": 9.764059616749468e-05, + "loss": 0.12134850025177002, + "step": 16630 + }, + { + "epoch": 2.361958836053939, + "grad_norm": 4.084766864776611, + "learning_rate": 9.763917672107879e-05, + "loss": 0.09110198616981506, + "step": 16640 + }, + { + "epoch": 2.363378282469837, + "grad_norm": 7.207777500152588, + "learning_rate": 9.763775727466289e-05, + "loss": 0.0987035095691681, + "step": 16650 + }, + { + "epoch": 2.3647977288857347, + "grad_norm": 7.370236396789551, + "learning_rate": 9.7636337828247e-05, + "loss": 0.15047061443328857, + "step": 16660 + }, + { + "epoch": 2.3662171753016326, + "grad_norm": 7.778202056884766, + "learning_rate": 9.763491838183108e-05, + "loss": 0.16292293071746827, + "step": 16670 + }, + { + "epoch": 2.36763662171753, + "grad_norm": 3.764970541000366, + "learning_rate": 9.763349893541519e-05, + "loss": 0.12445385456085205, + "step": 16680 + }, + { + "epoch": 2.369056068133428, + "grad_norm": 2.9177567958831787, + "learning_rate": 9.763207948899929e-05, + "loss": 0.12629375457763672, + "step": 16690 + }, + { + "epoch": 2.3704755145493257, + "grad_norm": 4.7777099609375, + "learning_rate": 9.76306600425834e-05, + "loss": 0.10483566522598267, + "step": 16700 + }, + { + "epoch": 2.3718949609652236, + "grad_norm": 2.476802349090576, + "learning_rate": 9.76292405961675e-05, + "loss": 0.07830199003219604, + "step": 16710 + }, + { + "epoch": 2.3733144073811214, + "grad_norm": 6.576395034790039, + "learning_rate": 9.76278211497516e-05, + "loss": 0.12722206115722656, + "step": 16720 + }, + { + "epoch": 2.3747338537970193, + "grad_norm": 1.5219242572784424, + "learning_rate": 9.76264017033357e-05, + "loss": 0.08835641741752624, + "step": 16730 + }, + { + "epoch": 2.376153300212917, + "grad_norm": 2.6990671157836914, + "learning_rate": 9.76249822569198e-05, + "loss": 0.10250411033630372, + "step": 16740 + }, + { + "epoch": 2.3775727466288146, + "grad_norm": 4.596541404724121, + "learning_rate": 9.762356281050391e-05, + "loss": 
0.14535219669342042, + "step": 16750 + }, + { + "epoch": 2.3789921930447124, + "grad_norm": 2.865243434906006, + "learning_rate": 9.762214336408801e-05, + "loss": 0.061080020666122434, + "step": 16760 + }, + { + "epoch": 2.3804116394606103, + "grad_norm": 4.850032806396484, + "learning_rate": 9.762072391767211e-05, + "loss": 0.11783115863800049, + "step": 16770 + }, + { + "epoch": 2.381831085876508, + "grad_norm": 1.7372711896896362, + "learning_rate": 9.76193044712562e-05, + "loss": 0.09774195551872253, + "step": 16780 + }, + { + "epoch": 2.383250532292406, + "grad_norm": 7.511697769165039, + "learning_rate": 9.761788502484032e-05, + "loss": 0.1309769868850708, + "step": 16790 + }, + { + "epoch": 2.384669978708304, + "grad_norm": 8.27840805053711, + "learning_rate": 9.761646557842441e-05, + "loss": 0.17970755100250244, + "step": 16800 + }, + { + "epoch": 2.3860894251242017, + "grad_norm": 0.9087435603141785, + "learning_rate": 9.761504613200853e-05, + "loss": 0.07040458917617798, + "step": 16810 + }, + { + "epoch": 2.387508871540099, + "grad_norm": 3.8493130207061768, + "learning_rate": 9.761362668559262e-05, + "loss": 0.11651371717453003, + "step": 16820 + }, + { + "epoch": 2.388928317955997, + "grad_norm": 1.5010507106781006, + "learning_rate": 9.761220723917672e-05, + "loss": 0.08106373548507691, + "step": 16830 + }, + { + "epoch": 2.390347764371895, + "grad_norm": 6.315835475921631, + "learning_rate": 9.761078779276083e-05, + "loss": 0.11155383586883545, + "step": 16840 + }, + { + "epoch": 2.3917672107877928, + "grad_norm": 2.8264517784118652, + "learning_rate": 9.760936834634493e-05, + "loss": 0.12171386480331421, + "step": 16850 + }, + { + "epoch": 2.3931866572036906, + "grad_norm": 2.294635057449341, + "learning_rate": 9.760794889992904e-05, + "loss": 0.1550905466079712, + "step": 16860 + }, + { + "epoch": 2.3946061036195885, + "grad_norm": 4.574626445770264, + "learning_rate": 9.760652945351312e-05, + "loss": 0.11418824195861817, + "step": 16870 + }, + { + 
"epoch": 2.3960255500354863, + "grad_norm": 2.7776918411254883, + "learning_rate": 9.760511000709723e-05, + "loss": 0.12959576845169068, + "step": 16880 + }, + { + "epoch": 2.3974449964513838, + "grad_norm": 3.4543848037719727, + "learning_rate": 9.760369056068133e-05, + "loss": 0.11354950666427613, + "step": 16890 + }, + { + "epoch": 2.3988644428672816, + "grad_norm": 5.274985313415527, + "learning_rate": 9.760227111426544e-05, + "loss": 0.06138370633125305, + "step": 16900 + }, + { + "epoch": 2.4002838892831795, + "grad_norm": 6.934667110443115, + "learning_rate": 9.760085166784955e-05, + "loss": 0.1329074501991272, + "step": 16910 + }, + { + "epoch": 2.4017033356990773, + "grad_norm": 6.645686626434326, + "learning_rate": 9.759943222143365e-05, + "loss": 0.17836753129959107, + "step": 16920 + }, + { + "epoch": 2.403122782114975, + "grad_norm": 6.251645088195801, + "learning_rate": 9.759801277501775e-05, + "loss": 0.0962505280971527, + "step": 16930 + }, + { + "epoch": 2.404542228530873, + "grad_norm": 5.136745452880859, + "learning_rate": 9.759659332860185e-05, + "loss": 0.08273377418518066, + "step": 16940 + }, + { + "epoch": 2.405961674946771, + "grad_norm": 7.956725120544434, + "learning_rate": 9.759517388218596e-05, + "loss": 0.11856834888458252, + "step": 16950 + }, + { + "epoch": 2.4073811213626684, + "grad_norm": 2.631044387817383, + "learning_rate": 9.759375443577005e-05, + "loss": 0.11917568445205688, + "step": 16960 + }, + { + "epoch": 2.408800567778566, + "grad_norm": 5.937511444091797, + "learning_rate": 9.759233498935416e-05, + "loss": 0.07629096508026123, + "step": 16970 + }, + { + "epoch": 2.410220014194464, + "grad_norm": 5.794412612915039, + "learning_rate": 9.759091554293825e-05, + "loss": 0.1741081953048706, + "step": 16980 + }, + { + "epoch": 2.411639460610362, + "grad_norm": 6.313220977783203, + "learning_rate": 9.758949609652236e-05, + "loss": 0.07898733615875245, + "step": 16990 + }, + { + "epoch": 2.41305890702626, + "grad_norm": 
7.137319087982178, + "learning_rate": 9.758807665010647e-05, + "loss": 0.11363914012908935, + "step": 17000 + }, + { + "epoch": 2.41305890702626, + "eval_accuracy": 0.9416926305080435, + "eval_loss": 0.1742754876613617, + "eval_runtime": 31.9943, + "eval_samples_per_second": 491.556, + "eval_steps_per_second": 15.378, + "step": 17000 + }, + { + "epoch": 2.4144783534421577, + "grad_norm": 5.010659217834473, + "learning_rate": 9.758665720369057e-05, + "loss": 0.15786590576171874, + "step": 17010 + }, + { + "epoch": 2.4158977998580555, + "grad_norm": 6.37407112121582, + "learning_rate": 9.758523775727468e-05, + "loss": 0.1406489849090576, + "step": 17020 + }, + { + "epoch": 2.417317246273953, + "grad_norm": 4.527013301849365, + "learning_rate": 9.758381831085876e-05, + "loss": 0.10702930688858033, + "step": 17030 + }, + { + "epoch": 2.418736692689851, + "grad_norm": 2.203209161758423, + "learning_rate": 9.758239886444287e-05, + "loss": 0.21100082397460937, + "step": 17040 + }, + { + "epoch": 2.4201561391057487, + "grad_norm": 2.5778391361236572, + "learning_rate": 9.758097941802697e-05, + "loss": 0.05981506705284119, + "step": 17050 + }, + { + "epoch": 2.4215755855216465, + "grad_norm": 6.347795486450195, + "learning_rate": 9.757955997161108e-05, + "loss": 0.12853623628616334, + "step": 17060 + }, + { + "epoch": 2.4229950319375444, + "grad_norm": 9.994209289550781, + "learning_rate": 9.757814052519518e-05, + "loss": 0.10259546041488647, + "step": 17070 + }, + { + "epoch": 2.4244144783534423, + "grad_norm": 3.367839813232422, + "learning_rate": 9.757672107877928e-05, + "loss": 0.06157753467559814, + "step": 17080 + }, + { + "epoch": 2.42583392476934, + "grad_norm": 3.509408473968506, + "learning_rate": 9.757530163236339e-05, + "loss": 0.08180438876152038, + "step": 17090 + }, + { + "epoch": 2.4272533711852375, + "grad_norm": 4.197175025939941, + "learning_rate": 9.757388218594748e-05, + "loss": 0.14403607845306396, + "step": 17100 + }, + { + "epoch": 
2.4286728176011354, + "grad_norm": 4.370192527770996, + "learning_rate": 9.75724627395316e-05, + "loss": 0.16384668350219728, + "step": 17110 + }, + { + "epoch": 2.4300922640170333, + "grad_norm": 3.144803047180176, + "learning_rate": 9.757104329311569e-05, + "loss": 0.08878316283226013, + "step": 17120 + }, + { + "epoch": 2.431511710432931, + "grad_norm": 4.3488593101501465, + "learning_rate": 9.756962384669979e-05, + "loss": 0.17752463817596437, + "step": 17130 + }, + { + "epoch": 2.432931156848829, + "grad_norm": 9.861291885375977, + "learning_rate": 9.756820440028389e-05, + "loss": 0.10461457967758178, + "step": 17140 + }, + { + "epoch": 2.434350603264727, + "grad_norm": 2.252723217010498, + "learning_rate": 9.7566784953868e-05, + "loss": 0.09538206458091736, + "step": 17150 + }, + { + "epoch": 2.4357700496806247, + "grad_norm": 3.788640022277832, + "learning_rate": 9.75653655074521e-05, + "loss": 0.10890170335769653, + "step": 17160 + }, + { + "epoch": 2.437189496096522, + "grad_norm": 8.450477600097656, + "learning_rate": 9.756394606103621e-05, + "loss": 0.1873611330986023, + "step": 17170 + }, + { + "epoch": 2.43860894251242, + "grad_norm": 6.922235012054443, + "learning_rate": 9.75625266146203e-05, + "loss": 0.13029056787490845, + "step": 17180 + }, + { + "epoch": 2.440028388928318, + "grad_norm": 6.11525821685791, + "learning_rate": 9.75611071682044e-05, + "loss": 0.11692187786102295, + "step": 17190 + }, + { + "epoch": 2.4414478353442157, + "grad_norm": 7.727966785430908, + "learning_rate": 9.755968772178851e-05, + "loss": 0.18141931295394897, + "step": 17200 + }, + { + "epoch": 2.4428672817601136, + "grad_norm": 1.1188493967056274, + "learning_rate": 9.755826827537261e-05, + "loss": 0.14119462966918944, + "step": 17210 + }, + { + "epoch": 2.4442867281760114, + "grad_norm": 8.788047790527344, + "learning_rate": 9.755684882895672e-05, + "loss": 0.11063623428344727, + "step": 17220 + }, + { + "epoch": 2.4457061745919093, + "grad_norm": 4.968696117401123, + 
"learning_rate": 9.75554293825408e-05, + "loss": 0.11871033906936646, + "step": 17230 + }, + { + "epoch": 2.4471256210078067, + "grad_norm": 4.146373271942139, + "learning_rate": 9.755400993612492e-05, + "loss": 0.1038577675819397, + "step": 17240 + }, + { + "epoch": 2.4485450674237046, + "grad_norm": 4.578568458557129, + "learning_rate": 9.755259048970901e-05, + "loss": 0.1644783616065979, + "step": 17250 + }, + { + "epoch": 2.4499645138396025, + "grad_norm": 5.26609992980957, + "learning_rate": 9.755117104329312e-05, + "loss": 0.1413109540939331, + "step": 17260 + }, + { + "epoch": 2.4513839602555003, + "grad_norm": 5.410380840301514, + "learning_rate": 9.754975159687722e-05, + "loss": 0.10622183084487916, + "step": 17270 + }, + { + "epoch": 2.452803406671398, + "grad_norm": 8.643942832946777, + "learning_rate": 9.754833215046133e-05, + "loss": 0.12519901990890503, + "step": 17280 + }, + { + "epoch": 2.454222853087296, + "grad_norm": 5.121556282043457, + "learning_rate": 9.754691270404543e-05, + "loss": 0.1216310977935791, + "step": 17290 + }, + { + "epoch": 2.455642299503194, + "grad_norm": 4.879176139831543, + "learning_rate": 9.754549325762953e-05, + "loss": 0.07838413119316101, + "step": 17300 + }, + { + "epoch": 2.4570617459190913, + "grad_norm": 5.997292518615723, + "learning_rate": 9.754407381121364e-05, + "loss": 0.11862040758132934, + "step": 17310 + }, + { + "epoch": 2.458481192334989, + "grad_norm": 7.370124340057373, + "learning_rate": 9.754265436479774e-05, + "loss": 0.13782591819763185, + "step": 17320 + }, + { + "epoch": 2.459900638750887, + "grad_norm": 3.0784833431243896, + "learning_rate": 9.754123491838185e-05, + "loss": 0.12893285751342773, + "step": 17330 + }, + { + "epoch": 2.461320085166785, + "grad_norm": 4.132889747619629, + "learning_rate": 9.753981547196593e-05, + "loss": 0.1482453465461731, + "step": 17340 + }, + { + "epoch": 2.4627395315826828, + "grad_norm": 4.386025905609131, + "learning_rate": 9.753839602555004e-05, + "loss": 
0.08701491355895996, + "step": 17350 + }, + { + "epoch": 2.4641589779985806, + "grad_norm": 7.536581516265869, + "learning_rate": 9.753697657913414e-05, + "loss": 0.1785440683364868, + "step": 17360 + }, + { + "epoch": 2.4655784244144785, + "grad_norm": 4.566206455230713, + "learning_rate": 9.753555713271825e-05, + "loss": 0.07483741641044617, + "step": 17370 + }, + { + "epoch": 2.466997870830376, + "grad_norm": 4.969336032867432, + "learning_rate": 9.753413768630235e-05, + "loss": 0.09664581418037414, + "step": 17380 + }, + { + "epoch": 2.468417317246274, + "grad_norm": 3.3608598709106445, + "learning_rate": 9.753271823988644e-05, + "loss": 0.08268053531646728, + "step": 17390 + }, + { + "epoch": 2.4698367636621716, + "grad_norm": 7.48677396774292, + "learning_rate": 9.753129879347055e-05, + "loss": 0.08111786842346191, + "step": 17400 + }, + { + "epoch": 2.4712562100780695, + "grad_norm": 2.8628151416778564, + "learning_rate": 9.752987934705465e-05, + "loss": 0.09410454630851746, + "step": 17410 + }, + { + "epoch": 2.4726756564939674, + "grad_norm": 5.564269065856934, + "learning_rate": 9.752845990063876e-05, + "loss": 0.09594557881355285, + "step": 17420 + }, + { + "epoch": 2.4740951029098652, + "grad_norm": 0.6636775135993958, + "learning_rate": 9.752704045422286e-05, + "loss": 0.09588454365730285, + "step": 17430 + }, + { + "epoch": 2.475514549325763, + "grad_norm": 6.354304313659668, + "learning_rate": 9.752562100780696e-05, + "loss": 0.10989620685577392, + "step": 17440 + }, + { + "epoch": 2.4769339957416605, + "grad_norm": 3.9579975605010986, + "learning_rate": 9.752434350603266e-05, + "loss": 0.10450366735458375, + "step": 17450 + }, + { + "epoch": 2.4783534421575584, + "grad_norm": 2.8820838928222656, + "learning_rate": 9.752292405961675e-05, + "loss": 0.09479145407676696, + "step": 17460 + }, + { + "epoch": 2.4797728885734562, + "grad_norm": 1.7476080656051636, + "learning_rate": 9.752150461320085e-05, + "loss": 0.12545448541641235, + "step": 17470 + }, 
+ { + "epoch": 2.481192334989354, + "grad_norm": 7.19633150100708, + "learning_rate": 9.752008516678496e-05, + "loss": 0.0939016044139862, + "step": 17480 + }, + { + "epoch": 2.482611781405252, + "grad_norm": 11.924422264099121, + "learning_rate": 9.751866572036906e-05, + "loss": 0.16066315174102783, + "step": 17490 + }, + { + "epoch": 2.48403122782115, + "grad_norm": 2.1974613666534424, + "learning_rate": 9.751724627395317e-05, + "loss": 0.1423601746559143, + "step": 17500 + }, + { + "epoch": 2.48403122782115, + "eval_accuracy": 0.9445539518026325, + "eval_loss": 0.1727043092250824, + "eval_runtime": 32.757, + "eval_samples_per_second": 480.111, + "eval_steps_per_second": 15.02, + "step": 17500 + }, + { + "epoch": 2.4854506742370477, + "grad_norm": 6.336993217468262, + "learning_rate": 9.751582682753725e-05, + "loss": 0.13694591522216798, + "step": 17510 + }, + { + "epoch": 2.486870120652945, + "grad_norm": 4.340056896209717, + "learning_rate": 9.751440738112137e-05, + "loss": 0.1784249186515808, + "step": 17520 + }, + { + "epoch": 2.488289567068843, + "grad_norm": 4.247930526733398, + "learning_rate": 9.751298793470546e-05, + "loss": 0.10544888973236084, + "step": 17530 + }, + { + "epoch": 2.489709013484741, + "grad_norm": 2.6260440349578857, + "learning_rate": 9.751156848828957e-05, + "loss": 0.0672307550907135, + "step": 17540 + }, + { + "epoch": 2.4911284599006387, + "grad_norm": 1.9838597774505615, + "learning_rate": 9.751014904187367e-05, + "loss": 0.14270519018173217, + "step": 17550 + }, + { + "epoch": 2.4925479063165366, + "grad_norm": 1.2045660018920898, + "learning_rate": 9.750872959545777e-05, + "loss": 0.15043948888778685, + "step": 17560 + }, + { + "epoch": 2.4939673527324344, + "grad_norm": 7.862235069274902, + "learning_rate": 9.750731014904188e-05, + "loss": 0.07321544885635375, + "step": 17570 + }, + { + "epoch": 2.4953867991483323, + "grad_norm": 6.350536823272705, + "learning_rate": 9.750589070262598e-05, + "loss": 0.11304857730865478, + 
"step": 17580 + }, + { + "epoch": 2.49680624556423, + "grad_norm": 0.9608795046806335, + "learning_rate": 9.750447125621009e-05, + "loss": 0.08769638538360595, + "step": 17590 + }, + { + "epoch": 2.4982256919801276, + "grad_norm": 4.455130100250244, + "learning_rate": 9.750305180979418e-05, + "loss": 0.06901848912239075, + "step": 17600 + }, + { + "epoch": 2.4996451383960254, + "grad_norm": 3.236755132675171, + "learning_rate": 9.75016323633783e-05, + "loss": 0.10142921209335327, + "step": 17610 + }, + { + "epoch": 2.5010645848119233, + "grad_norm": 0.9103105068206787, + "learning_rate": 9.750021291696238e-05, + "loss": 0.12128010988235474, + "step": 17620 + }, + { + "epoch": 2.502484031227821, + "grad_norm": 3.3010218143463135, + "learning_rate": 9.749879347054649e-05, + "loss": 0.09445170164108277, + "step": 17630 + }, + { + "epoch": 2.503903477643719, + "grad_norm": 5.537515163421631, + "learning_rate": 9.749737402413059e-05, + "loss": 0.051540815830230714, + "step": 17640 + }, + { + "epoch": 2.505322924059617, + "grad_norm": 6.594273090362549, + "learning_rate": 9.74959545777147e-05, + "loss": 0.11053000688552857, + "step": 17650 + }, + { + "epoch": 2.5067423704755143, + "grad_norm": 6.973751068115234, + "learning_rate": 9.749453513129881e-05, + "loss": 0.17602165937423705, + "step": 17660 + }, + { + "epoch": 2.5081618168914126, + "grad_norm": 1.8898471593856812, + "learning_rate": 9.74931156848829e-05, + "loss": 0.09699593782424927, + "step": 17670 + }, + { + "epoch": 2.50958126330731, + "grad_norm": 8.757147789001465, + "learning_rate": 9.7491696238467e-05, + "loss": 0.09828613996505738, + "step": 17680 + }, + { + "epoch": 2.511000709723208, + "grad_norm": 5.698178291320801, + "learning_rate": 9.74902767920511e-05, + "loss": 0.09792088270187378, + "step": 17690 + }, + { + "epoch": 2.5124201561391057, + "grad_norm": 2.3245534896850586, + "learning_rate": 9.748885734563521e-05, + "loss": 0.08730112314224243, + "step": 17700 + }, + { + "epoch": 
2.5138396025550036, + "grad_norm": 3.97782301902771, + "learning_rate": 9.748743789921931e-05, + "loss": 0.09204915165901184, + "step": 17710 + }, + { + "epoch": 2.5152590489709015, + "grad_norm": 2.635392904281616, + "learning_rate": 9.748601845280341e-05, + "loss": 0.08571889400482177, + "step": 17720 + }, + { + "epoch": 2.516678495386799, + "grad_norm": 4.555758476257324, + "learning_rate": 9.74845990063875e-05, + "loss": 0.10614382028579712, + "step": 17730 + }, + { + "epoch": 2.518097941802697, + "grad_norm": 6.458566665649414, + "learning_rate": 9.748317955997162e-05, + "loss": 0.1116061806678772, + "step": 17740 + }, + { + "epoch": 2.5195173882185946, + "grad_norm": 7.498642921447754, + "learning_rate": 9.748176011355573e-05, + "loss": 0.08102936148643494, + "step": 17750 + }, + { + "epoch": 2.5209368346344925, + "grad_norm": 8.974710464477539, + "learning_rate": 9.748034066713982e-05, + "loss": 0.15357725620269774, + "step": 17760 + }, + { + "epoch": 2.5223562810503903, + "grad_norm": 6.158868789672852, + "learning_rate": 9.747892122072392e-05, + "loss": 0.1006664514541626, + "step": 17770 + }, + { + "epoch": 2.523775727466288, + "grad_norm": 0.8831135630607605, + "learning_rate": 9.747750177430802e-05, + "loss": 0.07348037958145141, + "step": 17780 + }, + { + "epoch": 2.525195173882186, + "grad_norm": 8.365797996520996, + "learning_rate": 9.747608232789213e-05, + "loss": 0.09979128241539001, + "step": 17790 + }, + { + "epoch": 2.5266146202980835, + "grad_norm": 13.500819206237793, + "learning_rate": 9.747466288147623e-05, + "loss": 0.13896944522857665, + "step": 17800 + }, + { + "epoch": 2.528034066713982, + "grad_norm": 4.766392230987549, + "learning_rate": 9.747324343506034e-05, + "loss": 0.07932850122451782, + "step": 17810 + }, + { + "epoch": 2.529453513129879, + "grad_norm": 7.3413310050964355, + "learning_rate": 9.747182398864442e-05, + "loss": 0.07950088977813721, + "step": 17820 + }, + { + "epoch": 2.530872959545777, + "grad_norm": 
3.8923566341400146, + "learning_rate": 9.747040454222853e-05, + "loss": 0.09398716688156128, + "step": 17830 + }, + { + "epoch": 2.532292405961675, + "grad_norm": 5.209949970245361, + "learning_rate": 9.746898509581264e-05, + "loss": 0.11348887681961059, + "step": 17840 + }, + { + "epoch": 2.533711852377573, + "grad_norm": 8.087526321411133, + "learning_rate": 9.746756564939674e-05, + "loss": 0.13804304599761963, + "step": 17850 + }, + { + "epoch": 2.5351312987934707, + "grad_norm": 4.874515056610107, + "learning_rate": 9.746614620298085e-05, + "loss": 0.12363841533660888, + "step": 17860 + }, + { + "epoch": 2.536550745209368, + "grad_norm": 9.139041900634766, + "learning_rate": 9.746472675656494e-05, + "loss": 0.09068549871444702, + "step": 17870 + }, + { + "epoch": 2.5379701916252664, + "grad_norm": 6.489454746246338, + "learning_rate": 9.746330731014905e-05, + "loss": 0.1587399125099182, + "step": 17880 + }, + { + "epoch": 2.539389638041164, + "grad_norm": 9.474618911743164, + "learning_rate": 9.746188786373314e-05, + "loss": 0.13566343784332274, + "step": 17890 + }, + { + "epoch": 2.5408090844570617, + "grad_norm": 3.8730716705322266, + "learning_rate": 9.746046841731726e-05, + "loss": 0.08422473669052125, + "step": 17900 + }, + { + "epoch": 2.5422285308729595, + "grad_norm": 2.2097864151000977, + "learning_rate": 9.745904897090135e-05, + "loss": 0.13542672395706176, + "step": 17910 + }, + { + "epoch": 2.5436479772888574, + "grad_norm": 15.095120429992676, + "learning_rate": 9.745762952448545e-05, + "loss": 0.14511890411376954, + "step": 17920 + }, + { + "epoch": 2.5450674237047552, + "grad_norm": 12.847689628601074, + "learning_rate": 9.745621007806956e-05, + "loss": 0.0919945478439331, + "step": 17930 + }, + { + "epoch": 2.5464868701206527, + "grad_norm": 2.031590223312378, + "learning_rate": 9.745479063165366e-05, + "loss": 0.13927642107009888, + "step": 17940 + }, + { + "epoch": 2.547906316536551, + "grad_norm": 4.216944694519043, + "learning_rate": 
9.745337118523777e-05, + "loss": 0.10198723077774048, + "step": 17950 + }, + { + "epoch": 2.5493257629524484, + "grad_norm": 7.031200408935547, + "learning_rate": 9.745195173882187e-05, + "loss": 0.11566638946533203, + "step": 17960 + }, + { + "epoch": 2.5507452093683463, + "grad_norm": 5.59580135345459, + "learning_rate": 9.745053229240598e-05, + "loss": 0.0891038417816162, + "step": 17970 + }, + { + "epoch": 2.552164655784244, + "grad_norm": 8.706607818603516, + "learning_rate": 9.744911284599006e-05, + "loss": 0.09640666842460632, + "step": 17980 + }, + { + "epoch": 2.553584102200142, + "grad_norm": 3.204340934753418, + "learning_rate": 9.744769339957417e-05, + "loss": 0.10391557216644287, + "step": 17990 + }, + { + "epoch": 2.55500354861604, + "grad_norm": 6.2729573249816895, + "learning_rate": 9.744627395315827e-05, + "loss": 0.11966743469238281, + "step": 18000 + }, + { + "epoch": 2.55500354861604, + "eval_accuracy": 0.9338081007185096, + "eval_loss": 0.1904294788837433, + "eval_runtime": 32.5049, + "eval_samples_per_second": 483.835, + "eval_steps_per_second": 15.136, + "step": 18000 + }, + { + "epoch": 2.5564229950319377, + "grad_norm": 4.016758918762207, + "learning_rate": 9.744485450674238e-05, + "loss": 0.16458499431610107, + "step": 18010 + }, + { + "epoch": 2.5578424414478356, + "grad_norm": 9.767767906188965, + "learning_rate": 9.744343506032648e-05, + "loss": 0.11013137102127075, + "step": 18020 + }, + { + "epoch": 2.559261887863733, + "grad_norm": 10.628437042236328, + "learning_rate": 9.744201561391058e-05, + "loss": 0.1186720848083496, + "step": 18030 + }, + { + "epoch": 2.560681334279631, + "grad_norm": 4.2828545570373535, + "learning_rate": 9.744059616749469e-05, + "loss": 0.11388142108917236, + "step": 18040 + }, + { + "epoch": 2.5621007806955287, + "grad_norm": 5.870272636413574, + "learning_rate": 9.743917672107878e-05, + "loss": 0.09274822473526001, + "step": 18050 + }, + { + "epoch": 2.5635202271114266, + "grad_norm": 1.7781943082809448, + 
"learning_rate": 9.74377572746629e-05, + "loss": 0.10968050956726075, + "step": 18060 + }, + { + "epoch": 2.5649396735273244, + "grad_norm": 10.247567176818848, + "learning_rate": 9.743633782824699e-05, + "loss": 0.12503312826156615, + "step": 18070 + }, + { + "epoch": 2.5663591199432223, + "grad_norm": 5.602545261383057, + "learning_rate": 9.743491838183109e-05, + "loss": 0.09583965539932252, + "step": 18080 + }, + { + "epoch": 2.56777856635912, + "grad_norm": 1.3222918510437012, + "learning_rate": 9.743349893541519e-05, + "loss": 0.11057568788528442, + "step": 18090 + }, + { + "epoch": 2.5691980127750176, + "grad_norm": 2.3814685344696045, + "learning_rate": 9.74320794889993e-05, + "loss": 0.11936540603637695, + "step": 18100 + }, + { + "epoch": 2.5706174591909154, + "grad_norm": 2.4344863891601562, + "learning_rate": 9.74306600425834e-05, + "loss": 0.0944204032421112, + "step": 18110 + }, + { + "epoch": 2.5720369056068133, + "grad_norm": 8.206236839294434, + "learning_rate": 9.74292405961675e-05, + "loss": 0.08790295124053955, + "step": 18120 + }, + { + "epoch": 2.573456352022711, + "grad_norm": 6.2798566818237305, + "learning_rate": 9.74278211497516e-05, + "loss": 0.13661357164382934, + "step": 18130 + }, + { + "epoch": 2.574875798438609, + "grad_norm": 9.54171085357666, + "learning_rate": 9.74264017033357e-05, + "loss": 0.11890660524368286, + "step": 18140 + }, + { + "epoch": 2.576295244854507, + "grad_norm": 2.0758354663848877, + "learning_rate": 9.742498225691981e-05, + "loss": 0.14780707359313966, + "step": 18150 + }, + { + "epoch": 2.5777146912704048, + "grad_norm": 9.819342613220215, + "learning_rate": 9.742356281050391e-05, + "loss": 0.17009602785110473, + "step": 18160 + }, + { + "epoch": 2.579134137686302, + "grad_norm": 0.4771549105644226, + "learning_rate": 9.742214336408802e-05, + "loss": 0.09668282270431519, + "step": 18170 + }, + { + "epoch": 2.5805535841022, + "grad_norm": 3.620116710662842, + "learning_rate": 9.74207239176721e-05, + "loss": 
0.09066780805587768, + "step": 18180 + }, + { + "epoch": 2.581973030518098, + "grad_norm": 2.4723594188690186, + "learning_rate": 9.741930447125621e-05, + "loss": 0.09381983876228332, + "step": 18190 + }, + { + "epoch": 2.5833924769339958, + "grad_norm": 8.35051441192627, + "learning_rate": 9.741788502484031e-05, + "loss": 0.15823612213134766, + "step": 18200 + }, + { + "epoch": 2.5848119233498936, + "grad_norm": 5.235237121582031, + "learning_rate": 9.741646557842442e-05, + "loss": 0.1483514666557312, + "step": 18210 + }, + { + "epoch": 2.5862313697657915, + "grad_norm": 4.181369781494141, + "learning_rate": 9.741504613200852e-05, + "loss": 0.07576992511749267, + "step": 18220 + }, + { + "epoch": 2.5876508161816894, + "grad_norm": 7.384850025177002, + "learning_rate": 9.741362668559262e-05, + "loss": 0.07849894762039185, + "step": 18230 + }, + { + "epoch": 2.5890702625975868, + "grad_norm": 2.344217300415039, + "learning_rate": 9.741220723917673e-05, + "loss": 0.10990880727767945, + "step": 18240 + }, + { + "epoch": 2.5904897090134846, + "grad_norm": 5.363242149353027, + "learning_rate": 9.741078779276083e-05, + "loss": 0.1412426710128784, + "step": 18250 + }, + { + "epoch": 2.5919091554293825, + "grad_norm": 3.7980527877807617, + "learning_rate": 9.740936834634494e-05, + "loss": 0.10421816110610962, + "step": 18260 + }, + { + "epoch": 2.5933286018452804, + "grad_norm": 9.759673118591309, + "learning_rate": 9.740794889992903e-05, + "loss": 0.11693978309631348, + "step": 18270 + }, + { + "epoch": 2.594748048261178, + "grad_norm": 2.0219240188598633, + "learning_rate": 9.740652945351315e-05, + "loss": 0.12884674072265626, + "step": 18280 + }, + { + "epoch": 2.596167494677076, + "grad_norm": 9.535964012145996, + "learning_rate": 9.740511000709723e-05, + "loss": 0.12031383514404297, + "step": 18290 + }, + { + "epoch": 2.597586941092974, + "grad_norm": 5.354515552520752, + "learning_rate": 9.740369056068134e-05, + "loss": 0.0845773994922638, + "step": 18300 + }, + { + 
"epoch": 2.5990063875088714, + "grad_norm": 1.1112140417099, + "learning_rate": 9.740227111426544e-05, + "loss": 0.1002803087234497, + "step": 18310 + }, + { + "epoch": 2.6004258339247692, + "grad_norm": 2.0215070247650146, + "learning_rate": 9.740085166784955e-05, + "loss": 0.10047941207885742, + "step": 18320 + }, + { + "epoch": 2.601845280340667, + "grad_norm": 6.67712926864624, + "learning_rate": 9.739943222143365e-05, + "loss": 0.13017858266830445, + "step": 18330 + }, + { + "epoch": 2.603264726756565, + "grad_norm": 10.4568452835083, + "learning_rate": 9.739801277501774e-05, + "loss": 0.19226794242858886, + "step": 18340 + }, + { + "epoch": 2.604684173172463, + "grad_norm": 6.936629772186279, + "learning_rate": 9.739659332860185e-05, + "loss": 0.1478518009185791, + "step": 18350 + }, + { + "epoch": 2.6061036195883607, + "grad_norm": 0.7439237236976624, + "learning_rate": 9.739517388218595e-05, + "loss": 0.11475565433502197, + "step": 18360 + }, + { + "epoch": 2.6075230660042585, + "grad_norm": 6.165897369384766, + "learning_rate": 9.739375443577006e-05, + "loss": 0.13509042263031007, + "step": 18370 + }, + { + "epoch": 2.608942512420156, + "grad_norm": 5.026000022888184, + "learning_rate": 9.739233498935416e-05, + "loss": 0.11895132064819336, + "step": 18380 + }, + { + "epoch": 2.610361958836054, + "grad_norm": 4.722821235656738, + "learning_rate": 9.739091554293826e-05, + "loss": 0.15483348369598388, + "step": 18390 + }, + { + "epoch": 2.6117814052519517, + "grad_norm": 4.340688705444336, + "learning_rate": 9.738949609652235e-05, + "loss": 0.09090102910995483, + "step": 18400 + }, + { + "epoch": 2.6132008516678495, + "grad_norm": 1.8677579164505005, + "learning_rate": 9.738807665010647e-05, + "loss": 0.12864718437194825, + "step": 18410 + }, + { + "epoch": 2.6146202980837474, + "grad_norm": 4.120899200439453, + "learning_rate": 9.738665720369056e-05, + "loss": 0.10905364751815796, + "step": 18420 + }, + { + "epoch": 2.6160397444996453, + "grad_norm": 
2.1230714321136475, + "learning_rate": 9.738523775727467e-05, + "loss": 0.11330556869506836, + "step": 18430 + }, + { + "epoch": 2.617459190915543, + "grad_norm": 7.033359527587891, + "learning_rate": 9.738381831085877e-05, + "loss": 0.08752457499504089, + "step": 18440 + }, + { + "epoch": 2.6188786373314406, + "grad_norm": 5.958856105804443, + "learning_rate": 9.738239886444287e-05, + "loss": 0.07405679225921631, + "step": 18450 + }, + { + "epoch": 2.6202980837473384, + "grad_norm": 3.3164892196655273, + "learning_rate": 9.738097941802698e-05, + "loss": 0.049712374806404114, + "step": 18460 + }, + { + "epoch": 2.6217175301632363, + "grad_norm": 5.792750358581543, + "learning_rate": 9.737955997161108e-05, + "loss": 0.11241586208343506, + "step": 18470 + }, + { + "epoch": 2.623136976579134, + "grad_norm": 5.713932514190674, + "learning_rate": 9.737814052519519e-05, + "loss": 0.0947425127029419, + "step": 18480 + }, + { + "epoch": 2.624556422995032, + "grad_norm": 5.652758598327637, + "learning_rate": 9.737672107877927e-05, + "loss": 0.09331372976303101, + "step": 18490 + }, + { + "epoch": 2.62597586941093, + "grad_norm": 4.281705856323242, + "learning_rate": 9.737530163236338e-05, + "loss": 0.09365745782852172, + "step": 18500 + }, + { + "epoch": 2.62597586941093, + "eval_accuracy": 0.950721688815413, + "eval_loss": 0.1462646871805191, + "eval_runtime": 34.5176, + "eval_samples_per_second": 455.623, + "eval_steps_per_second": 14.254, + "step": 18500 + }, + { + "epoch": 2.6273953158268277, + "grad_norm": 4.376514911651611, + "learning_rate": 9.737388218594748e-05, + "loss": 0.06313493251800537, + "step": 18510 + }, + { + "epoch": 2.628814762242725, + "grad_norm": 7.006924629211426, + "learning_rate": 9.737246273953159e-05, + "loss": 0.1129868745803833, + "step": 18520 + }, + { + "epoch": 2.630234208658623, + "grad_norm": 6.207458972930908, + "learning_rate": 9.737104329311569e-05, + "loss": 0.15238604545593262, + "step": 18530 + }, + { + "epoch": 2.631653655074521, + 
"grad_norm": 0.35649651288986206, + "learning_rate": 9.736962384669979e-05, + "loss": 0.1252423644065857, + "step": 18540 + }, + { + "epoch": 2.6330731014904187, + "grad_norm": 4.224631309509277, + "learning_rate": 9.73682044002839e-05, + "loss": 0.11180676221847534, + "step": 18550 + }, + { + "epoch": 2.6344925479063166, + "grad_norm": 6.666781425476074, + "learning_rate": 9.7366784953868e-05, + "loss": 0.09207946062088013, + "step": 18560 + }, + { + "epoch": 2.6359119943222145, + "grad_norm": 5.663329124450684, + "learning_rate": 9.73653655074521e-05, + "loss": 0.09166657328605651, + "step": 18570 + }, + { + "epoch": 2.6373314407381123, + "grad_norm": 4.614907741546631, + "learning_rate": 9.73639460610362e-05, + "loss": 0.08460969924926758, + "step": 18580 + }, + { + "epoch": 2.6387508871540097, + "grad_norm": 4.568515300750732, + "learning_rate": 9.73625266146203e-05, + "loss": 0.0926063060760498, + "step": 18590 + }, + { + "epoch": 2.6401703335699076, + "grad_norm": 4.265593528747559, + "learning_rate": 9.73611071682044e-05, + "loss": 0.14236600399017335, + "step": 18600 + }, + { + "epoch": 2.6415897799858055, + "grad_norm": 3.393044948577881, + "learning_rate": 9.735968772178851e-05, + "loss": 0.06547205448150635, + "step": 18610 + }, + { + "epoch": 2.6430092264017033, + "grad_norm": 2.976576328277588, + "learning_rate": 9.73582682753726e-05, + "loss": 0.07752239108085632, + "step": 18620 + }, + { + "epoch": 2.644428672817601, + "grad_norm": 5.691226959228516, + "learning_rate": 9.735684882895672e-05, + "loss": 0.10452626943588257, + "step": 18630 + }, + { + "epoch": 2.645848119233499, + "grad_norm": 6.348296642303467, + "learning_rate": 9.735542938254081e-05, + "loss": 0.11550105810165405, + "step": 18640 + }, + { + "epoch": 2.647267565649397, + "grad_norm": 9.737822532653809, + "learning_rate": 9.735400993612491e-05, + "loss": 0.12678935527801513, + "step": 18650 + }, + { + "epoch": 2.6486870120652943, + "grad_norm": 1.7993618249893188, + "learning_rate": 
9.735259048970902e-05, + "loss": 0.09803841710090637, + "step": 18660 + }, + { + "epoch": 2.650106458481192, + "grad_norm": 5.785006523132324, + "learning_rate": 9.735117104329312e-05, + "loss": 0.12399122714996338, + "step": 18670 + }, + { + "epoch": 2.65152590489709, + "grad_norm": 5.436007976531982, + "learning_rate": 9.734975159687723e-05, + "loss": 0.11214399337768555, + "step": 18680 + }, + { + "epoch": 2.652945351312988, + "grad_norm": 6.046454429626465, + "learning_rate": 9.734833215046133e-05, + "loss": 0.08356254100799561, + "step": 18690 + }, + { + "epoch": 2.654364797728886, + "grad_norm": 7.5290021896362305, + "learning_rate": 9.734691270404542e-05, + "loss": 0.101429283618927, + "step": 18700 + }, + { + "epoch": 2.6557842441447836, + "grad_norm": 3.0168631076812744, + "learning_rate": 9.734549325762952e-05, + "loss": 0.09058440327644349, + "step": 18710 + }, + { + "epoch": 2.6572036905606815, + "grad_norm": 8.676300048828125, + "learning_rate": 9.734407381121363e-05, + "loss": 0.13883825540542602, + "step": 18720 + }, + { + "epoch": 2.658623136976579, + "grad_norm": 10.840899467468262, + "learning_rate": 9.734265436479773e-05, + "loss": 0.11511178016662597, + "step": 18730 + }, + { + "epoch": 2.660042583392477, + "grad_norm": 2.287022113800049, + "learning_rate": 9.734123491838184e-05, + "loss": 0.10089895725250245, + "step": 18740 + }, + { + "epoch": 2.6614620298083747, + "grad_norm": 5.894728183746338, + "learning_rate": 9.733981547196594e-05, + "loss": 0.10052759647369384, + "step": 18750 + }, + { + "epoch": 2.6628814762242725, + "grad_norm": 3.954016923904419, + "learning_rate": 9.733839602555004e-05, + "loss": 0.1216499924659729, + "step": 18760 + }, + { + "epoch": 2.6643009226401704, + "grad_norm": 9.103641510009766, + "learning_rate": 9.733697657913415e-05, + "loss": 0.10710169076919555, + "step": 18770 + }, + { + "epoch": 2.6657203690560682, + "grad_norm": 0.37438610196113586, + "learning_rate": 9.733555713271824e-05, + "loss": 
0.08723070025444031, + "step": 18780 + }, + { + "epoch": 2.667139815471966, + "grad_norm": 9.926944732666016, + "learning_rate": 9.733413768630236e-05, + "loss": 0.12807276248931884, + "step": 18790 + }, + { + "epoch": 2.6685592618878635, + "grad_norm": 9.92432689666748, + "learning_rate": 9.733271823988644e-05, + "loss": 0.18386597633361818, + "step": 18800 + }, + { + "epoch": 2.6699787083037614, + "grad_norm": 4.39555549621582, + "learning_rate": 9.733129879347055e-05, + "loss": 0.10847448110580445, + "step": 18810 + }, + { + "epoch": 2.6713981547196592, + "grad_norm": 4.371532440185547, + "learning_rate": 9.732987934705465e-05, + "loss": 0.11950172185897827, + "step": 18820 + }, + { + "epoch": 2.672817601135557, + "grad_norm": 3.7563788890838623, + "learning_rate": 9.732845990063876e-05, + "loss": 0.11064698696136474, + "step": 18830 + }, + { + "epoch": 2.674237047551455, + "grad_norm": 8.16103458404541, + "learning_rate": 9.732704045422286e-05, + "loss": 0.1522403836250305, + "step": 18840 + }, + { + "epoch": 2.675656493967353, + "grad_norm": 2.7513720989227295, + "learning_rate": 9.732562100780695e-05, + "loss": 0.14767955541610717, + "step": 18850 + }, + { + "epoch": 2.6770759403832507, + "grad_norm": 4.588718891143799, + "learning_rate": 9.732420156139106e-05, + "loss": 0.11084201335906982, + "step": 18860 + }, + { + "epoch": 2.678495386799148, + "grad_norm": 3.071213722229004, + "learning_rate": 9.732278211497516e-05, + "loss": 0.15097259283065795, + "step": 18870 + }, + { + "epoch": 2.679914833215046, + "grad_norm": 6.630822658538818, + "learning_rate": 9.732136266855927e-05, + "loss": 0.09166755676269531, + "step": 18880 + }, + { + "epoch": 2.681334279630944, + "grad_norm": 7.124295711517334, + "learning_rate": 9.731994322214337e-05, + "loss": 0.14961253404617308, + "step": 18890 + }, + { + "epoch": 2.6827537260468417, + "grad_norm": 8.885273933410645, + "learning_rate": 9.731852377572747e-05, + "loss": 0.1840854525566101, + "step": 18900 + }, + { + 
"epoch": 2.6841731724627396, + "grad_norm": 14.617013931274414, + "learning_rate": 9.731710432931156e-05, + "loss": 0.15676331520080566, + "step": 18910 + }, + { + "epoch": 2.6855926188786374, + "grad_norm": 5.9459452629089355, + "learning_rate": 9.731568488289568e-05, + "loss": 0.13418021202087402, + "step": 18920 + }, + { + "epoch": 2.6870120652945353, + "grad_norm": 1.0808570384979248, + "learning_rate": 9.731426543647977e-05, + "loss": 0.15757611989974976, + "step": 18930 + }, + { + "epoch": 2.6884315117104327, + "grad_norm": 1.862561583518982, + "learning_rate": 9.731284599006388e-05, + "loss": 0.09019602537155151, + "step": 18940 + }, + { + "epoch": 2.6898509581263306, + "grad_norm": 2.4577274322509766, + "learning_rate": 9.731142654364798e-05, + "loss": 0.06294019222259521, + "step": 18950 + }, + { + "epoch": 2.6912704045422284, + "grad_norm": 3.2663893699645996, + "learning_rate": 9.731000709723208e-05, + "loss": 0.06696848869323731, + "step": 18960 + }, + { + "epoch": 2.6926898509581263, + "grad_norm": 1.4709694385528564, + "learning_rate": 9.730858765081619e-05, + "loss": 0.061003082990646364, + "step": 18970 + }, + { + "epoch": 2.694109297374024, + "grad_norm": 2.4802117347717285, + "learning_rate": 9.730716820440029e-05, + "loss": 0.10601764917373657, + "step": 18980 + }, + { + "epoch": 2.695528743789922, + "grad_norm": 5.821985244750977, + "learning_rate": 9.73057487579844e-05, + "loss": 0.12596286535263063, + "step": 18990 + }, + { + "epoch": 2.69694819020582, + "grad_norm": 4.4037981033325195, + "learning_rate": 9.73043293115685e-05, + "loss": 0.08721169829368591, + "step": 19000 + }, + { + "epoch": 2.69694819020582, + "eval_accuracy": 0.9462071596617282, + "eval_loss": 0.1497952938079834, + "eval_runtime": 35.1407, + "eval_samples_per_second": 447.544, + "eval_steps_per_second": 14.001, + "step": 19000 + }, + { + "epoch": 2.6983676366217173, + "grad_norm": 5.155467987060547, + "learning_rate": 9.730290986515259e-05, + "loss": 0.16012940406799317, + 
"step": 19010 + }, + { + "epoch": 2.699787083037615, + "grad_norm": 6.539963245391846, + "learning_rate": 9.730149041873669e-05, + "loss": 0.13179491758346557, + "step": 19020 + }, + { + "epoch": 2.701206529453513, + "grad_norm": 5.117822647094727, + "learning_rate": 9.73000709723208e-05, + "loss": 0.11193997859954834, + "step": 19030 + }, + { + "epoch": 2.702625975869411, + "grad_norm": 13.319026947021484, + "learning_rate": 9.72986515259049e-05, + "loss": 0.06884243488311767, + "step": 19040 + }, + { + "epoch": 2.7040454222853088, + "grad_norm": 12.856066703796387, + "learning_rate": 9.729723207948901e-05, + "loss": 0.11155580282211304, + "step": 19050 + }, + { + "epoch": 2.7054648687012066, + "grad_norm": 3.3367395401000977, + "learning_rate": 9.72958126330731e-05, + "loss": 0.11018801927566528, + "step": 19060 + }, + { + "epoch": 2.7068843151171045, + "grad_norm": 2.5702414512634277, + "learning_rate": 9.72943931866572e-05, + "loss": 0.14847090244293212, + "step": 19070 + }, + { + "epoch": 2.708303761533002, + "grad_norm": 3.5079307556152344, + "learning_rate": 9.729297374024131e-05, + "loss": 0.12648016214370728, + "step": 19080 + }, + { + "epoch": 2.7097232079489, + "grad_norm": 7.1927642822265625, + "learning_rate": 9.729155429382541e-05, + "loss": 0.08001441359519959, + "step": 19090 + }, + { + "epoch": 2.7111426543647976, + "grad_norm": 2.3428845405578613, + "learning_rate": 9.729013484740952e-05, + "loss": 0.07565593719482422, + "step": 19100 + }, + { + "epoch": 2.7125621007806955, + "grad_norm": 5.344996929168701, + "learning_rate": 9.728871540099361e-05, + "loss": 0.06011520624160767, + "step": 19110 + }, + { + "epoch": 2.7139815471965933, + "grad_norm": 3.558228015899658, + "learning_rate": 9.728729595457772e-05, + "loss": 0.13906779289245605, + "step": 19120 + }, + { + "epoch": 2.715400993612491, + "grad_norm": 2.2271339893341064, + "learning_rate": 9.728587650816182e-05, + "loss": 0.06516092419624328, + "step": 19130 + }, + { + "epoch": 
2.716820440028389, + "grad_norm": 6.620656490325928, + "learning_rate": 9.728445706174593e-05, + "loss": 0.08588937520980836, + "step": 19140 + }, + { + "epoch": 2.7182398864442865, + "grad_norm": 0.9995052218437195, + "learning_rate": 9.728303761533004e-05, + "loss": 0.07684165835380555, + "step": 19150 + }, + { + "epoch": 2.719659332860185, + "grad_norm": 2.3631653785705566, + "learning_rate": 9.728161816891412e-05, + "loss": 0.08287461400032044, + "step": 19160 + }, + { + "epoch": 2.721078779276082, + "grad_norm": 6.304315567016602, + "learning_rate": 9.728019872249823e-05, + "loss": 0.14411957263946534, + "step": 19170 + }, + { + "epoch": 2.72249822569198, + "grad_norm": 2.651029109954834, + "learning_rate": 9.727877927608233e-05, + "loss": 0.16562498807907106, + "step": 19180 + }, + { + "epoch": 2.723917672107878, + "grad_norm": 1.1602712869644165, + "learning_rate": 9.727735982966644e-05, + "loss": 0.0994363009929657, + "step": 19190 + }, + { + "epoch": 2.725337118523776, + "grad_norm": 2.081709384918213, + "learning_rate": 9.727594038325054e-05, + "loss": 0.1161266803741455, + "step": 19200 + }, + { + "epoch": 2.7267565649396737, + "grad_norm": 5.32574462890625, + "learning_rate": 9.727452093683463e-05, + "loss": 0.11266434192657471, + "step": 19210 + }, + { + "epoch": 2.728176011355571, + "grad_norm": 4.33624267578125, + "learning_rate": 9.727310149041873e-05, + "loss": 0.07457006573677064, + "step": 19220 + }, + { + "epoch": 2.7295954577714694, + "grad_norm": 9.516417503356934, + "learning_rate": 9.727168204400284e-05, + "loss": 0.06251566410064698, + "step": 19230 + }, + { + "epoch": 2.731014904187367, + "grad_norm": 7.441606044769287, + "learning_rate": 9.727026259758695e-05, + "loss": 0.11953941583633423, + "step": 19240 + }, + { + "epoch": 2.7324343506032647, + "grad_norm": 0.9915375113487244, + "learning_rate": 9.726884315117105e-05, + "loss": 0.10013129711151122, + "step": 19250 + }, + { + "epoch": 2.7338537970191625, + "grad_norm": 
6.937955379486084, + "learning_rate": 9.726742370475515e-05, + "loss": 0.13717392683029175, + "step": 19260 + }, + { + "epoch": 2.7352732434350604, + "grad_norm": 6.149573802947998, + "learning_rate": 9.726600425833925e-05, + "loss": 0.11093438863754272, + "step": 19270 + }, + { + "epoch": 2.7366926898509583, + "grad_norm": 4.646894454956055, + "learning_rate": 9.726458481192336e-05, + "loss": 0.15733885765075684, + "step": 19280 + }, + { + "epoch": 2.7381121362668557, + "grad_norm": 5.516530513763428, + "learning_rate": 9.726316536550745e-05, + "loss": 0.06147825121879578, + "step": 19290 + }, + { + "epoch": 2.739531582682754, + "grad_norm": 3.121425151824951, + "learning_rate": 9.726174591909157e-05, + "loss": 0.06866928935050964, + "step": 19300 + }, + { + "epoch": 2.7409510290986514, + "grad_norm": 7.502362251281738, + "learning_rate": 9.726032647267565e-05, + "loss": 0.08418467044830322, + "step": 19310 + }, + { + "epoch": 2.7423704755145493, + "grad_norm": 2.791508436203003, + "learning_rate": 9.725890702625976e-05, + "loss": 0.11801939010620117, + "step": 19320 + }, + { + "epoch": 2.743789921930447, + "grad_norm": 7.064516544342041, + "learning_rate": 9.725748757984387e-05, + "loss": 0.13972241878509523, + "step": 19330 + }, + { + "epoch": 2.745209368346345, + "grad_norm": 5.9328932762146, + "learning_rate": 9.725606813342797e-05, + "loss": 0.12251147031784057, + "step": 19340 + }, + { + "epoch": 2.746628814762243, + "grad_norm": 6.175622940063477, + "learning_rate": 9.725464868701208e-05, + "loss": 0.06602987051010131, + "step": 19350 + }, + { + "epoch": 2.7480482611781403, + "grad_norm": 4.53786563873291, + "learning_rate": 9.725322924059618e-05, + "loss": 0.1297551393508911, + "step": 19360 + }, + { + "epoch": 2.7494677075940386, + "grad_norm": 3.098621368408203, + "learning_rate": 9.725180979418027e-05, + "loss": 0.1370749831199646, + "step": 19370 + }, + { + "epoch": 2.750887154009936, + "grad_norm": 3.015416383743286, + "learning_rate": 
9.725039034776437e-05, + "loss": 0.12202317714691162, + "step": 19380 + }, + { + "epoch": 2.752306600425834, + "grad_norm": 2.518812656402588, + "learning_rate": 9.724897090134848e-05, + "loss": 0.08936739563941956, + "step": 19390 + }, + { + "epoch": 2.7537260468417317, + "grad_norm": 6.073837757110596, + "learning_rate": 9.724755145493258e-05, + "loss": 0.1370900511741638, + "step": 19400 + }, + { + "epoch": 2.7551454932576296, + "grad_norm": 5.372803211212158, + "learning_rate": 9.724613200851669e-05, + "loss": 0.16160420179367066, + "step": 19410 + }, + { + "epoch": 2.7565649396735274, + "grad_norm": 3.8927814960479736, + "learning_rate": 9.724471256210079e-05, + "loss": 0.18655315637588502, + "step": 19420 + }, + { + "epoch": 2.757984386089425, + "grad_norm": 6.601566314697266, + "learning_rate": 9.724329311568489e-05, + "loss": 0.06503421068191528, + "step": 19430 + }, + { + "epoch": 2.759403832505323, + "grad_norm": 8.965290069580078, + "learning_rate": 9.7241873669269e-05, + "loss": 0.15749263763427734, + "step": 19440 + }, + { + "epoch": 2.7608232789212206, + "grad_norm": 6.057149410247803, + "learning_rate": 9.72404542228531e-05, + "loss": 0.09035987257957459, + "step": 19450 + }, + { + "epoch": 2.7622427253371185, + "grad_norm": 3.8677871227264404, + "learning_rate": 9.72390347764372e-05, + "loss": 0.09661787152290344, + "step": 19460 + }, + { + "epoch": 2.7636621717530163, + "grad_norm": 1.7954285144805908, + "learning_rate": 9.723761533002129e-05, + "loss": 0.11428978443145751, + "step": 19470 + }, + { + "epoch": 2.765081618168914, + "grad_norm": 8.921133041381836, + "learning_rate": 9.72361958836054e-05, + "loss": 0.14268529415130615, + "step": 19480 + }, + { + "epoch": 2.766501064584812, + "grad_norm": 0.6554881930351257, + "learning_rate": 9.72347764371895e-05, + "loss": 0.05844693183898926, + "step": 19490 + }, + { + "epoch": 2.7679205110007095, + "grad_norm": 1.2021902799606323, + "learning_rate": 9.723335699077361e-05, + "loss": 
0.048795363306999205, + "step": 19500 + }, + { + "epoch": 2.7679205110007095, + "eval_accuracy": 0.9642016913588097, + "eval_loss": 0.10457975417375565, + "eval_runtime": 32.1695, + "eval_samples_per_second": 488.879, + "eval_steps_per_second": 15.294, + "step": 19500 + }, + { + "epoch": 2.7693399574166078, + "grad_norm": 3.3482987880706787, + "learning_rate": 9.72319375443577e-05, + "loss": 0.08013315200805664, + "step": 19510 + }, + { + "epoch": 2.770759403832505, + "grad_norm": 7.4644036293029785, + "learning_rate": 9.72305180979418e-05, + "loss": 0.12772181034088134, + "step": 19520 + }, + { + "epoch": 2.772178850248403, + "grad_norm": 4.970337390899658, + "learning_rate": 9.722909865152591e-05, + "loss": 0.08325361609458923, + "step": 19530 + }, + { + "epoch": 2.773598296664301, + "grad_norm": 5.109130382537842, + "learning_rate": 9.722767920511001e-05, + "loss": 0.12823007106781006, + "step": 19540 + }, + { + "epoch": 2.7750177430801988, + "grad_norm": null, + "learning_rate": 9.722625975869412e-05, + "loss": 0.07545018792152405, + "step": 19550 + }, + { + "epoch": 2.7764371894960966, + "grad_norm": 2.3274765014648438, + "learning_rate": 9.72249822569198e-05, + "loss": 0.09213562607765198, + "step": 19560 + }, + { + "epoch": 2.777856635911994, + "grad_norm": 1.3119785785675049, + "learning_rate": 9.72235628105039e-05, + "loss": 0.09134193658828735, + "step": 19570 + }, + { + "epoch": 2.7792760823278924, + "grad_norm": 1.7308454513549805, + "learning_rate": 9.722214336408801e-05, + "loss": 0.07336680889129639, + "step": 19580 + }, + { + "epoch": 2.78069552874379, + "grad_norm": 5.1270623207092285, + "learning_rate": 9.722072391767211e-05, + "loss": 0.10246673822402955, + "step": 19590 + }, + { + "epoch": 2.7821149751596876, + "grad_norm": 8.638457298278809, + "learning_rate": 9.721930447125621e-05, + "loss": 0.15175464153289794, + "step": 19600 + }, + { + "epoch": 2.7835344215755855, + "grad_norm": 2.7487826347351074, + "learning_rate": 
9.721788502484032e-05, + "loss": 0.09026304483413697, + "step": 19610 + }, + { + "epoch": 2.7849538679914834, + "grad_norm": 1.0804003477096558, + "learning_rate": 9.721646557842442e-05, + "loss": 0.1334142804145813, + "step": 19620 + }, + { + "epoch": 2.7863733144073812, + "grad_norm": 4.871701717376709, + "learning_rate": 9.721504613200853e-05, + "loss": 0.0774698793888092, + "step": 19630 + }, + { + "epoch": 2.7877927608232786, + "grad_norm": 5.122735500335693, + "learning_rate": 9.721362668559261e-05, + "loss": 0.0750051498413086, + "step": 19640 + }, + { + "epoch": 2.789212207239177, + "grad_norm": 4.928715705871582, + "learning_rate": 9.721220723917672e-05, + "loss": 0.10383319854736328, + "step": 19650 + }, + { + "epoch": 2.7906316536550744, + "grad_norm": 4.654665470123291, + "learning_rate": 9.721078779276082e-05, + "loss": 0.07332990169525147, + "step": 19660 + }, + { + "epoch": 2.7920511000709722, + "grad_norm": 9.121614456176758, + "learning_rate": 9.720936834634493e-05, + "loss": 0.17799346446990966, + "step": 19670 + }, + { + "epoch": 2.79347054648687, + "grad_norm": 0.8097667694091797, + "learning_rate": 9.720794889992903e-05, + "loss": 0.13993927240371704, + "step": 19680 + }, + { + "epoch": 2.794889992902768, + "grad_norm": 6.301029682159424, + "learning_rate": 9.720652945351314e-05, + "loss": 0.049062016606330874, + "step": 19690 + }, + { + "epoch": 2.796309439318666, + "grad_norm": 7.916932582855225, + "learning_rate": 9.720511000709724e-05, + "loss": 0.13611079454421998, + "step": 19700 + }, + { + "epoch": 2.7977288857345637, + "grad_norm": 6.278209209442139, + "learning_rate": 9.720369056068134e-05, + "loss": 0.12774984836578368, + "step": 19710 + }, + { + "epoch": 2.7991483321504615, + "grad_norm": 8.645759582519531, + "learning_rate": 9.720227111426545e-05, + "loss": 0.09328774213790894, + "step": 19720 + }, + { + "epoch": 2.800567778566359, + "grad_norm": 3.0282325744628906, + "learning_rate": 9.720085166784954e-05, + "loss": 
0.0923624575138092, + "step": 19730 + }, + { + "epoch": 2.801987224982257, + "grad_norm": 4.2578444480896, + "learning_rate": 9.719943222143365e-05, + "loss": 0.09177879095077515, + "step": 19740 + }, + { + "epoch": 2.8034066713981547, + "grad_norm": 7.6798996925354, + "learning_rate": 9.719801277501774e-05, + "loss": 0.12493581771850586, + "step": 19750 + }, + { + "epoch": 2.8048261178140526, + "grad_norm": 4.347507953643799, + "learning_rate": 9.719659332860185e-05, + "loss": 0.09963855147361755, + "step": 19760 + }, + { + "epoch": 2.8062455642299504, + "grad_norm": 4.931194305419922, + "learning_rate": 9.719517388218595e-05, + "loss": 0.07842986583709717, + "step": 19770 + }, + { + "epoch": 2.8076650106458483, + "grad_norm": 4.186477184295654, + "learning_rate": 9.719375443577006e-05, + "loss": 0.12233660221099854, + "step": 19780 + }, + { + "epoch": 2.809084457061746, + "grad_norm": 7.659719944000244, + "learning_rate": 9.719233498935415e-05, + "loss": 0.09655895829200745, + "step": 19790 + }, + { + "epoch": 2.8105039034776436, + "grad_norm": 0.47399571537971497, + "learning_rate": 9.719091554293825e-05, + "loss": 0.07599647045135498, + "step": 19800 + }, + { + "epoch": 2.8119233498935414, + "grad_norm": 4.59540319442749, + "learning_rate": 9.718949609652236e-05, + "loss": 0.07412179708480834, + "step": 19810 + }, + { + "epoch": 2.8133427963094393, + "grad_norm": 8.436945915222168, + "learning_rate": 9.718807665010646e-05, + "loss": 0.10687708854675293, + "step": 19820 + }, + { + "epoch": 2.814762242725337, + "grad_norm": 4.068880081176758, + "learning_rate": 9.718665720369057e-05, + "loss": 0.04072721004486084, + "step": 19830 + }, + { + "epoch": 2.816181689141235, + "grad_norm": 8.406689643859863, + "learning_rate": 9.718523775727467e-05, + "loss": 0.09728883504867554, + "step": 19840 + }, + { + "epoch": 2.817601135557133, + "grad_norm": 2.9611806869506836, + "learning_rate": 9.718381831085877e-05, + "loss": 0.0824375331401825, + "step": 19850 + }, + { + 
"epoch": 2.8190205819730307, + "grad_norm": 8.75788402557373, + "learning_rate": 9.718239886444286e-05, + "loss": 0.13575732707977295, + "step": 19860 + }, + { + "epoch": 2.820440028388928, + "grad_norm": 2.153355598449707, + "learning_rate": 9.718097941802697e-05, + "loss": 0.0826115369796753, + "step": 19870 + }, + { + "epoch": 2.821859474804826, + "grad_norm": 5.776090145111084, + "learning_rate": 9.717955997161107e-05, + "loss": 0.07727134227752686, + "step": 19880 + }, + { + "epoch": 2.823278921220724, + "grad_norm": 10.297713279724121, + "learning_rate": 9.717814052519518e-05, + "loss": 0.08978387117385864, + "step": 19890 + }, + { + "epoch": 2.8246983676366217, + "grad_norm": 4.710965156555176, + "learning_rate": 9.717672107877928e-05, + "loss": 0.14321819543838502, + "step": 19900 + }, + { + "epoch": 2.8261178140525196, + "grad_norm": 4.13072395324707, + "learning_rate": 9.717530163236338e-05, + "loss": 0.15760390758514403, + "step": 19910 + }, + { + "epoch": 2.8275372604684175, + "grad_norm": 0.497278094291687, + "learning_rate": 9.717388218594749e-05, + "loss": 0.08274838328361511, + "step": 19920 + }, + { + "epoch": 2.8289567068843153, + "grad_norm": 7.707274913787842, + "learning_rate": 9.717246273953159e-05, + "loss": 0.09570494294166565, + "step": 19930 + }, + { + "epoch": 2.8303761533002127, + "grad_norm": 5.368363857269287, + "learning_rate": 9.71710432931157e-05, + "loss": 0.07190582752227784, + "step": 19940 + }, + { + "epoch": 2.8317955997161106, + "grad_norm": 7.027709484100342, + "learning_rate": 9.716962384669978e-05, + "loss": 0.11582446098327637, + "step": 19950 + }, + { + "epoch": 2.8332150461320085, + "grad_norm": 12.213539123535156, + "learning_rate": 9.716820440028389e-05, + "loss": 0.10933125019073486, + "step": 19960 + }, + { + "epoch": 2.8346344925479063, + "grad_norm": 6.922082901000977, + "learning_rate": 9.716678495386799e-05, + "loss": 0.16551480293273926, + "step": 19970 + }, + { + "epoch": 2.836053938963804, + "grad_norm": 
3.005093812942505, + "learning_rate": 9.71653655074521e-05, + "loss": 0.09381322860717774, + "step": 19980 + }, + { + "epoch": 2.837473385379702, + "grad_norm": 5.592711925506592, + "learning_rate": 9.716394606103621e-05, + "loss": 0.11934515237808227, + "step": 19990 + }, + { + "epoch": 2.8388928317956, + "grad_norm": 2.7002058029174805, + "learning_rate": 9.71625266146203e-05, + "loss": 0.06390081644058228, + "step": 20000 + }, + { + "epoch": 2.8388928317956, + "eval_accuracy": 0.9484326317797418, + "eval_loss": 0.14968876540660858, + "eval_runtime": 33.8107, + "eval_samples_per_second": 465.148, + "eval_steps_per_second": 14.552, + "step": 20000 + }, + { + "epoch": 2.8403122782114973, + "grad_norm": 4.695428371429443, + "learning_rate": 9.71611071682044e-05, + "loss": 0.11333894729614258, + "step": 20010 + }, + { + "epoch": 2.841731724627395, + "grad_norm": 0.6784132719039917, + "learning_rate": 9.71596877217885e-05, + "loss": 0.09425503015518188, + "step": 20020 + }, + { + "epoch": 2.843151171043293, + "grad_norm": 7.540246963500977, + "learning_rate": 9.715826827537261e-05, + "loss": 0.15037193298339843, + "step": 20030 + }, + { + "epoch": 2.844570617459191, + "grad_norm": 1.3910176753997803, + "learning_rate": 9.715684882895671e-05, + "loss": 0.10529568195343017, + "step": 20040 + }, + { + "epoch": 2.845990063875089, + "grad_norm": 10.363840103149414, + "learning_rate": 9.715542938254082e-05, + "loss": 0.13602850437164307, + "step": 20050 + }, + { + "epoch": 2.8474095102909867, + "grad_norm": 9.801745414733887, + "learning_rate": 9.71540099361249e-05, + "loss": 0.09394903779029846, + "step": 20060 + }, + { + "epoch": 2.8488289567068845, + "grad_norm": 4.273351192474365, + "learning_rate": 9.715259048970902e-05, + "loss": 0.12311586141586303, + "step": 20070 + }, + { + "epoch": 2.850248403122782, + "grad_norm": 11.77322006225586, + "learning_rate": 9.715117104329313e-05, + "loss": 0.12338924407958984, + "step": 20080 + }, + { + "epoch": 2.85166784953868, + 
"grad_norm": 2.7312419414520264, + "learning_rate": 9.714975159687723e-05, + "loss": 0.06953715085983277, + "step": 20090 + }, + { + "epoch": 2.8530872959545777, + "grad_norm": 5.562644958496094, + "learning_rate": 9.714833215046134e-05, + "loss": 0.06668174266815186, + "step": 20100 + }, + { + "epoch": 2.8545067423704755, + "grad_norm": 6.543910980224609, + "learning_rate": 9.714691270404542e-05, + "loss": 0.11938363313674927, + "step": 20110 + }, + { + "epoch": 2.8559261887863734, + "grad_norm": 1.5311610698699951, + "learning_rate": 9.714549325762953e-05, + "loss": 0.0953073263168335, + "step": 20120 + }, + { + "epoch": 2.8573456352022713, + "grad_norm": 10.13642406463623, + "learning_rate": 9.714407381121363e-05, + "loss": 0.10842293500900269, + "step": 20130 + }, + { + "epoch": 2.858765081618169, + "grad_norm": 6.405614376068115, + "learning_rate": 9.714265436479774e-05, + "loss": 0.18160440921783447, + "step": 20140 + }, + { + "epoch": 2.8601845280340665, + "grad_norm": 8.15994644165039, + "learning_rate": 9.714123491838184e-05, + "loss": 0.15880486965179444, + "step": 20150 + }, + { + "epoch": 2.8616039744499644, + "grad_norm": 9.660137176513672, + "learning_rate": 9.713981547196593e-05, + "loss": 0.1277371048927307, + "step": 20160 + }, + { + "epoch": 2.8630234208658623, + "grad_norm": 13.830092430114746, + "learning_rate": 9.713839602555004e-05, + "loss": 0.12971055507659912, + "step": 20170 + }, + { + "epoch": 2.86444286728176, + "grad_norm": 3.822737455368042, + "learning_rate": 9.713697657913414e-05, + "loss": 0.16139203310012817, + "step": 20180 + }, + { + "epoch": 2.865862313697658, + "grad_norm": 2.0092313289642334, + "learning_rate": 9.713555713271825e-05, + "loss": 0.06620528101921082, + "step": 20190 + }, + { + "epoch": 2.867281760113556, + "grad_norm": 3.479095458984375, + "learning_rate": 9.713413768630235e-05, + "loss": 0.10068619251251221, + "step": 20200 + }, + { + "epoch": 2.8687012065294537, + "grad_norm": 1.8399436473846436, + 
"learning_rate": 9.713271823988645e-05, + "loss": 0.07809083461761475, + "step": 20210 + }, + { + "epoch": 2.870120652945351, + "grad_norm": 1.2535580396652222, + "learning_rate": 9.713129879347055e-05, + "loss": 0.10528775453567504, + "step": 20220 + }, + { + "epoch": 2.871540099361249, + "grad_norm": 5.34690523147583, + "learning_rate": 9.712987934705466e-05, + "loss": 0.09714440107345582, + "step": 20230 + }, + { + "epoch": 2.872959545777147, + "grad_norm": 3.72548770904541, + "learning_rate": 9.712845990063875e-05, + "loss": 0.05409139394760132, + "step": 20240 + }, + { + "epoch": 2.8743789921930447, + "grad_norm": 4.422288417816162, + "learning_rate": 9.712704045422286e-05, + "loss": 0.0929717779159546, + "step": 20250 + }, + { + "epoch": 2.8757984386089426, + "grad_norm": 1.4169726371765137, + "learning_rate": 9.712562100780696e-05, + "loss": 0.04481082260608673, + "step": 20260 + }, + { + "epoch": 2.8772178850248404, + "grad_norm": 3.0234224796295166, + "learning_rate": 9.712420156139106e-05, + "loss": 0.14665982723236085, + "step": 20270 + }, + { + "epoch": 2.8786373314407383, + "grad_norm": 0.8741635680198669, + "learning_rate": 9.712278211497517e-05, + "loss": 0.057705503702163694, + "step": 20280 + }, + { + "epoch": 2.8800567778566357, + "grad_norm": 1.1250085830688477, + "learning_rate": 9.712136266855927e-05, + "loss": 0.1067537546157837, + "step": 20290 + }, + { + "epoch": 2.8814762242725336, + "grad_norm": 10.388190269470215, + "learning_rate": 9.711994322214338e-05, + "loss": 0.10462450981140137, + "step": 20300 + }, + { + "epoch": 2.8828956706884314, + "grad_norm": 3.0416109561920166, + "learning_rate": 9.711852377572746e-05, + "loss": 0.10544465780258179, + "step": 20310 + }, + { + "epoch": 2.8843151171043293, + "grad_norm": 5.297311782836914, + "learning_rate": 9.711710432931157e-05, + "loss": 0.06729884147644043, + "step": 20320 + }, + { + "epoch": 2.885734563520227, + "grad_norm": 2.5105323791503906, + "learning_rate": 9.711568488289567e-05, + 
"loss": 0.08199673295021057, + "step": 20330 + }, + { + "epoch": 2.887154009936125, + "grad_norm": 2.514965057373047, + "learning_rate": 9.711426543647978e-05, + "loss": 0.07696297764778137, + "step": 20340 + }, + { + "epoch": 2.888573456352023, + "grad_norm": 2.9623782634735107, + "learning_rate": 9.711284599006388e-05, + "loss": 0.06418653130531311, + "step": 20350 + }, + { + "epoch": 2.8899929027679203, + "grad_norm": 7.9242777824401855, + "learning_rate": 9.711142654364798e-05, + "loss": 0.10036368370056152, + "step": 20360 + }, + { + "epoch": 2.891412349183818, + "grad_norm": 0.3050519526004791, + "learning_rate": 9.711000709723209e-05, + "loss": 0.09710363149642945, + "step": 20370 + }, + { + "epoch": 2.892831795599716, + "grad_norm": 4.167988300323486, + "learning_rate": 9.710858765081618e-05, + "loss": 0.09933966994285584, + "step": 20380 + }, + { + "epoch": 2.894251242015614, + "grad_norm": 4.994990348815918, + "learning_rate": 9.71071682044003e-05, + "loss": 0.14826220273971558, + "step": 20390 + }, + { + "epoch": 2.8956706884315118, + "grad_norm": 5.276573657989502, + "learning_rate": 9.710574875798439e-05, + "loss": 0.06008061766624451, + "step": 20400 + }, + { + "epoch": 2.8970901348474096, + "grad_norm": 1.4481778144836426, + "learning_rate": 9.71043293115685e-05, + "loss": 0.07454321980476379, + "step": 20410 + }, + { + "epoch": 2.8985095812633075, + "grad_norm": 3.215022087097168, + "learning_rate": 9.710290986515259e-05, + "loss": 0.11371394395828247, + "step": 20420 + }, + { + "epoch": 2.899929027679205, + "grad_norm": 7.932292461395264, + "learning_rate": 9.71014904187367e-05, + "loss": 0.1307593822479248, + "step": 20430 + }, + { + "epoch": 2.9013484740951028, + "grad_norm": 3.419353723526001, + "learning_rate": 9.71000709723208e-05, + "loss": 0.11492658853530884, + "step": 20440 + }, + { + "epoch": 2.9027679205110006, + "grad_norm": 1.6420551538467407, + "learning_rate": 9.70986515259049e-05, + "loss": 0.09474117159843445, + "step": 20450 + }, 
+ { + "epoch": 2.9041873669268985, + "grad_norm": 1.5180848836898804, + "learning_rate": 9.7097232079489e-05, + "loss": 0.16010476350784303, + "step": 20460 + }, + { + "epoch": 2.9056068133427964, + "grad_norm": 7.387273788452148, + "learning_rate": 9.70958126330731e-05, + "loss": 0.12979986667633056, + "step": 20470 + }, + { + "epoch": 2.907026259758694, + "grad_norm": 2.0460073947906494, + "learning_rate": 9.709439318665721e-05, + "loss": 0.09822458028793335, + "step": 20480 + }, + { + "epoch": 2.908445706174592, + "grad_norm": 8.7783784866333, + "learning_rate": 9.709297374024131e-05, + "loss": 0.10728850364685058, + "step": 20490 + }, + { + "epoch": 2.9098651525904895, + "grad_norm": 10.74223804473877, + "learning_rate": 9.709155429382542e-05, + "loss": 0.14357963800430298, + "step": 20500 + }, + { + "epoch": 2.9098651525904895, + "eval_accuracy": 0.9639473516881796, + "eval_loss": 0.10387223958969116, + "eval_runtime": 32.6025, + "eval_samples_per_second": 482.386, + "eval_steps_per_second": 15.091, + "step": 20500 + }, + { + "epoch": 2.9112845990063874, + "grad_norm": 6.561285495758057, + "learning_rate": 9.709013484740952e-05, + "loss": 0.09862427711486817, + "step": 20510 + }, + { + "epoch": 2.9127040454222852, + "grad_norm": 9.80976390838623, + "learning_rate": 9.708871540099362e-05, + "loss": 0.10329036712646485, + "step": 20520 + }, + { + "epoch": 2.914123491838183, + "grad_norm": 3.2249553203582764, + "learning_rate": 9.708729595457771e-05, + "loss": 0.09398200511932372, + "step": 20530 + }, + { + "epoch": 2.915542938254081, + "grad_norm": 3.0429465770721436, + "learning_rate": 9.708587650816182e-05, + "loss": 0.08952078223228455, + "step": 20540 + }, + { + "epoch": 2.916962384669979, + "grad_norm": 2.384573459625244, + "learning_rate": 9.708445706174592e-05, + "loss": 0.06835871934890747, + "step": 20550 + }, + { + "epoch": 2.9183818310858767, + "grad_norm": 2.1423826217651367, + "learning_rate": 9.708303761533003e-05, + "loss": 0.07619114518165589, + 
"step": 20560 + }, + { + "epoch": 2.919801277501774, + "grad_norm": 3.932051181793213, + "learning_rate": 9.708161816891413e-05, + "loss": 0.11714667081832886, + "step": 20570 + }, + { + "epoch": 2.921220723917672, + "grad_norm": 5.277032852172852, + "learning_rate": 9.708019872249823e-05, + "loss": 0.07001240253448486, + "step": 20580 + }, + { + "epoch": 2.92264017033357, + "grad_norm": 5.1413798332214355, + "learning_rate": 9.707877927608234e-05, + "loss": 0.09098179340362549, + "step": 20590 + }, + { + "epoch": 2.9240596167494677, + "grad_norm": 5.64100456237793, + "learning_rate": 9.707735982966644e-05, + "loss": 0.10666381120681763, + "step": 20600 + }, + { + "epoch": 2.9254790631653655, + "grad_norm": 9.501540184020996, + "learning_rate": 9.707594038325055e-05, + "loss": 0.05949283242225647, + "step": 20610 + }, + { + "epoch": 2.9268985095812634, + "grad_norm": 6.489498138427734, + "learning_rate": 9.707452093683463e-05, + "loss": 0.083852881193161, + "step": 20620 + }, + { + "epoch": 2.9283179559971613, + "grad_norm": 1.9999171495437622, + "learning_rate": 9.707310149041874e-05, + "loss": 0.08824072480201721, + "step": 20630 + }, + { + "epoch": 2.9297374024130587, + "grad_norm": 10.467041015625, + "learning_rate": 9.707168204400284e-05, + "loss": 0.22370665073394774, + "step": 20640 + }, + { + "epoch": 2.9311568488289566, + "grad_norm": 3.191193103790283, + "learning_rate": 9.707026259758695e-05, + "loss": 0.08672508597373962, + "step": 20650 + }, + { + "epoch": 2.9325762952448544, + "grad_norm": 8.910825729370117, + "learning_rate": 9.706884315117105e-05, + "loss": 0.09984519481658935, + "step": 20660 + }, + { + "epoch": 2.9339957416607523, + "grad_norm": 5.282776832580566, + "learning_rate": 9.706742370475514e-05, + "loss": 0.12132351398468018, + "step": 20670 + }, + { + "epoch": 2.93541518807665, + "grad_norm": 6.024061679840088, + "learning_rate": 9.706600425833925e-05, + "loss": 0.0702341616153717, + "step": 20680 + }, + { + "epoch": 2.936834634492548, 
+ "grad_norm": 3.016757011413574, + "learning_rate": 9.706458481192335e-05, + "loss": 0.18172093629837036, + "step": 20690 + }, + { + "epoch": 2.938254080908446, + "grad_norm": 6.451714515686035, + "learning_rate": 9.706316536550746e-05, + "loss": 0.16414980888366698, + "step": 20700 + }, + { + "epoch": 2.9396735273243433, + "grad_norm": 3.6543655395507812, + "learning_rate": 9.706174591909156e-05, + "loss": 0.06588509678840637, + "step": 20710 + }, + { + "epoch": 2.941092973740241, + "grad_norm": 2.2044341564178467, + "learning_rate": 9.706032647267566e-05, + "loss": 0.10182955265045165, + "step": 20720 + }, + { + "epoch": 2.942512420156139, + "grad_norm": 4.035127639770508, + "learning_rate": 9.705890702625976e-05, + "loss": 0.08563597202301025, + "step": 20730 + }, + { + "epoch": 2.943931866572037, + "grad_norm": 0.5155683159828186, + "learning_rate": 9.705748757984387e-05, + "loss": 0.061300069093704224, + "step": 20740 + }, + { + "epoch": 2.9453513129879347, + "grad_norm": 5.438033103942871, + "learning_rate": 9.705606813342796e-05, + "loss": 0.10039635896682739, + "step": 20750 + }, + { + "epoch": 2.9467707594038326, + "grad_norm": 4.031142711639404, + "learning_rate": 9.705464868701207e-05, + "loss": 0.13106780052185057, + "step": 20760 + }, + { + "epoch": 2.9481902058197305, + "grad_norm": 1.6434075832366943, + "learning_rate": 9.705322924059617e-05, + "loss": 0.06390889883041381, + "step": 20770 + }, + { + "epoch": 2.949609652235628, + "grad_norm": 0.5606821775436401, + "learning_rate": 9.705180979418027e-05, + "loss": 0.1445988893508911, + "step": 20780 + }, + { + "epoch": 2.9510290986515257, + "grad_norm": 8.509517669677734, + "learning_rate": 9.705039034776438e-05, + "loss": 0.11696761846542358, + "step": 20790 + }, + { + "epoch": 2.9524485450674236, + "grad_norm": 1.219256043434143, + "learning_rate": 9.704897090134848e-05, + "loss": 0.1131407618522644, + "step": 20800 + }, + { + "epoch": 2.9538679914833215, + "grad_norm": 4.903664588928223, + 
"learning_rate": 9.704755145493259e-05, + "loss": 0.12911027669906616, + "step": 20810 + }, + { + "epoch": 2.9552874378992193, + "grad_norm": 2.6238746643066406, + "learning_rate": 9.704613200851669e-05, + "loss": 0.04189004898071289, + "step": 20820 + }, + { + "epoch": 2.956706884315117, + "grad_norm": 4.9772443771362305, + "learning_rate": 9.704471256210078e-05, + "loss": 0.1641558289527893, + "step": 20830 + }, + { + "epoch": 2.958126330731015, + "grad_norm": 3.766991376876831, + "learning_rate": 9.704329311568488e-05, + "loss": 0.15985740423202516, + "step": 20840 + }, + { + "epoch": 2.9595457771469125, + "grad_norm": 0.883904218673706, + "learning_rate": 9.704187366926899e-05, + "loss": 0.04713291525840759, + "step": 20850 + }, + { + "epoch": 2.9609652235628108, + "grad_norm": 2.7351174354553223, + "learning_rate": 9.704045422285309e-05, + "loss": 0.08329285383224487, + "step": 20860 + }, + { + "epoch": 2.962384669978708, + "grad_norm": 7.424506187438965, + "learning_rate": 9.70390347764372e-05, + "loss": 0.1191827893257141, + "step": 20870 + }, + { + "epoch": 2.963804116394606, + "grad_norm": 2.405928134918213, + "learning_rate": 9.70376153300213e-05, + "loss": 0.03490549027919769, + "step": 20880 + }, + { + "epoch": 2.965223562810504, + "grad_norm": 2.498183488845825, + "learning_rate": 9.70361958836054e-05, + "loss": 0.043746381998062134, + "step": 20890 + }, + { + "epoch": 2.966643009226402, + "grad_norm": 5.296067237854004, + "learning_rate": 9.70347764371895e-05, + "loss": 0.12182191610336304, + "step": 20900 + }, + { + "epoch": 2.9680624556422996, + "grad_norm": 5.240711688995361, + "learning_rate": 9.70333569907736e-05, + "loss": 0.05737144351005554, + "step": 20910 + }, + { + "epoch": 2.969481902058197, + "grad_norm": 9.032751083374023, + "learning_rate": 9.703193754435771e-05, + "loss": 0.10139278173446656, + "step": 20920 + }, + { + "epoch": 2.9709013484740954, + "grad_norm": 8.68384838104248, + "learning_rate": 9.70305180979418e-05, + "loss": 
0.10488021373748779, + "step": 20930 + }, + { + "epoch": 2.972320794889993, + "grad_norm": 11.946162223815918, + "learning_rate": 9.702909865152591e-05, + "loss": 0.08643736839294433, + "step": 20940 + }, + { + "epoch": 2.9737402413058907, + "grad_norm": 7.999373435974121, + "learning_rate": 9.702767920511e-05, + "loss": 0.10784640312194824, + "step": 20950 + }, + { + "epoch": 2.9751596877217885, + "grad_norm": 10.503974914550781, + "learning_rate": 9.702625975869412e-05, + "loss": 0.14789512157440185, + "step": 20960 + }, + { + "epoch": 2.9765791341376864, + "grad_norm": 2.7038733959198, + "learning_rate": 9.702484031227821e-05, + "loss": 0.11931388378143311, + "step": 20970 + }, + { + "epoch": 2.9779985805535842, + "grad_norm": 4.435423374176025, + "learning_rate": 9.702342086586231e-05, + "loss": 0.1014961838722229, + "step": 20980 + }, + { + "epoch": 2.9794180269694817, + "grad_norm": 9.037029266357422, + "learning_rate": 9.702200141944642e-05, + "loss": 0.10802547931671143, + "step": 20990 + }, + { + "epoch": 2.98083747338538, + "grad_norm": 1.3593106269836426, + "learning_rate": 9.702058197303052e-05, + "loss": 0.06956174969673157, + "step": 21000 + }, + { + "epoch": 2.98083747338538, + "eval_accuracy": 0.9595599923698099, + "eval_loss": 0.12062688916921616, + "eval_runtime": 33.1203, + "eval_samples_per_second": 474.845, + "eval_steps_per_second": 14.855, + "step": 21000 + }, + { + "epoch": 2.9822569198012774, + "grad_norm": 0.6549391150474548, + "learning_rate": 9.701916252661463e-05, + "loss": 0.13815345764160156, + "step": 21010 + }, + { + "epoch": 2.9836763662171752, + "grad_norm": 6.318053722381592, + "learning_rate": 9.701774308019873e-05, + "loss": 0.126990008354187, + "step": 21020 + }, + { + "epoch": 2.985095812633073, + "grad_norm": 0.9818340539932251, + "learning_rate": 9.701632363378283e-05, + "loss": 0.1425946831703186, + "step": 21030 + }, + { + "epoch": 2.986515259048971, + "grad_norm": 7.161218643188477, + "learning_rate": 
9.701490418736692e-05, + "loss": 0.10167466402053833, + "step": 21040 + }, + { + "epoch": 2.987934705464869, + "grad_norm": 2.8544816970825195, + "learning_rate": 9.701348474095103e-05, + "loss": 0.052589023113250734, + "step": 21050 + }, + { + "epoch": 2.9893541518807663, + "grad_norm": 3.788613796234131, + "learning_rate": 9.701206529453513e-05, + "loss": 0.0730807602405548, + "step": 21060 + }, + { + "epoch": 2.9907735982966646, + "grad_norm": 3.1659812927246094, + "learning_rate": 9.701064584811924e-05, + "loss": 0.0667772889137268, + "step": 21070 + }, + { + "epoch": 2.992193044712562, + "grad_norm": 3.7923996448516846, + "learning_rate": 9.700922640170334e-05, + "loss": 0.09958038330078126, + "step": 21080 + }, + { + "epoch": 2.99361249112846, + "grad_norm": 0.3780229985713959, + "learning_rate": 9.700780695528744e-05, + "loss": 0.04535020887851715, + "step": 21090 + }, + { + "epoch": 2.9950319375443577, + "grad_norm": 6.924422264099121, + "learning_rate": 9.700638750887155e-05, + "loss": 0.07231849431991577, + "step": 21100 + }, + { + "epoch": 2.9964513839602556, + "grad_norm": 4.052742958068848, + "learning_rate": 9.700496806245565e-05, + "loss": 0.06652356386184692, + "step": 21110 + }, + { + "epoch": 2.9978708303761534, + "grad_norm": 2.4228880405426025, + "learning_rate": 9.700354861603976e-05, + "loss": 0.13166139125823975, + "step": 21120 + }, + { + "epoch": 2.999290276792051, + "grad_norm": 1.3871126174926758, + "learning_rate": 9.700212916962385e-05, + "loss": 0.0858015775680542, + "step": 21130 + }, + { + "epoch": 3.0007097232079487, + "grad_norm": 4.722600936889648, + "learning_rate": 9.700070972320795e-05, + "loss": 0.12765930891036986, + "step": 21140 + }, + { + "epoch": 3.0021291696238466, + "grad_norm": 1.4345152378082275, + "learning_rate": 9.699929027679205e-05, + "loss": 0.11781737804412842, + "step": 21150 + }, + { + "epoch": 3.0035486160397444, + "grad_norm": 4.4884352684021, + "learning_rate": 9.699787083037616e-05, + "loss": 
0.05820587873458862, + "step": 21160 + }, + { + "epoch": 3.0049680624556423, + "grad_norm": 2.4350528717041016, + "learning_rate": 9.699645138396026e-05, + "loss": 0.08642982244491577, + "step": 21170 + }, + { + "epoch": 3.00638750887154, + "grad_norm": 2.5722460746765137, + "learning_rate": 9.699503193754437e-05, + "loss": 0.13995343446731567, + "step": 21180 + }, + { + "epoch": 3.007806955287438, + "grad_norm": 8.12808895111084, + "learning_rate": 9.699361249112846e-05, + "loss": 0.10619027614593506, + "step": 21190 + }, + { + "epoch": 3.009226401703336, + "grad_norm": 9.416518211364746, + "learning_rate": 9.699219304471256e-05, + "loss": 0.10611592531204224, + "step": 21200 + }, + { + "epoch": 3.0106458481192333, + "grad_norm": 2.595517873764038, + "learning_rate": 9.699077359829667e-05, + "loss": 0.03944927752017975, + "step": 21210 + }, + { + "epoch": 3.012065294535131, + "grad_norm": 6.59434175491333, + "learning_rate": 9.698935415188077e-05, + "loss": 0.06297655701637268, + "step": 21220 + }, + { + "epoch": 3.013484740951029, + "grad_norm": 7.814486026763916, + "learning_rate": 9.698793470546488e-05, + "loss": 0.12234771251678467, + "step": 21230 + }, + { + "epoch": 3.014904187366927, + "grad_norm": 3.0475339889526367, + "learning_rate": 9.698651525904897e-05, + "loss": 0.10413910150527954, + "step": 21240 + }, + { + "epoch": 3.0163236337828248, + "grad_norm": 3.0739729404449463, + "learning_rate": 9.698509581263308e-05, + "loss": 0.048439356684684756, + "step": 21250 + }, + { + "epoch": 3.0177430801987226, + "grad_norm": 2.5247795581817627, + "learning_rate": 9.698367636621717e-05, + "loss": 0.08907513618469239, + "step": 21260 + }, + { + "epoch": 3.0191625266146205, + "grad_norm": 1.5360527038574219, + "learning_rate": 9.698225691980128e-05, + "loss": 0.08706284761428833, + "step": 21270 + }, + { + "epoch": 3.020581973030518, + "grad_norm": 2.9414641857147217, + "learning_rate": 9.698083747338538e-05, + "loss": 0.06573014259338379, + "step": 21280 + }, + { 
+ "epoch": 3.0220014194464158, + "grad_norm": 4.994847297668457, + "learning_rate": 9.697941802696948e-05, + "loss": 0.13635185956954957, + "step": 21290 + }, + { + "epoch": 3.0234208658623136, + "grad_norm": 1.8111882209777832, + "learning_rate": 9.697799858055359e-05, + "loss": 0.09840369820594788, + "step": 21300 + }, + { + "epoch": 3.0248403122782115, + "grad_norm": 1.4137115478515625, + "learning_rate": 9.697657913413769e-05, + "loss": 0.136954402923584, + "step": 21310 + }, + { + "epoch": 3.0262597586941093, + "grad_norm": 2.749936819076538, + "learning_rate": 9.697530163236339e-05, + "loss": 0.10054677724838257, + "step": 21320 + }, + { + "epoch": 3.027679205110007, + "grad_norm": 4.701079368591309, + "learning_rate": 9.697388218594748e-05, + "loss": 0.07355481386184692, + "step": 21330 + }, + { + "epoch": 3.029098651525905, + "grad_norm": 4.2811408042907715, + "learning_rate": 9.69724627395316e-05, + "loss": 0.07188469767570496, + "step": 21340 + }, + { + "epoch": 3.0305180979418025, + "grad_norm": 7.573612213134766, + "learning_rate": 9.697104329311569e-05, + "loss": 0.05111314058303833, + "step": 21350 + }, + { + "epoch": 3.0319375443577004, + "grad_norm": 3.0801517963409424, + "learning_rate": 9.696962384669979e-05, + "loss": 0.0739107072353363, + "step": 21360 + }, + { + "epoch": 3.033356990773598, + "grad_norm": 14.997776985168457, + "learning_rate": 9.696820440028389e-05, + "loss": 0.18201708793640137, + "step": 21370 + }, + { + "epoch": 3.034776437189496, + "grad_norm": 8.705801963806152, + "learning_rate": 9.6966784953868e-05, + "loss": 0.09414076805114746, + "step": 21380 + }, + { + "epoch": 3.036195883605394, + "grad_norm": 2.687983751296997, + "learning_rate": 9.69653655074521e-05, + "loss": 0.10116174221038818, + "step": 21390 + }, + { + "epoch": 3.037615330021292, + "grad_norm": 3.300055503845215, + "learning_rate": 9.69639460610362e-05, + "loss": 0.05839415788650513, + "step": 21400 + }, + { + "epoch": 3.0390347764371897, + "grad_norm": 
4.883892059326172, + "learning_rate": 9.69625266146203e-05, + "loss": 0.0997147798538208, + "step": 21410 + }, + { + "epoch": 3.040454222853087, + "grad_norm": 4.513243675231934, + "learning_rate": 9.69611071682044e-05, + "loss": 0.053650110960006714, + "step": 21420 + }, + { + "epoch": 3.041873669268985, + "grad_norm": 1.9839102029800415, + "learning_rate": 9.695968772178851e-05, + "loss": 0.1009899377822876, + "step": 21430 + }, + { + "epoch": 3.043293115684883, + "grad_norm": 3.678035259246826, + "learning_rate": 9.695826827537261e-05, + "loss": 0.09355159401893616, + "step": 21440 + }, + { + "epoch": 3.0447125621007807, + "grad_norm": 12.25532054901123, + "learning_rate": 9.695684882895672e-05, + "loss": 0.09784587025642395, + "step": 21450 + }, + { + "epoch": 3.0461320085166785, + "grad_norm": 10.08337688446045, + "learning_rate": 9.695542938254082e-05, + "loss": 0.22380528450012208, + "step": 21460 + }, + { + "epoch": 3.0475514549325764, + "grad_norm": 1.0703997611999512, + "learning_rate": 9.695400993612491e-05, + "loss": 0.03969487845897675, + "step": 21470 + }, + { + "epoch": 3.0489709013484743, + "grad_norm": 2.9388980865478516, + "learning_rate": 9.695259048970901e-05, + "loss": 0.07186501622200012, + "step": 21480 + }, + { + "epoch": 3.0503903477643717, + "grad_norm": 3.5290896892547607, + "learning_rate": 9.695117104329312e-05, + "loss": 0.07260159850120544, + "step": 21490 + }, + { + "epoch": 3.0518097941802695, + "grad_norm": 2.9938881397247314, + "learning_rate": 9.694975159687722e-05, + "loss": 0.09509387612342834, + "step": 21500 + }, + { + "epoch": 3.0518097941802695, + "eval_accuracy": 0.9589877281108921, + "eval_loss": 0.12867264449596405, + "eval_runtime": 31.3789, + "eval_samples_per_second": 501.196, + "eval_steps_per_second": 15.679, + "step": 21500 + }, + { + "epoch": 3.0532292405961674, + "grad_norm": 10.751752853393555, + "learning_rate": 9.694833215046133e-05, + "loss": 0.08080363273620605, + "step": 21510 + }, + { + "epoch": 
3.0546486870120653, + "grad_norm": 2.299959659576416, + "learning_rate": 9.694691270404543e-05, + "loss": 0.14854525327682494, + "step": 21520 + }, + { + "epoch": 3.056068133427963, + "grad_norm": 4.220566272735596, + "learning_rate": 9.694549325762953e-05, + "loss": 0.09466566443443299, + "step": 21530 + }, + { + "epoch": 3.057487579843861, + "grad_norm": 6.087703704833984, + "learning_rate": 9.694407381121364e-05, + "loss": 0.09965238571166993, + "step": 21540 + }, + { + "epoch": 3.058907026259759, + "grad_norm": 8.385695457458496, + "learning_rate": 9.694265436479773e-05, + "loss": 0.10562925338745117, + "step": 21550 + }, + { + "epoch": 3.0603264726756567, + "grad_norm": 0.5750550031661987, + "learning_rate": 9.694123491838185e-05, + "loss": 0.07159033417701721, + "step": 21560 + }, + { + "epoch": 3.061745919091554, + "grad_norm": 5.470452308654785, + "learning_rate": 9.693981547196593e-05, + "loss": 0.1067430019378662, + "step": 21570 + }, + { + "epoch": 3.063165365507452, + "grad_norm": 1.6126492023468018, + "learning_rate": 9.693839602555004e-05, + "loss": 0.07778850793838502, + "step": 21580 + }, + { + "epoch": 3.06458481192335, + "grad_norm": 8.54702377319336, + "learning_rate": 9.693697657913414e-05, + "loss": 0.051190412044525145, + "step": 21590 + }, + { + "epoch": 3.0660042583392477, + "grad_norm": 5.9458818435668945, + "learning_rate": 9.693555713271825e-05, + "loss": 0.05976734161376953, + "step": 21600 + }, + { + "epoch": 3.0674237047551456, + "grad_norm": 11.962884902954102, + "learning_rate": 9.693413768630235e-05, + "loss": 0.08366570472717286, + "step": 21610 + }, + { + "epoch": 3.0688431511710434, + "grad_norm": 5.248124122619629, + "learning_rate": 9.693271823988644e-05, + "loss": 0.06071932911872864, + "step": 21620 + }, + { + "epoch": 3.0702625975869413, + "grad_norm": 3.1197493076324463, + "learning_rate": 9.693129879347055e-05, + "loss": 0.08671906590461731, + "step": 21630 + }, + { + "epoch": 3.0716820440028387, + "grad_norm": 
6.69197940826416, + "learning_rate": 9.692987934705465e-05, + "loss": 0.0895846426486969, + "step": 21640 + }, + { + "epoch": 3.0731014904187366, + "grad_norm": 1.1883106231689453, + "learning_rate": 9.692845990063876e-05, + "loss": 0.09830948114395141, + "step": 21650 + }, + { + "epoch": 3.0745209368346345, + "grad_norm": 1.1830201148986816, + "learning_rate": 9.692704045422286e-05, + "loss": 0.09011884927749633, + "step": 21660 + }, + { + "epoch": 3.0759403832505323, + "grad_norm": 1.0241851806640625, + "learning_rate": 9.692562100780696e-05, + "loss": 0.11997926235198975, + "step": 21670 + }, + { + "epoch": 3.07735982966643, + "grad_norm": 5.068016052246094, + "learning_rate": 9.692420156139105e-05, + "loss": 0.11507253646850586, + "step": 21680 + }, + { + "epoch": 3.078779276082328, + "grad_norm": 3.562347173690796, + "learning_rate": 9.692278211497517e-05, + "loss": 0.07022674679756165, + "step": 21690 + }, + { + "epoch": 3.080198722498226, + "grad_norm": 7.2673163414001465, + "learning_rate": 9.692136266855926e-05, + "loss": 0.09197630882263183, + "step": 21700 + }, + { + "epoch": 3.0816181689141233, + "grad_norm": 2.2533631324768066, + "learning_rate": 9.691994322214337e-05, + "loss": 0.05809432864189148, + "step": 21710 + }, + { + "epoch": 3.083037615330021, + "grad_norm": 5.0073561668396, + "learning_rate": 9.691852377572747e-05, + "loss": 0.10983726978302003, + "step": 21720 + }, + { + "epoch": 3.084457061745919, + "grad_norm": 8.21857738494873, + "learning_rate": 9.691710432931157e-05, + "loss": 0.06723290681838989, + "step": 21730 + }, + { + "epoch": 3.085876508161817, + "grad_norm": 9.05629825592041, + "learning_rate": 9.691568488289568e-05, + "loss": 0.05822429656982422, + "step": 21740 + }, + { + "epoch": 3.0872959545777148, + "grad_norm": 2.9089202880859375, + "learning_rate": 9.691426543647978e-05, + "loss": 0.062278813123703, + "step": 21750 + }, + { + "epoch": 3.0887154009936126, + "grad_norm": 5.445140838623047, + "learning_rate": 
9.691284599006389e-05, + "loss": 0.07242774367332458, + "step": 21760 + }, + { + "epoch": 3.0901348474095105, + "grad_norm": 5.643183708190918, + "learning_rate": 9.691142654364798e-05, + "loss": 0.17729694843292237, + "step": 21770 + }, + { + "epoch": 3.091554293825408, + "grad_norm": 1.2977749109268188, + "learning_rate": 9.691000709723208e-05, + "loss": 0.06676494479179382, + "step": 21780 + }, + { + "epoch": 3.092973740241306, + "grad_norm": 3.805422067642212, + "learning_rate": 9.690858765081618e-05, + "loss": 0.09304124712944031, + "step": 21790 + }, + { + "epoch": 3.0943931866572036, + "grad_norm": 6.814877510070801, + "learning_rate": 9.690716820440029e-05, + "loss": 0.08317658305168152, + "step": 21800 + }, + { + "epoch": 3.0958126330731015, + "grad_norm": 6.4380388259887695, + "learning_rate": 9.690574875798439e-05, + "loss": 0.11440763473510743, + "step": 21810 + }, + { + "epoch": 3.0972320794889994, + "grad_norm": 2.2712135314941406, + "learning_rate": 9.69043293115685e-05, + "loss": 0.05781182050704956, + "step": 21820 + }, + { + "epoch": 3.0986515259048972, + "grad_norm": 2.6996850967407227, + "learning_rate": 9.69029098651526e-05, + "loss": 0.09182395935058593, + "step": 21830 + }, + { + "epoch": 3.100070972320795, + "grad_norm": 3.8571221828460693, + "learning_rate": 9.69014904187367e-05, + "loss": 0.05620205998420715, + "step": 21840 + }, + { + "epoch": 3.1014904187366925, + "grad_norm": 2.1438169479370117, + "learning_rate": 9.69000709723208e-05, + "loss": 0.11742359399795532, + "step": 21850 + }, + { + "epoch": 3.1029098651525904, + "grad_norm": 0.5870881676673889, + "learning_rate": 9.68986515259049e-05, + "loss": 0.10411131381988525, + "step": 21860 + }, + { + "epoch": 3.1043293115684882, + "grad_norm": 3.8963239192962646, + "learning_rate": 9.689723207948901e-05, + "loss": 0.073959881067276, + "step": 21870 + }, + { + "epoch": 3.105748757984386, + "grad_norm": 1.869137167930603, + "learning_rate": 9.68958126330731e-05, + "loss": 
0.09284948706626892, + "step": 21880 + }, + { + "epoch": 3.107168204400284, + "grad_norm": 7.974472522735596, + "learning_rate": 9.689439318665721e-05, + "loss": 0.08199034929275513, + "step": 21890 + }, + { + "epoch": 3.108587650816182, + "grad_norm": 5.112462520599365, + "learning_rate": 9.68929737402413e-05, + "loss": 0.04500599205493927, + "step": 21900 + }, + { + "epoch": 3.1100070972320797, + "grad_norm": 4.712485313415527, + "learning_rate": 9.689155429382542e-05, + "loss": 0.08608510494232177, + "step": 21910 + }, + { + "epoch": 3.111426543647977, + "grad_norm": 4.643701553344727, + "learning_rate": 9.689013484740951e-05, + "loss": 0.06371254920959472, + "step": 21920 + }, + { + "epoch": 3.112845990063875, + "grad_norm": 0.6126397252082825, + "learning_rate": 9.688871540099361e-05, + "loss": 0.06569015383720397, + "step": 21930 + }, + { + "epoch": 3.114265436479773, + "grad_norm": 0.9692607522010803, + "learning_rate": 9.688729595457772e-05, + "loss": 0.04018869698047638, + "step": 21940 + }, + { + "epoch": 3.1156848828956707, + "grad_norm": 1.4925132989883423, + "learning_rate": 9.688587650816182e-05, + "loss": 0.12035884857177734, + "step": 21950 + }, + { + "epoch": 3.1171043293115686, + "grad_norm": 8.849794387817383, + "learning_rate": 9.688445706174593e-05, + "loss": 0.10423930883407592, + "step": 21960 + }, + { + "epoch": 3.1185237757274664, + "grad_norm": 0.555972158908844, + "learning_rate": 9.688303761533003e-05, + "loss": 0.036292347311973575, + "step": 21970 + }, + { + "epoch": 3.1199432221433643, + "grad_norm": 1.3053301572799683, + "learning_rate": 9.688161816891412e-05, + "loss": 0.055543911457061765, + "step": 21980 + }, + { + "epoch": 3.1213626685592617, + "grad_norm": 5.318549633026123, + "learning_rate": 9.688019872249822e-05, + "loss": 0.06087319850921631, + "step": 21990 + }, + { + "epoch": 3.1227821149751596, + "grad_norm": 1.2716312408447266, + "learning_rate": 9.687877927608233e-05, + "loss": 0.05343518257141113, + "step": 22000 + }, 
+ { + "epoch": 3.1227821149751596, + "eval_accuracy": 0.9604501812170153, + "eval_loss": 0.12624655663967133, + "eval_runtime": 31.6816, + "eval_samples_per_second": 496.408, + "eval_steps_per_second": 15.53, + "step": 22000 + }, + { + "epoch": 3.1242015613910574, + "grad_norm": 2.791890859603882, + "learning_rate": 9.687735982966643e-05, + "loss": 0.16994814872741698, + "step": 22010 + }, + { + "epoch": 3.1256210078069553, + "grad_norm": 0.727378249168396, + "learning_rate": 9.687594038325054e-05, + "loss": 0.09330202341079712, + "step": 22020 + }, + { + "epoch": 3.127040454222853, + "grad_norm": 2.6088101863861084, + "learning_rate": 9.687452093683464e-05, + "loss": 0.05271919369697571, + "step": 22030 + }, + { + "epoch": 3.128459900638751, + "grad_norm": 5.061529159545898, + "learning_rate": 9.687310149041874e-05, + "loss": 0.1032175898551941, + "step": 22040 + }, + { + "epoch": 3.129879347054649, + "grad_norm": 3.324045419692993, + "learning_rate": 9.687168204400285e-05, + "loss": 0.13030195236206055, + "step": 22050 + }, + { + "epoch": 3.1312987934705463, + "grad_norm": 2.8977231979370117, + "learning_rate": 9.687026259758694e-05, + "loss": 0.04515729248523712, + "step": 22060 + }, + { + "epoch": 3.132718239886444, + "grad_norm": 13.42546272277832, + "learning_rate": 9.686884315117106e-05, + "loss": 0.16047141551971436, + "step": 22070 + }, + { + "epoch": 3.134137686302342, + "grad_norm": 8.009624481201172, + "learning_rate": 9.686742370475514e-05, + "loss": 0.07332398891448974, + "step": 22080 + }, + { + "epoch": 3.13555713271824, + "grad_norm": 1.6250791549682617, + "learning_rate": 9.686600425833925e-05, + "loss": 0.08664785027503967, + "step": 22090 + }, + { + "epoch": 3.1369765791341377, + "grad_norm": 3.961372137069702, + "learning_rate": 9.686458481192335e-05, + "loss": 0.05184776782989502, + "step": 22100 + }, + { + "epoch": 3.1383960255500356, + "grad_norm": 3.3162078857421875, + "learning_rate": 9.686316536550746e-05, + "loss": 0.14172728061676027, + 
"step": 22110 + }, + { + "epoch": 3.1398154719659335, + "grad_norm": 2.8545219898223877, + "learning_rate": 9.686174591909156e-05, + "loss": 0.12487195730209351, + "step": 22120 + }, + { + "epoch": 3.141234918381831, + "grad_norm": 5.991825580596924, + "learning_rate": 9.686032647267567e-05, + "loss": 0.09468575716018676, + "step": 22130 + }, + { + "epoch": 3.1426543647977287, + "grad_norm": 3.7277402877807617, + "learning_rate": 9.685890702625976e-05, + "loss": 0.15779935121536254, + "step": 22140 + }, + { + "epoch": 3.1440738112136266, + "grad_norm": 5.867143630981445, + "learning_rate": 9.685748757984386e-05, + "loss": 0.06446941494941712, + "step": 22150 + }, + { + "epoch": 3.1454932576295245, + "grad_norm": 0.9702675342559814, + "learning_rate": 9.685606813342797e-05, + "loss": 0.10171631574630738, + "step": 22160 + }, + { + "epoch": 3.1469127040454223, + "grad_norm": 12.031753540039062, + "learning_rate": 9.685464868701207e-05, + "loss": 0.1400713086128235, + "step": 22170 + }, + { + "epoch": 3.14833215046132, + "grad_norm": 3.781707525253296, + "learning_rate": 9.685322924059618e-05, + "loss": 0.05259775519371033, + "step": 22180 + }, + { + "epoch": 3.149751596877218, + "grad_norm": 4.4153642654418945, + "learning_rate": 9.685180979418026e-05, + "loss": 0.10050948858261108, + "step": 22190 + }, + { + "epoch": 3.1511710432931155, + "grad_norm": 2.492379665374756, + "learning_rate": 9.685039034776438e-05, + "loss": 0.13373640775680543, + "step": 22200 + }, + { + "epoch": 3.1525904897090133, + "grad_norm": 8.212589263916016, + "learning_rate": 9.684897090134847e-05, + "loss": 0.0804680585861206, + "step": 22210 + }, + { + "epoch": 3.154009936124911, + "grad_norm": 7.918879508972168, + "learning_rate": 9.684755145493258e-05, + "loss": 0.04239166975021362, + "step": 22220 + }, + { + "epoch": 3.155429382540809, + "grad_norm": 0.38615530729293823, + "learning_rate": 9.68461320085167e-05, + "loss": 0.07814024686813355, + "step": 22230 + }, + { + "epoch": 
3.156848828956707, + "grad_norm": 6.945682048797607, + "learning_rate": 9.684471256210078e-05, + "loss": 0.11140685081481934, + "step": 22240 + }, + { + "epoch": 3.158268275372605, + "grad_norm": 5.574148654937744, + "learning_rate": 9.684329311568489e-05, + "loss": 0.12524588108062745, + "step": 22250 + }, + { + "epoch": 3.1596877217885027, + "grad_norm": 2.4712400436401367, + "learning_rate": 9.684187366926899e-05, + "loss": 0.06859158277511597, + "step": 22260 + }, + { + "epoch": 3.1611071682044, + "grad_norm": 11.472119331359863, + "learning_rate": 9.68404542228531e-05, + "loss": 0.07999058961868286, + "step": 22270 + }, + { + "epoch": 3.162526614620298, + "grad_norm": 0.743500828742981, + "learning_rate": 9.68390347764372e-05, + "loss": 0.05272719860076904, + "step": 22280 + }, + { + "epoch": 3.163946061036196, + "grad_norm": 3.228672742843628, + "learning_rate": 9.683761533002129e-05, + "loss": 0.09461968541145324, + "step": 22290 + }, + { + "epoch": 3.1653655074520937, + "grad_norm": 9.705907821655273, + "learning_rate": 9.683619588360539e-05, + "loss": 0.08296184539794922, + "step": 22300 + }, + { + "epoch": 3.1667849538679915, + "grad_norm": 5.514443397521973, + "learning_rate": 9.68347764371895e-05, + "loss": 0.08486506342887878, + "step": 22310 + }, + { + "epoch": 3.1682044002838894, + "grad_norm": 10.679105758666992, + "learning_rate": 9.683335699077361e-05, + "loss": 0.1270732879638672, + "step": 22320 + }, + { + "epoch": 3.1696238466997873, + "grad_norm": 6.348006725311279, + "learning_rate": 9.683193754435771e-05, + "loss": 0.09326770305633544, + "step": 22330 + }, + { + "epoch": 3.1710432931156847, + "grad_norm": 0.7028082609176636, + "learning_rate": 9.68305180979418e-05, + "loss": 0.057895565032958986, + "step": 22340 + }, + { + "epoch": 3.1724627395315825, + "grad_norm": 2.103309392929077, + "learning_rate": 9.68290986515259e-05, + "loss": 0.08313475251197815, + "step": 22350 + }, + { + "epoch": 3.1738821859474804, + "grad_norm": 
1.7693034410476685, + "learning_rate": 9.682767920511001e-05, + "loss": 0.07178552150726318, + "step": 22360 + }, + { + "epoch": 3.1753016323633783, + "grad_norm": 1.420407772064209, + "learning_rate": 9.682625975869411e-05, + "loss": 0.1434171199798584, + "step": 22370 + }, + { + "epoch": 3.176721078779276, + "grad_norm": 2.847599744796753, + "learning_rate": 9.682484031227822e-05, + "loss": 0.06267567276954651, + "step": 22380 + }, + { + "epoch": 3.178140525195174, + "grad_norm": 2.813729763031006, + "learning_rate": 9.68234208658623e-05, + "loss": 0.07424157261848449, + "step": 22390 + }, + { + "epoch": 3.179559971611072, + "grad_norm": 7.473203182220459, + "learning_rate": 9.682200141944642e-05, + "loss": 0.11200079917907715, + "step": 22400 + }, + { + "epoch": 3.1809794180269693, + "grad_norm": 6.4801177978515625, + "learning_rate": 9.682058197303053e-05, + "loss": 0.13543713092803955, + "step": 22410 + }, + { + "epoch": 3.182398864442867, + "grad_norm": 3.577303409576416, + "learning_rate": 9.681916252661463e-05, + "loss": 0.11488020420074463, + "step": 22420 + }, + { + "epoch": 3.183818310858765, + "grad_norm": 4.844555377960205, + "learning_rate": 9.681774308019874e-05, + "loss": 0.03927421867847443, + "step": 22430 + }, + { + "epoch": 3.185237757274663, + "grad_norm": 1.6158503293991089, + "learning_rate": 9.681632363378282e-05, + "loss": 0.09847801327705383, + "step": 22440 + }, + { + "epoch": 3.1866572036905607, + "grad_norm": 12.733912467956543, + "learning_rate": 9.681490418736693e-05, + "loss": 0.08998562097549438, + "step": 22450 + }, + { + "epoch": 3.1880766501064586, + "grad_norm": 0.760240912437439, + "learning_rate": 9.681348474095103e-05, + "loss": 0.07409574389457703, + "step": 22460 + }, + { + "epoch": 3.1894960965223564, + "grad_norm": 2.920081377029419, + "learning_rate": 9.681206529453514e-05, + "loss": 0.11183276176452636, + "step": 22470 + }, + { + "epoch": 3.190915542938254, + "grad_norm": 4.768205165863037, + "learning_rate": 
9.681064584811924e-05, + "loss": 0.07697643041610717, + "step": 22480 + }, + { + "epoch": 3.1923349893541517, + "grad_norm": 3.8446145057678223, + "learning_rate": 9.680922640170335e-05, + "loss": 0.068821781873703, + "step": 22490 + }, + { + "epoch": 3.1937544357700496, + "grad_norm": 8.481558799743652, + "learning_rate": 9.680780695528745e-05, + "loss": 0.09039323329925537, + "step": 22500 + }, + { + "epoch": 3.1937544357700496, + "eval_accuracy": 0.954791123545495, + "eval_loss": 0.1401221603155136, + "eval_runtime": 31.488, + "eval_samples_per_second": 499.46, + "eval_steps_per_second": 15.625, + "step": 22500 + }, + { + "epoch": 3.1951738821859474, + "grad_norm": 5.633203983306885, + "learning_rate": 9.680638750887154e-05, + "loss": 0.07210381031036377, + "step": 22510 + }, + { + "epoch": 3.1965933286018453, + "grad_norm": 1.863991379737854, + "learning_rate": 9.680496806245565e-05, + "loss": 0.0704656958580017, + "step": 22520 + }, + { + "epoch": 3.198012775017743, + "grad_norm": 0.9419695734977722, + "learning_rate": 9.680354861603975e-05, + "loss": 0.08578440546989441, + "step": 22530 + }, + { + "epoch": 3.199432221433641, + "grad_norm": 9.0354642868042, + "learning_rate": 9.680212916962386e-05, + "loss": 0.10872071981430054, + "step": 22540 + }, + { + "epoch": 3.2008516678495385, + "grad_norm": 3.955871820449829, + "learning_rate": 9.680070972320795e-05, + "loss": 0.05301453471183777, + "step": 22550 + }, + { + "epoch": 3.2022711142654363, + "grad_norm": 9.719240188598633, + "learning_rate": 9.679929027679206e-05, + "loss": 0.1132009506225586, + "step": 22560 + }, + { + "epoch": 3.203690560681334, + "grad_norm": 8.175822257995605, + "learning_rate": 9.679787083037615e-05, + "loss": 0.03667646646499634, + "step": 22570 + }, + { + "epoch": 3.205110007097232, + "grad_norm": 1.2011351585388184, + "learning_rate": 9.679645138396027e-05, + "loss": 0.06343533992767333, + "step": 22580 + }, + { + "epoch": 3.20652945351313, + "grad_norm": 2.916196823120117, + 
"learning_rate": 9.679503193754436e-05, + "loss": 0.054550164937973024, + "step": 22590 + }, + { + "epoch": 3.2079488999290278, + "grad_norm": 11.839608192443848, + "learning_rate": 9.679361249112846e-05, + "loss": 0.11105455160140991, + "step": 22600 + }, + { + "epoch": 3.2093683463449256, + "grad_norm": 5.120648384094238, + "learning_rate": 9.679219304471257e-05, + "loss": 0.04551963210105896, + "step": 22610 + }, + { + "epoch": 3.210787792760823, + "grad_norm": 2.64894437789917, + "learning_rate": 9.679077359829667e-05, + "loss": 0.07367442846298218, + "step": 22620 + }, + { + "epoch": 3.212207239176721, + "grad_norm": 7.870187759399414, + "learning_rate": 9.678935415188078e-05, + "loss": 0.12482872009277343, + "step": 22630 + }, + { + "epoch": 3.2136266855926188, + "grad_norm": 1.49652898311615, + "learning_rate": 9.678793470546488e-05, + "loss": 0.11122183799743653, + "step": 22640 + }, + { + "epoch": 3.2150461320085166, + "grad_norm": 2.1385059356689453, + "learning_rate": 9.678651525904897e-05, + "loss": 0.08030745387077332, + "step": 22650 + }, + { + "epoch": 3.2164655784244145, + "grad_norm": 5.634016036987305, + "learning_rate": 9.678509581263307e-05, + "loss": 0.135706627368927, + "step": 22660 + }, + { + "epoch": 3.2178850248403124, + "grad_norm": 7.30700159072876, + "learning_rate": 9.678367636621718e-05, + "loss": 0.09824522137641907, + "step": 22670 + }, + { + "epoch": 3.21930447125621, + "grad_norm": 3.9598324298858643, + "learning_rate": 9.678225691980128e-05, + "loss": 0.0592613160610199, + "step": 22680 + }, + { + "epoch": 3.220723917672108, + "grad_norm": 3.0672085285186768, + "learning_rate": 9.678083747338539e-05, + "loss": 0.10512404441833496, + "step": 22690 + }, + { + "epoch": 3.2221433640880055, + "grad_norm": 3.729863405227661, + "learning_rate": 9.677941802696949e-05, + "loss": 0.08016419410705566, + "step": 22700 + }, + { + "epoch": 3.2235628105039034, + "grad_norm": 2.7525126934051514, + "learning_rate": 9.677799858055359e-05, + 
"loss": 0.062538743019104, + "step": 22710 + }, + { + "epoch": 3.2249822569198012, + "grad_norm": 2.103010892868042, + "learning_rate": 9.67765791341377e-05, + "loss": 0.07154433131217956, + "step": 22720 + }, + { + "epoch": 3.226401703335699, + "grad_norm": 1.3044795989990234, + "learning_rate": 9.67751596877218e-05, + "loss": 0.04868173897266388, + "step": 22730 + }, + { + "epoch": 3.227821149751597, + "grad_norm": 0.34033793210983276, + "learning_rate": 9.67737402413059e-05, + "loss": 0.06057687401771546, + "step": 22740 + }, + { + "epoch": 3.229240596167495, + "grad_norm": 14.895809173583984, + "learning_rate": 9.677232079488999e-05, + "loss": 0.14414306879043579, + "step": 22750 + }, + { + "epoch": 3.2306600425833927, + "grad_norm": 2.03631329536438, + "learning_rate": 9.67709013484741e-05, + "loss": 0.03532655239105224, + "step": 22760 + }, + { + "epoch": 3.23207948899929, + "grad_norm": 1.9289063215255737, + "learning_rate": 9.67694819020582e-05, + "loss": 0.04410083889961243, + "step": 22770 + }, + { + "epoch": 3.233498935415188, + "grad_norm": 8.339526176452637, + "learning_rate": 9.676806245564231e-05, + "loss": 0.07176212072372437, + "step": 22780 + }, + { + "epoch": 3.234918381831086, + "grad_norm": 6.541379928588867, + "learning_rate": 9.67666430092264e-05, + "loss": 0.08053820133209229, + "step": 22790 + }, + { + "epoch": 3.2363378282469837, + "grad_norm": 3.6586859226226807, + "learning_rate": 9.67652235628105e-05, + "loss": 0.04074668884277344, + "step": 22800 + }, + { + "epoch": 3.2377572746628815, + "grad_norm": 0.39181602001190186, + "learning_rate": 9.676380411639461e-05, + "loss": 0.06584768891334533, + "step": 22810 + }, + { + "epoch": 3.2391767210787794, + "grad_norm": 4.53519868850708, + "learning_rate": 9.676238466997871e-05, + "loss": 0.10924329757690429, + "step": 22820 + }, + { + "epoch": 3.2405961674946773, + "grad_norm": 5.562971591949463, + "learning_rate": 9.676096522356282e-05, + "loss": 0.12216780185699463, + "step": 22830 + }, + { 
+ "epoch": 3.2420156139105747, + "grad_norm": 9.106098175048828, + "learning_rate": 9.675954577714692e-05, + "loss": 0.09589399695396424, + "step": 22840 + }, + { + "epoch": 3.2434350603264726, + "grad_norm": 8.574522972106934, + "learning_rate": 9.675812633073103e-05, + "loss": 0.0823745608329773, + "step": 22850 + }, + { + "epoch": 3.2448545067423704, + "grad_norm": 8.706705093383789, + "learning_rate": 9.675670688431511e-05, + "loss": 0.14104554653167725, + "step": 22860 + }, + { + "epoch": 3.2462739531582683, + "grad_norm": 8.810419082641602, + "learning_rate": 9.675528743789922e-05, + "loss": 0.05990390777587891, + "step": 22870 + }, + { + "epoch": 3.247693399574166, + "grad_norm": 4.165992736816406, + "learning_rate": 9.675386799148332e-05, + "loss": 0.0668636441230774, + "step": 22880 + }, + { + "epoch": 3.249112845990064, + "grad_norm": 9.099569320678711, + "learning_rate": 9.675244854506743e-05, + "loss": 0.06936246156692505, + "step": 22890 + }, + { + "epoch": 3.250532292405962, + "grad_norm": 4.4353132247924805, + "learning_rate": 9.675102909865153e-05, + "loss": 0.06273015737533569, + "step": 22900 + }, + { + "epoch": 3.2519517388218593, + "grad_norm": 1.2650339603424072, + "learning_rate": 9.674960965223563e-05, + "loss": 0.06168818473815918, + "step": 22910 + }, + { + "epoch": 3.253371185237757, + "grad_norm": 4.567782402038574, + "learning_rate": 9.674819020581974e-05, + "loss": 0.10136575698852539, + "step": 22920 + }, + { + "epoch": 3.254790631653655, + "grad_norm": 6.448585510253906, + "learning_rate": 9.674677075940384e-05, + "loss": 0.07393231987953186, + "step": 22930 + }, + { + "epoch": 3.256210078069553, + "grad_norm": 10.017446517944336, + "learning_rate": 9.674535131298795e-05, + "loss": 0.10242644548416138, + "step": 22940 + }, + { + "epoch": 3.2576295244854507, + "grad_norm": 3.191063404083252, + "learning_rate": 9.674393186657204e-05, + "loss": 0.047987133264541626, + "step": 22950 + }, + { + "epoch": 3.2590489709013486, + "grad_norm": 
3.556180477142334, + "learning_rate": 9.674251242015614e-05, + "loss": 0.047191986441612245, + "step": 22960 + }, + { + "epoch": 3.2604684173172465, + "grad_norm": 1.7208983898162842, + "learning_rate": 9.674109297374024e-05, + "loss": 0.08717820644378663, + "step": 22970 + }, + { + "epoch": 3.2618878637331443, + "grad_norm": 5.613543510437012, + "learning_rate": 9.673967352732435e-05, + "loss": 0.11286189556121826, + "step": 22980 + }, + { + "epoch": 3.2633073101490417, + "grad_norm": 5.163478374481201, + "learning_rate": 9.673825408090845e-05, + "loss": 0.11744798421859741, + "step": 22990 + }, + { + "epoch": 3.2647267565649396, + "grad_norm": 3.8311023712158203, + "learning_rate": 9.673683463449256e-05, + "loss": 0.0839583694934845, + "step": 23000 + }, + { + "epoch": 3.2647267565649396, + "eval_accuracy": 0.9688433903478095, + "eval_loss": 0.09206999838352203, + "eval_runtime": 32.6805, + "eval_samples_per_second": 481.235, + "eval_steps_per_second": 15.055, + "step": 23000 + }, + { + "epoch": 3.2661462029808375, + "grad_norm": 6.961423873901367, + "learning_rate": 9.673541518807666e-05, + "loss": 0.060645246505737306, + "step": 23010 + }, + { + "epoch": 3.2675656493967353, + "grad_norm": 4.491827011108398, + "learning_rate": 9.673399574166075e-05, + "loss": 0.060946452617645266, + "step": 23020 + }, + { + "epoch": 3.268985095812633, + "grad_norm": 8.529021263122559, + "learning_rate": 9.673257629524486e-05, + "loss": 0.0623835563659668, + "step": 23030 + }, + { + "epoch": 3.270404542228531, + "grad_norm": 7.560174942016602, + "learning_rate": 9.673115684882896e-05, + "loss": 0.05246782898902893, + "step": 23040 + }, + { + "epoch": 3.271823988644429, + "grad_norm": 5.852350234985352, + "learning_rate": 9.672973740241307e-05, + "loss": 0.1177408218383789, + "step": 23050 + }, + { + "epoch": 3.2732434350603263, + "grad_norm": 2.9898064136505127, + "learning_rate": 9.672831795599716e-05, + "loss": 0.10251556634902954, + "step": 23060 + }, + { + "epoch": 
3.274662881476224, + "grad_norm": 0.7350359559059143, + "learning_rate": 9.672689850958127e-05, + "loss": 0.08793265223503113, + "step": 23070 + }, + { + "epoch": 3.276082327892122, + "grad_norm": 7.976613998413086, + "learning_rate": 9.672547906316536e-05, + "loss": 0.11746323108673096, + "step": 23080 + }, + { + "epoch": 3.27750177430802, + "grad_norm": 5.30941915512085, + "learning_rate": 9.672405961674948e-05, + "loss": 0.1818032145500183, + "step": 23090 + }, + { + "epoch": 3.278921220723918, + "grad_norm": 4.999229907989502, + "learning_rate": 9.672264017033357e-05, + "loss": 0.05894123911857605, + "step": 23100 + }, + { + "epoch": 3.2803406671398156, + "grad_norm": 5.794082164764404, + "learning_rate": 9.672122072391767e-05, + "loss": 0.081751549243927, + "step": 23110 + }, + { + "epoch": 3.2817601135557135, + "grad_norm": 2.565143346786499, + "learning_rate": 9.671980127750178e-05, + "loss": 0.07524069547653198, + "step": 23120 + }, + { + "epoch": 3.283179559971611, + "grad_norm": 4.894937038421631, + "learning_rate": 9.671838183108588e-05, + "loss": 0.12828075885772705, + "step": 23130 + }, + { + "epoch": 3.284599006387509, + "grad_norm": 6.212746620178223, + "learning_rate": 9.671696238466999e-05, + "loss": 0.1400521755218506, + "step": 23140 + }, + { + "epoch": 3.2860184528034067, + "grad_norm": 4.2761921882629395, + "learning_rate": 9.671554293825409e-05, + "loss": 0.09644685983657837, + "step": 23150 + }, + { + "epoch": 3.2874378992193045, + "grad_norm": 16.000354766845703, + "learning_rate": 9.671412349183818e-05, + "loss": 0.11152185201644897, + "step": 23160 + }, + { + "epoch": 3.2888573456352024, + "grad_norm": 6.135869026184082, + "learning_rate": 9.671270404542228e-05, + "loss": 0.0771405816078186, + "step": 23170 + }, + { + "epoch": 3.2902767920511002, + "grad_norm": 1.4707847833633423, + "learning_rate": 9.671128459900639e-05, + "loss": 0.09533407092094422, + "step": 23180 + }, + { + "epoch": 3.291696238466998, + "grad_norm": 
1.1678895950317383, + "learning_rate": 9.670986515259049e-05, + "loss": 0.09652703404426574, + "step": 23190 + }, + { + "epoch": 3.2931156848828955, + "grad_norm": 3.4155921936035156, + "learning_rate": 9.67084457061746e-05, + "loss": 0.04331456124782562, + "step": 23200 + }, + { + "epoch": 3.2945351312987934, + "grad_norm": 3.263784408569336, + "learning_rate": 9.67070262597587e-05, + "loss": 0.12196718454360962, + "step": 23210 + }, + { + "epoch": 3.2959545777146912, + "grad_norm": 0.8338903188705444, + "learning_rate": 9.67056068133428e-05, + "loss": 0.08930212855339051, + "step": 23220 + }, + { + "epoch": 3.297374024130589, + "grad_norm": 2.3964731693267822, + "learning_rate": 9.67041873669269e-05, + "loss": 0.061741960048675534, + "step": 23230 + }, + { + "epoch": 3.298793470546487, + "grad_norm": 9.600022315979004, + "learning_rate": 9.6702767920511e-05, + "loss": 0.1284554719924927, + "step": 23240 + }, + { + "epoch": 3.300212916962385, + "grad_norm": 1.0024387836456299, + "learning_rate": 9.670134847409511e-05, + "loss": 0.09108211994171142, + "step": 23250 + }, + { + "epoch": 3.3016323633782827, + "grad_norm": 4.58043098449707, + "learning_rate": 9.669992902767921e-05, + "loss": 0.10650498867034912, + "step": 23260 + }, + { + "epoch": 3.30305180979418, + "grad_norm": 3.778592824935913, + "learning_rate": 9.669850958126331e-05, + "loss": 0.0809212327003479, + "step": 23270 + }, + { + "epoch": 3.304471256210078, + "grad_norm": 2.984292984008789, + "learning_rate": 9.669709013484741e-05, + "loss": 0.0674120306968689, + "step": 23280 + }, + { + "epoch": 3.305890702625976, + "grad_norm": 2.295304298400879, + "learning_rate": 9.669567068843152e-05, + "loss": 0.04605483114719391, + "step": 23290 + }, + { + "epoch": 3.3073101490418737, + "grad_norm": 5.067991256713867, + "learning_rate": 9.669425124201562e-05, + "loss": 0.07464765906333923, + "step": 23300 + }, + { + "epoch": 3.3087295954577716, + "grad_norm": 0.5175068974494934, + "learning_rate": 
9.669283179559973e-05, + "loss": 0.10126523971557617, + "step": 23310 + }, + { + "epoch": 3.3101490418736694, + "grad_norm": 0.7718493938446045, + "learning_rate": 9.669141234918382e-05, + "loss": 0.1306004047393799, + "step": 23320 + }, + { + "epoch": 3.3115684882895673, + "grad_norm": 0.4733130931854248, + "learning_rate": 9.668999290276792e-05, + "loss": 0.07524165511131287, + "step": 23330 + }, + { + "epoch": 3.3129879347054647, + "grad_norm": 1.91227388381958, + "learning_rate": 9.668857345635203e-05, + "loss": 0.10234876871109008, + "step": 23340 + }, + { + "epoch": 3.3144073811213626, + "grad_norm": 3.8604981899261475, + "learning_rate": 9.668715400993613e-05, + "loss": 0.08232152462005615, + "step": 23350 + }, + { + "epoch": 3.3158268275372604, + "grad_norm": 4.264747619628906, + "learning_rate": 9.668573456352024e-05, + "loss": 0.08970657587051392, + "step": 23360 + }, + { + "epoch": 3.3172462739531583, + "grad_norm": 8.413162231445312, + "learning_rate": 9.668431511710432e-05, + "loss": 0.0798837423324585, + "step": 23370 + }, + { + "epoch": 3.318665720369056, + "grad_norm": 6.562158107757568, + "learning_rate": 9.668289567068843e-05, + "loss": 0.1796337842941284, + "step": 23380 + }, + { + "epoch": 3.320085166784954, + "grad_norm": 6.798343658447266, + "learning_rate": 9.668147622427253e-05, + "loss": 0.13204431533813477, + "step": 23390 + }, + { + "epoch": 3.321504613200852, + "grad_norm": 7.170462131500244, + "learning_rate": 9.668005677785664e-05, + "loss": 0.082490473985672, + "step": 23400 + }, + { + "epoch": 3.3229240596167493, + "grad_norm": 1.1640955209732056, + "learning_rate": 9.667863733144074e-05, + "loss": 0.11552011966705322, + "step": 23410 + }, + { + "epoch": 3.324343506032647, + "grad_norm": 3.5345652103424072, + "learning_rate": 9.667721788502484e-05, + "loss": 0.07584733963012695, + "step": 23420 + }, + { + "epoch": 3.325762952448545, + "grad_norm": 1.844787836074829, + "learning_rate": 9.667579843860895e-05, + "loss": 
0.09344690442085266, + "step": 23430 + }, + { + "epoch": 3.327182398864443, + "grad_norm": 2.403691053390503, + "learning_rate": 9.667437899219305e-05, + "loss": 0.057882833480834964, + "step": 23440 + }, + { + "epoch": 3.3286018452803408, + "grad_norm": 2.586052894592285, + "learning_rate": 9.667295954577716e-05, + "loss": 0.07656934261322021, + "step": 23450 + }, + { + "epoch": 3.3300212916962386, + "grad_norm": 0.33396223187446594, + "learning_rate": 9.667154009936125e-05, + "loss": 0.08143852353096008, + "step": 23460 + }, + { + "epoch": 3.3314407381121365, + "grad_norm": 0.9797456860542297, + "learning_rate": 9.667012065294535e-05, + "loss": 0.032908812165260315, + "step": 23470 + }, + { + "epoch": 3.332860184528034, + "grad_norm": 0.3462522625923157, + "learning_rate": 9.666870120652945e-05, + "loss": 0.05224289894104004, + "step": 23480 + }, + { + "epoch": 3.3342796309439318, + "grad_norm": 5.588517189025879, + "learning_rate": 9.666728176011356e-05, + "loss": 0.08177621364593506, + "step": 23490 + }, + { + "epoch": 3.3356990773598296, + "grad_norm": 6.037621021270752, + "learning_rate": 9.666586231369766e-05, + "loss": 0.06431897282600403, + "step": 23500 + }, + { + "epoch": 3.3356990773598296, + "eval_accuracy": 0.9582247090990017, + "eval_loss": 0.1211514100432396, + "eval_runtime": 32.6171, + "eval_samples_per_second": 482.17, + "eval_steps_per_second": 15.084, + "step": 23500 + }, + { + "epoch": 3.3371185237757275, + "grad_norm": 4.2738142013549805, + "learning_rate": 9.666444286728177e-05, + "loss": 0.07732362151145936, + "step": 23510 + }, + { + "epoch": 3.3385379701916253, + "grad_norm": 5.357970237731934, + "learning_rate": 9.666302342086587e-05, + "loss": 0.057775235176086424, + "step": 23520 + }, + { + "epoch": 3.339957416607523, + "grad_norm": 2.4043660163879395, + "learning_rate": 9.666160397444996e-05, + "loss": 0.10017684698104859, + "step": 23530 + }, + { + "epoch": 3.341376863023421, + "grad_norm": 7.4561381340026855, + "learning_rate": 
9.666018452803407e-05, + "loss": 0.14003334045410157, + "step": 23540 + }, + { + "epoch": 3.3427963094393185, + "grad_norm": 2.9771358966827393, + "learning_rate": 9.665876508161817e-05, + "loss": 0.11144789457321166, + "step": 23550 + }, + { + "epoch": 3.3442157558552164, + "grad_norm": 5.861306190490723, + "learning_rate": 9.665734563520228e-05, + "loss": 0.1083723783493042, + "step": 23560 + }, + { + "epoch": 3.345635202271114, + "grad_norm": 1.0332176685333252, + "learning_rate": 9.665592618878638e-05, + "loss": 0.08513032793998718, + "step": 23570 + }, + { + "epoch": 3.347054648687012, + "grad_norm": 6.1437177658081055, + "learning_rate": 9.665450674237048e-05, + "loss": 0.08397155404090881, + "step": 23580 + }, + { + "epoch": 3.34847409510291, + "grad_norm": 4.794635772705078, + "learning_rate": 9.665308729595457e-05, + "loss": 0.042923647165298465, + "step": 23590 + }, + { + "epoch": 3.349893541518808, + "grad_norm": 3.806190252304077, + "learning_rate": 9.665166784953869e-05, + "loss": 0.08098719120025635, + "step": 23600 + }, + { + "epoch": 3.3513129879347057, + "grad_norm": 0.2237672656774521, + "learning_rate": 9.665024840312278e-05, + "loss": 0.07011445760726928, + "step": 23610 + }, + { + "epoch": 3.352732434350603, + "grad_norm": 3.0982532501220703, + "learning_rate": 9.66488289567069e-05, + "loss": 0.061842381954193115, + "step": 23620 + }, + { + "epoch": 3.354151880766501, + "grad_norm": 2.976536512374878, + "learning_rate": 9.664740951029099e-05, + "loss": 0.10006380081176758, + "step": 23630 + }, + { + "epoch": 3.355571327182399, + "grad_norm": 4.319900035858154, + "learning_rate": 9.664599006387509e-05, + "loss": 0.13653013706207276, + "step": 23640 + }, + { + "epoch": 3.3569907735982967, + "grad_norm": 1.9102489948272705, + "learning_rate": 9.66445706174592e-05, + "loss": 0.038610780239105226, + "step": 23650 + }, + { + "epoch": 3.3584102200141945, + "grad_norm": 6.633970737457275, + "learning_rate": 9.66431511710433e-05, + "loss": 
0.06831348538398743, + "step": 23660 + }, + { + "epoch": 3.3598296664300924, + "grad_norm": 1.1184673309326172, + "learning_rate": 9.664173172462741e-05, + "loss": 0.06864879727363586, + "step": 23670 + }, + { + "epoch": 3.3612491128459903, + "grad_norm": 0.8485651612281799, + "learning_rate": 9.664031227821149e-05, + "loss": 0.08388459086418151, + "step": 23680 + }, + { + "epoch": 3.3626685592618877, + "grad_norm": 1.4212796688079834, + "learning_rate": 9.66388928317956e-05, + "loss": 0.20324900150299072, + "step": 23690 + }, + { + "epoch": 3.3640880056777855, + "grad_norm": 0.2244710922241211, + "learning_rate": 9.66374733853797e-05, + "loss": 0.07268852591514588, + "step": 23700 + }, + { + "epoch": 3.3655074520936834, + "grad_norm": 0.2561863660812378, + "learning_rate": 9.663605393896381e-05, + "loss": 0.036457425355911253, + "step": 23710 + }, + { + "epoch": 3.3669268985095813, + "grad_norm": 2.078640937805176, + "learning_rate": 9.663463449254792e-05, + "loss": 0.07209231853485107, + "step": 23720 + }, + { + "epoch": 3.368346344925479, + "grad_norm": 4.892085552215576, + "learning_rate": 9.6633215046132e-05, + "loss": 0.1211774468421936, + "step": 23730 + }, + { + "epoch": 3.369765791341377, + "grad_norm": 1.651289939880371, + "learning_rate": 9.663179559971612e-05, + "loss": 0.08962616324424744, + "step": 23740 + }, + { + "epoch": 3.371185237757275, + "grad_norm": 1.4341058731079102, + "learning_rate": 9.663037615330021e-05, + "loss": 0.06757261753082275, + "step": 23750 + }, + { + "epoch": 3.3726046841731723, + "grad_norm": 0.5684829354286194, + "learning_rate": 9.662895670688432e-05, + "loss": 0.04020809531211853, + "step": 23760 + }, + { + "epoch": 3.37402413058907, + "grad_norm": 2.886730194091797, + "learning_rate": 9.662753726046842e-05, + "loss": 0.07528796195983886, + "step": 23770 + }, + { + "epoch": 3.375443577004968, + "grad_norm": 7.543295383453369, + "learning_rate": 9.662611781405252e-05, + "loss": 0.2501710891723633, + "step": 23780 + }, + { + 
"epoch": 3.376863023420866, + "grad_norm": 6.99386739730835, + "learning_rate": 9.662469836763662e-05, + "loss": 0.07813713550567628, + "step": 23790 + }, + { + "epoch": 3.3782824698367637, + "grad_norm": 6.142605781555176, + "learning_rate": 9.662327892122073e-05, + "loss": 0.0971024513244629, + "step": 23800 + }, + { + "epoch": 3.3797019162526616, + "grad_norm": 5.846232891082764, + "learning_rate": 9.662185947480484e-05, + "loss": 0.11326665878295898, + "step": 23810 + }, + { + "epoch": 3.3811213626685594, + "grad_norm": 3.8466222286224365, + "learning_rate": 9.662044002838894e-05, + "loss": 0.09037129282951355, + "step": 23820 + }, + { + "epoch": 3.382540809084457, + "grad_norm": 1.8509072065353394, + "learning_rate": 9.661902058197303e-05, + "loss": 0.08938190340995789, + "step": 23830 + }, + { + "epoch": 3.3839602555003547, + "grad_norm": 8.372735023498535, + "learning_rate": 9.661760113555713e-05, + "loss": 0.10955497026443481, + "step": 23840 + }, + { + "epoch": 3.3853797019162526, + "grad_norm": 10.327803611755371, + "learning_rate": 9.661618168914124e-05, + "loss": 0.10603039264678955, + "step": 23850 + }, + { + "epoch": 3.3867991483321505, + "grad_norm": 2.4464328289031982, + "learning_rate": 9.661476224272534e-05, + "loss": 0.07022827863693237, + "step": 23860 + }, + { + "epoch": 3.3882185947480483, + "grad_norm": 4.964604377746582, + "learning_rate": 9.661334279630945e-05, + "loss": 0.10754181146621704, + "step": 23870 + }, + { + "epoch": 3.389638041163946, + "grad_norm": 2.0936126708984375, + "learning_rate": 9.661192334989355e-05, + "loss": 0.07387771010398865, + "step": 23880 + }, + { + "epoch": 3.391057487579844, + "grad_norm": 1.5606902837753296, + "learning_rate": 9.661050390347764e-05, + "loss": 0.06499841809272766, + "step": 23890 + }, + { + "epoch": 3.3924769339957415, + "grad_norm": 0.09581028670072556, + "learning_rate": 9.660908445706176e-05, + "loss": 0.09357624053955078, + "step": 23900 + }, + { + "epoch": 3.3938963804116393, + 
"grad_norm": 2.011545181274414, + "learning_rate": 9.660766501064585e-05, + "loss": 0.04169844388961792, + "step": 23910 + }, + { + "epoch": 3.395315826827537, + "grad_norm": 0.6940661668777466, + "learning_rate": 9.660624556422996e-05, + "loss": 0.05995774269104004, + "step": 23920 + }, + { + "epoch": 3.396735273243435, + "grad_norm": 2.8684120178222656, + "learning_rate": 9.660482611781406e-05, + "loss": 0.05829171538352966, + "step": 23930 + }, + { + "epoch": 3.398154719659333, + "grad_norm": 5.727314472198486, + "learning_rate": 9.660340667139816e-05, + "loss": 0.0676846444606781, + "step": 23940 + }, + { + "epoch": 3.3995741660752308, + "grad_norm": 3.3505942821502686, + "learning_rate": 9.660198722498226e-05, + "loss": 0.12202495336532593, + "step": 23950 + }, + { + "epoch": 3.4009936124911286, + "grad_norm": 1.6798441410064697, + "learning_rate": 9.660056777856637e-05, + "loss": 0.10003808736801148, + "step": 23960 + }, + { + "epoch": 3.402413058907026, + "grad_norm": 2.8134841918945312, + "learning_rate": 9.659914833215046e-05, + "loss": 0.053173118829727174, + "step": 23970 + }, + { + "epoch": 3.403832505322924, + "grad_norm": 9.647566795349121, + "learning_rate": 9.659772888573458e-05, + "loss": 0.09169653654098511, + "step": 23980 + }, + { + "epoch": 3.405251951738822, + "grad_norm": 2.525071620941162, + "learning_rate": 9.659630943931867e-05, + "loss": 0.05470997095108032, + "step": 23990 + }, + { + "epoch": 3.4066713981547196, + "grad_norm": 7.918493270874023, + "learning_rate": 9.659488999290277e-05, + "loss": 0.12718768119812013, + "step": 24000 + }, + { + "epoch": 3.4066713981547196, + "eval_accuracy": 0.9593056526991798, + "eval_loss": 0.11784256994724274, + "eval_runtime": 34.1419, + "eval_samples_per_second": 460.636, + "eval_steps_per_second": 14.41, + "step": 24000 + }, + { + "epoch": 3.4080908445706175, + "grad_norm": 8.479427337646484, + "learning_rate": 9.659347054648688e-05, + "loss": 0.12565889358520507, + "step": 24010 + }, + { + "epoch": 
3.4095102909865154, + "grad_norm": 1.4310401678085327, + "learning_rate": 9.659205110007098e-05, + "loss": 0.07409765720367431, + "step": 24020 + }, + { + "epoch": 3.4109297374024132, + "grad_norm": 1.3293160200119019, + "learning_rate": 9.659063165365509e-05, + "loss": 0.06405404210090637, + "step": 24030 + }, + { + "epoch": 3.4123491838183106, + "grad_norm": 2.3439300060272217, + "learning_rate": 9.658921220723917e-05, + "loss": 0.07644574642181397, + "step": 24040 + }, + { + "epoch": 3.4137686302342085, + "grad_norm": 5.991164207458496, + "learning_rate": 9.658779276082328e-05, + "loss": 0.1311761498451233, + "step": 24050 + }, + { + "epoch": 3.4151880766501064, + "grad_norm": 4.515506267547607, + "learning_rate": 9.658637331440738e-05, + "loss": 0.080819970369339, + "step": 24060 + }, + { + "epoch": 3.4166075230660042, + "grad_norm": 3.080458402633667, + "learning_rate": 9.658495386799149e-05, + "loss": 0.07668147087097169, + "step": 24070 + }, + { + "epoch": 3.418026969481902, + "grad_norm": 6.942470550537109, + "learning_rate": 9.658353442157559e-05, + "loss": 0.07289301753044128, + "step": 24080 + }, + { + "epoch": 3.4194464158978, + "grad_norm": 9.14225959777832, + "learning_rate": 9.658211497515969e-05, + "loss": 0.07435898780822754, + "step": 24090 + }, + { + "epoch": 3.420865862313698, + "grad_norm": 7.3029704093933105, + "learning_rate": 9.65806955287438e-05, + "loss": 0.12275665998458862, + "step": 24100 + }, + { + "epoch": 3.4222853087295952, + "grad_norm": 1.066394567489624, + "learning_rate": 9.65792760823279e-05, + "loss": 0.12547402381896972, + "step": 24110 + }, + { + "epoch": 3.423704755145493, + "grad_norm": 2.095668315887451, + "learning_rate": 9.6577856635912e-05, + "loss": 0.08885858654975891, + "step": 24120 + }, + { + "epoch": 3.425124201561391, + "grad_norm": 10.10063648223877, + "learning_rate": 9.65764371894961e-05, + "loss": 0.08219894766807556, + "step": 24130 + }, + { + "epoch": 3.426543647977289, + "grad_norm": 0.24362793564796448, 
+ "learning_rate": 9.65750177430802e-05, + "loss": 0.07828856706619262, + "step": 24140 + }, + { + "epoch": 3.4279630943931867, + "grad_norm": 3.3321142196655273, + "learning_rate": 9.65735982966643e-05, + "loss": 0.052914398908615115, + "step": 24150 + }, + { + "epoch": 3.4293825408090846, + "grad_norm": 6.5169291496276855, + "learning_rate": 9.657217885024841e-05, + "loss": 0.0918683409690857, + "step": 24160 + }, + { + "epoch": 3.4308019872249824, + "grad_norm": 1.8033021688461304, + "learning_rate": 9.657075940383251e-05, + "loss": 0.07939133048057556, + "step": 24170 + }, + { + "epoch": 3.43222143364088, + "grad_norm": 0.5477492213249207, + "learning_rate": 9.656933995741662e-05, + "loss": 0.08349984288215637, + "step": 24180 + }, + { + "epoch": 3.4336408800567777, + "grad_norm": 5.996103763580322, + "learning_rate": 9.656792051100072e-05, + "loss": 0.05642620325088501, + "step": 24190 + }, + { + "epoch": 3.4350603264726756, + "grad_norm": 10.91261100769043, + "learning_rate": 9.656650106458481e-05, + "loss": 0.10933701992034912, + "step": 24200 + }, + { + "epoch": 3.4364797728885734, + "grad_norm": 2.225350856781006, + "learning_rate": 9.656508161816892e-05, + "loss": 0.09172443151474, + "step": 24210 + }, + { + "epoch": 3.4378992193044713, + "grad_norm": 12.634965896606445, + "learning_rate": 9.656366217175302e-05, + "loss": 0.11917402744293212, + "step": 24220 + }, + { + "epoch": 3.439318665720369, + "grad_norm": 1.6125768423080444, + "learning_rate": 9.656224272533713e-05, + "loss": 0.06305748820304871, + "step": 24230 + }, + { + "epoch": 3.440738112136267, + "grad_norm": 0.32264623045921326, + "learning_rate": 9.656082327892123e-05, + "loss": 0.053128784894943236, + "step": 24240 + }, + { + "epoch": 3.4421575585521644, + "grad_norm": 1.5485633611679077, + "learning_rate": 9.655940383250533e-05, + "loss": 0.09052397012710571, + "step": 24250 + }, + { + "epoch": 3.4435770049680623, + "grad_norm": 8.407336235046387, + "learning_rate": 9.655798438608942e-05, 
+ "loss": 0.0869211733341217, + "step": 24260 + }, + { + "epoch": 3.44499645138396, + "grad_norm": 4.730905532836914, + "learning_rate": 9.655656493967353e-05, + "loss": 0.07399642467498779, + "step": 24270 + }, + { + "epoch": 3.446415897799858, + "grad_norm": 3.4000537395477295, + "learning_rate": 9.655514549325763e-05, + "loss": 0.047950705885887145, + "step": 24280 + }, + { + "epoch": 3.447835344215756, + "grad_norm": 1.1020469665527344, + "learning_rate": 9.655372604684174e-05, + "loss": 0.06868406534194946, + "step": 24290 + }, + { + "epoch": 3.4492547906316537, + "grad_norm": 7.190598964691162, + "learning_rate": 9.655230660042584e-05, + "loss": 0.0772173523902893, + "step": 24300 + }, + { + "epoch": 3.4506742370475516, + "grad_norm": 0.16195560991764069, + "learning_rate": 9.655088715400994e-05, + "loss": 0.05037579536437988, + "step": 24310 + }, + { + "epoch": 3.452093683463449, + "grad_norm": 5.206357955932617, + "learning_rate": 9.654946770759405e-05, + "loss": 0.05795242190361023, + "step": 24320 + }, + { + "epoch": 3.453513129879347, + "grad_norm": 3.8032917976379395, + "learning_rate": 9.654804826117815e-05, + "loss": 0.06166144609451294, + "step": 24330 + }, + { + "epoch": 3.4549325762952448, + "grad_norm": 7.195924282073975, + "learning_rate": 9.654662881476226e-05, + "loss": 0.07787706851959228, + "step": 24340 + }, + { + "epoch": 3.4563520227111426, + "grad_norm": 16.916200637817383, + "learning_rate": 9.654520936834634e-05, + "loss": 0.12234679460525513, + "step": 24350 + }, + { + "epoch": 3.4577714691270405, + "grad_norm": 8.12978458404541, + "learning_rate": 9.654378992193045e-05, + "loss": 0.08913070559501649, + "step": 24360 + }, + { + "epoch": 3.4591909155429383, + "grad_norm": 5.649082660675049, + "learning_rate": 9.654237047551455e-05, + "loss": 0.06851221919059754, + "step": 24370 + }, + { + "epoch": 3.460610361958836, + "grad_norm": 9.085246086120605, + "learning_rate": 9.654095102909866e-05, + "loss": 0.07936888933181763, + "step": 24380 
+ }, + { + "epoch": 3.4620298083747336, + "grad_norm": 6.739210605621338, + "learning_rate": 9.653953158268276e-05, + "loss": 0.0680124282836914, + "step": 24390 + }, + { + "epoch": 3.4634492547906315, + "grad_norm": 4.914496421813965, + "learning_rate": 9.653811213626686e-05, + "loss": 0.07251676321029663, + "step": 24400 + }, + { + "epoch": 3.4648687012065293, + "grad_norm": 3.8612000942230225, + "learning_rate": 9.653669268985097e-05, + "loss": 0.08312456011772155, + "step": 24410 + }, + { + "epoch": 3.466288147622427, + "grad_norm": 5.080418109893799, + "learning_rate": 9.653527324343506e-05, + "loss": 0.08824545741081238, + "step": 24420 + }, + { + "epoch": 3.467707594038325, + "grad_norm": 2.7461204528808594, + "learning_rate": 9.653385379701917e-05, + "loss": 0.04693276584148407, + "step": 24430 + }, + { + "epoch": 3.469127040454223, + "grad_norm": 2.284554958343506, + "learning_rate": 9.653243435060327e-05, + "loss": 0.10196805000305176, + "step": 24440 + }, + { + "epoch": 3.470546486870121, + "grad_norm": 6.074938774108887, + "learning_rate": 9.653101490418737e-05, + "loss": 0.12747013568878174, + "step": 24450 + }, + { + "epoch": 3.471965933286018, + "grad_norm": 4.511362075805664, + "learning_rate": 9.652959545777147e-05, + "loss": 0.115402352809906, + "step": 24460 + }, + { + "epoch": 3.473385379701916, + "grad_norm": 2.1728434562683105, + "learning_rate": 9.652817601135558e-05, + "loss": 0.050969237089157106, + "step": 24470 + }, + { + "epoch": 3.474804826117814, + "grad_norm": 5.665693283081055, + "learning_rate": 9.652675656493967e-05, + "loss": 0.06567577123641968, + "step": 24480 + }, + { + "epoch": 3.476224272533712, + "grad_norm": 1.5518124103546143, + "learning_rate": 9.652533711852379e-05, + "loss": 0.05959618091583252, + "step": 24490 + }, + { + "epoch": 3.4776437189496097, + "grad_norm": 8.269552230834961, + "learning_rate": 9.652391767210788e-05, + "loss": 0.13263360261917115, + "step": 24500 + }, + { + "epoch": 3.4776437189496097, + 
"eval_accuracy": 0.9656641444649329, + "eval_loss": 0.10235972702503204, + "eval_runtime": 33.7613, + "eval_samples_per_second": 465.829, + "eval_steps_per_second": 14.573, + "step": 24500 + }, + { + "epoch": 3.4790631653655075, + "grad_norm": 4.476282119750977, + "learning_rate": 9.652249822569198e-05, + "loss": 0.06739105582237244, + "step": 24510 + }, + { + "epoch": 3.4804826117814054, + "grad_norm": 8.51496410369873, + "learning_rate": 9.652107877927609e-05, + "loss": 0.08702877759933472, + "step": 24520 + }, + { + "epoch": 3.481902058197303, + "grad_norm": 2.6295464038848877, + "learning_rate": 9.651965933286019e-05, + "loss": 0.10381957292556762, + "step": 24530 + }, + { + "epoch": 3.4833215046132007, + "grad_norm": 2.952054023742676, + "learning_rate": 9.65182398864443e-05, + "loss": 0.10116275548934936, + "step": 24540 + }, + { + "epoch": 3.4847409510290985, + "grad_norm": 7.40458869934082, + "learning_rate": 9.651682044002838e-05, + "loss": 0.1063815712928772, + "step": 24550 + }, + { + "epoch": 3.4861603974449964, + "grad_norm": 3.4271445274353027, + "learning_rate": 9.65154009936125e-05, + "loss": 0.12271822690963745, + "step": 24560 + }, + { + "epoch": 3.4875798438608943, + "grad_norm": 4.230976581573486, + "learning_rate": 9.651398154719659e-05, + "loss": 0.06680415868759156, + "step": 24570 + }, + { + "epoch": 3.488999290276792, + "grad_norm": 10.036641120910645, + "learning_rate": 9.65125621007807e-05, + "loss": 0.05150268673896789, + "step": 24580 + }, + { + "epoch": 3.49041873669269, + "grad_norm": 7.777481555938721, + "learning_rate": 9.65111426543648e-05, + "loss": 0.08956578969955445, + "step": 24590 + }, + { + "epoch": 3.4918381831085874, + "grad_norm": 5.8065266609191895, + "learning_rate": 9.650972320794891e-05, + "loss": 0.10252159833908081, + "step": 24600 + }, + { + "epoch": 3.4932576295244853, + "grad_norm": 0.654843270778656, + "learning_rate": 9.650830376153301e-05, + "loss": 0.0543290913105011, + "step": 24610 + }, + { + "epoch": 
3.494677075940383, + "grad_norm": 0.9799067974090576, + "learning_rate": 9.65068843151171e-05, + "loss": 0.0650719940662384, + "step": 24620 + }, + { + "epoch": 3.496096522356281, + "grad_norm": 1.798651933670044, + "learning_rate": 9.650546486870122e-05, + "loss": 0.06141197085380554, + "step": 24630 + }, + { + "epoch": 3.497515968772179, + "grad_norm": 3.3337302207946777, + "learning_rate": 9.650404542228531e-05, + "loss": 0.07960495352745056, + "step": 24640 + }, + { + "epoch": 3.4989354151880767, + "grad_norm": 0.37414559721946716, + "learning_rate": 9.650262597586942e-05, + "loss": 0.10498731136322022, + "step": 24650 + }, + { + "epoch": 3.5003548616039746, + "grad_norm": 11.681800842285156, + "learning_rate": 9.650120652945351e-05, + "loss": 0.09571239948272706, + "step": 24660 + }, + { + "epoch": 3.501774308019872, + "grad_norm": 1.942776083946228, + "learning_rate": 9.649978708303762e-05, + "loss": 0.11830227375030518, + "step": 24670 + }, + { + "epoch": 3.5031937544357703, + "grad_norm": 3.5960655212402344, + "learning_rate": 9.649836763662172e-05, + "loss": 0.056894832849502565, + "step": 24680 + }, + { + "epoch": 3.5046132008516677, + "grad_norm": 5.279286861419678, + "learning_rate": 9.649694819020583e-05, + "loss": 0.11261140108108521, + "step": 24690 + }, + { + "epoch": 3.5060326472675656, + "grad_norm": 3.89916729927063, + "learning_rate": 9.649552874378993e-05, + "loss": 0.09311820268630981, + "step": 24700 + }, + { + "epoch": 3.5074520936834634, + "grad_norm": 0.738353431224823, + "learning_rate": 9.649410929737402e-05, + "loss": 0.08309696912765503, + "step": 24710 + }, + { + "epoch": 3.5088715400993613, + "grad_norm": 6.307223320007324, + "learning_rate": 9.649268985095813e-05, + "loss": 0.07369316220283509, + "step": 24720 + }, + { + "epoch": 3.510290986515259, + "grad_norm": 8.444607734680176, + "learning_rate": 9.649127040454223e-05, + "loss": 0.07793084383010865, + "step": 24730 + }, + { + "epoch": 3.5117104329311566, + "grad_norm": 
1.6136986017227173, + "learning_rate": 9.648985095812634e-05, + "loss": 0.11156415939331055, + "step": 24740 + }, + { + "epoch": 3.513129879347055, + "grad_norm": 6.505612373352051, + "learning_rate": 9.648843151171044e-05, + "loss": 0.06763787865638733, + "step": 24750 + }, + { + "epoch": 3.5145493257629523, + "grad_norm": 8.121411323547363, + "learning_rate": 9.648701206529454e-05, + "loss": 0.09036332368850708, + "step": 24760 + }, + { + "epoch": 3.51596877217885, + "grad_norm": 4.047122001647949, + "learning_rate": 9.648559261887863e-05, + "loss": 0.10994062423706055, + "step": 24770 + }, + { + "epoch": 3.517388218594748, + "grad_norm": 0.8031113743782043, + "learning_rate": 9.648417317246275e-05, + "loss": 0.1324693441390991, + "step": 24780 + }, + { + "epoch": 3.518807665010646, + "grad_norm": 0.1589478850364685, + "learning_rate": 9.648275372604684e-05, + "loss": 0.07809120416641235, + "step": 24790 + }, + { + "epoch": 3.5202271114265438, + "grad_norm": 5.511590480804443, + "learning_rate": 9.648133427963095e-05, + "loss": 0.09465236663818359, + "step": 24800 + }, + { + "epoch": 3.521646557842441, + "grad_norm": 2.396857738494873, + "learning_rate": 9.647991483321505e-05, + "loss": 0.09788199663162231, + "step": 24810 + }, + { + "epoch": 3.5230660042583395, + "grad_norm": 3.002704381942749, + "learning_rate": 9.647849538679915e-05, + "loss": 0.05331340432167053, + "step": 24820 + }, + { + "epoch": 3.524485450674237, + "grad_norm": 0.42355236411094666, + "learning_rate": 9.647707594038326e-05, + "loss": 0.0318134218454361, + "step": 24830 + }, + { + "epoch": 3.5259048970901348, + "grad_norm": 2.182748794555664, + "learning_rate": 9.647565649396736e-05, + "loss": 0.07532593607902527, + "step": 24840 + }, + { + "epoch": 3.5273243435060326, + "grad_norm": 8.828009605407715, + "learning_rate": 9.647423704755147e-05, + "loss": 0.060152608156204226, + "step": 24850 + }, + { + "epoch": 3.5287437899219305, + "grad_norm": 4.714108943939209, + "learning_rate": 
9.647281760113555e-05, + "loss": 0.029934373497962952, + "step": 24860 + }, + { + "epoch": 3.5301632363378284, + "grad_norm": 0.8313024640083313, + "learning_rate": 9.647139815471966e-05, + "loss": 0.12967721223831177, + "step": 24870 + }, + { + "epoch": 3.5315826827537258, + "grad_norm": 2.8028974533081055, + "learning_rate": 9.646997870830376e-05, + "loss": 0.08742020130157471, + "step": 24880 + }, + { + "epoch": 3.533002129169624, + "grad_norm": 5.4242143630981445, + "learning_rate": 9.646855926188787e-05, + "loss": 0.11844632625579835, + "step": 24890 + }, + { + "epoch": 3.5344215755855215, + "grad_norm": 6.741092681884766, + "learning_rate": 9.646713981547197e-05, + "loss": 0.08173008561134339, + "step": 24900 + }, + { + "epoch": 3.5358410220014194, + "grad_norm": 7.06593132019043, + "learning_rate": 9.646572036905607e-05, + "loss": 0.10493273735046386, + "step": 24910 + }, + { + "epoch": 3.5372604684173172, + "grad_norm": 0.9364591836929321, + "learning_rate": 9.646430092264018e-05, + "loss": 0.09316438436508179, + "step": 24920 + }, + { + "epoch": 3.538679914833215, + "grad_norm": 6.34156608581543, + "learning_rate": 9.646288147622427e-05, + "loss": 0.04985399842262268, + "step": 24930 + }, + { + "epoch": 3.540099361249113, + "grad_norm": 13.556730270385742, + "learning_rate": 9.646146202980838e-05, + "loss": 0.0970345377922058, + "step": 24940 + }, + { + "epoch": 3.5415188076650104, + "grad_norm": 7.102383613586426, + "learning_rate": 9.646004258339248e-05, + "loss": 0.10454981327056885, + "step": 24950 + }, + { + "epoch": 3.5429382540809087, + "grad_norm": 5.229292869567871, + "learning_rate": 9.645862313697659e-05, + "loss": 0.08302426338195801, + "step": 24960 + }, + { + "epoch": 3.544357700496806, + "grad_norm": 5.76925802230835, + "learning_rate": 9.645720369056068e-05, + "loss": 0.10274431705474854, + "step": 24970 + }, + { + "epoch": 3.545777146912704, + "grad_norm": 4.70728063583374, + "learning_rate": 9.645578424414479e-05, + "loss": 
0.0719529628753662, + "step": 24980 + }, + { + "epoch": 3.547196593328602, + "grad_norm": 4.43380069732666, + "learning_rate": 9.645436479772888e-05, + "loss": 0.060102427005767824, + "step": 24990 + }, + { + "epoch": 3.5486160397444997, + "grad_norm": 4.603033542633057, + "learning_rate": 9.6452945351313e-05, + "loss": 0.10912116765975952, + "step": 25000 + }, + { + "epoch": 3.5486160397444997, + "eval_accuracy": 0.9611496153112482, + "eval_loss": 0.11311028897762299, + "eval_runtime": 33.6933, + "eval_samples_per_second": 466.769, + "eval_steps_per_second": 14.602, + "step": 25000 + }, + { + "epoch": 3.5500354861603975, + "grad_norm": 1.6893727779388428, + "learning_rate": 9.645152590489709e-05, + "loss": 0.08042104840278626, + "step": 25010 + }, + { + "epoch": 3.5514549325762954, + "grad_norm": 0.5648311376571655, + "learning_rate": 9.645010645848119e-05, + "loss": 0.059700363874435426, + "step": 25020 + }, + { + "epoch": 3.5528743789921933, + "grad_norm": 9.752403259277344, + "learning_rate": 9.64486870120653e-05, + "loss": 0.0868448257446289, + "step": 25030 + }, + { + "epoch": 3.5542938254080907, + "grad_norm": 2.9299299716949463, + "learning_rate": 9.64472675656494e-05, + "loss": 0.04335830807685852, + "step": 25040 + }, + { + "epoch": 3.5557132718239886, + "grad_norm": 10.618478775024414, + "learning_rate": 9.644584811923351e-05, + "loss": 0.1310647249221802, + "step": 25050 + }, + { + "epoch": 3.5571327182398864, + "grad_norm": 9.584770202636719, + "learning_rate": 9.644442867281761e-05, + "loss": 0.11195597648620606, + "step": 25060 + }, + { + "epoch": 3.5585521646557843, + "grad_norm": 3.4068570137023926, + "learning_rate": 9.64430092264017e-05, + "loss": 0.11045770645141602, + "step": 25070 + }, + { + "epoch": 3.559971611071682, + "grad_norm": 2.1086835861206055, + "learning_rate": 9.64415897799858e-05, + "loss": 0.04244246780872345, + "step": 25080 + }, + { + "epoch": 3.56139105748758, + "grad_norm": 0.7292802929878235, + "learning_rate": 
9.644017033356991e-05, + "loss": 0.05122672319412232, + "step": 25090 + }, + { + "epoch": 3.562810503903478, + "grad_norm": 0.779449999332428, + "learning_rate": 9.643875088715401e-05, + "loss": 0.05376675724983215, + "step": 25100 + }, + { + "epoch": 3.5642299503193753, + "grad_norm": 6.790277004241943, + "learning_rate": 9.643733144073812e-05, + "loss": 0.08969722390174865, + "step": 25110 + }, + { + "epoch": 3.565649396735273, + "grad_norm": 0.9874815344810486, + "learning_rate": 9.643591199432222e-05, + "loss": 0.07819917201995849, + "step": 25120 + }, + { + "epoch": 3.567068843151171, + "grad_norm": 3.9310240745544434, + "learning_rate": 9.643449254790632e-05, + "loss": 0.07071614861488343, + "step": 25130 + }, + { + "epoch": 3.568488289567069, + "grad_norm": 13.718709945678711, + "learning_rate": 9.643307310149043e-05, + "loss": 0.09383904933929443, + "step": 25140 + }, + { + "epoch": 3.5699077359829667, + "grad_norm": 6.163698673248291, + "learning_rate": 9.643165365507452e-05, + "loss": 0.10959553718566895, + "step": 25150 + }, + { + "epoch": 3.5713271823988646, + "grad_norm": 6.060120105743408, + "learning_rate": 9.643023420865864e-05, + "loss": 0.060175150632858276, + "step": 25160 + }, + { + "epoch": 3.5727466288147625, + "grad_norm": 1.9445127248764038, + "learning_rate": 9.642881476224272e-05, + "loss": 0.10089634656906128, + "step": 25170 + }, + { + "epoch": 3.57416607523066, + "grad_norm": 8.044722557067871, + "learning_rate": 9.642739531582683e-05, + "loss": 0.12044985294342041, + "step": 25180 + }, + { + "epoch": 3.5755855216465577, + "grad_norm": 6.326447010040283, + "learning_rate": 9.642597586941093e-05, + "loss": 0.09188529253005981, + "step": 25190 + }, + { + "epoch": 3.5770049680624556, + "grad_norm": 1.62061607837677, + "learning_rate": 9.642455642299504e-05, + "loss": 0.06626140475273132, + "step": 25200 + }, + { + "epoch": 3.5784244144783535, + "grad_norm": 6.9937591552734375, + "learning_rate": 9.642313697657915e-05, + "loss": 
0.12362555265426636, + "step": 25210 + }, + { + "epoch": 3.5798438608942513, + "grad_norm": 3.7871155738830566, + "learning_rate": 9.642171753016323e-05, + "loss": 0.08823931813240052, + "step": 25220 + }, + { + "epoch": 3.581263307310149, + "grad_norm": 1.0720821619033813, + "learning_rate": 9.642029808374734e-05, + "loss": 0.049576738476753236, + "step": 25230 + }, + { + "epoch": 3.582682753726047, + "grad_norm": 0.7439848780632019, + "learning_rate": 9.641887863733144e-05, + "loss": 0.07031044960021973, + "step": 25240 + }, + { + "epoch": 3.5841022001419445, + "grad_norm": 8.99116325378418, + "learning_rate": 9.641745919091555e-05, + "loss": 0.09977667331695557, + "step": 25250 + }, + { + "epoch": 3.5855216465578423, + "grad_norm": 0.37592869997024536, + "learning_rate": 9.641603974449965e-05, + "loss": 0.0772906482219696, + "step": 25260 + }, + { + "epoch": 3.58694109297374, + "grad_norm": 6.236084938049316, + "learning_rate": 9.641462029808375e-05, + "loss": 0.05990852117538452, + "step": 25270 + }, + { + "epoch": 3.588360539389638, + "grad_norm": 3.208134412765503, + "learning_rate": 9.641320085166784e-05, + "loss": 0.11211087703704833, + "step": 25280 + }, + { + "epoch": 3.589779985805536, + "grad_norm": 1.552689552307129, + "learning_rate": 9.641178140525196e-05, + "loss": 0.08635483980178833, + "step": 25290 + }, + { + "epoch": 3.591199432221434, + "grad_norm": 6.984618663787842, + "learning_rate": 9.641036195883607e-05, + "loss": 0.06257756948471069, + "step": 25300 + }, + { + "epoch": 3.5926188786373316, + "grad_norm": 0.3589995503425598, + "learning_rate": 9.640894251242016e-05, + "loss": 0.07547361850738525, + "step": 25310 + }, + { + "epoch": 3.594038325053229, + "grad_norm": 7.3257737159729, + "learning_rate": 9.640752306600427e-05, + "loss": 0.04197915494441986, + "step": 25320 + }, + { + "epoch": 3.595457771469127, + "grad_norm": 5.887513160705566, + "learning_rate": 9.640610361958836e-05, + "loss": 0.06401208639144898, + "step": 25330 + }, + { + 
"epoch": 3.596877217885025, + "grad_norm": 7.882718086242676, + "learning_rate": 9.640468417317247e-05, + "loss": 0.06862297058105468, + "step": 25340 + }, + { + "epoch": 3.5982966643009227, + "grad_norm": 1.1109976768493652, + "learning_rate": 9.640340667139815e-05, + "loss": 0.07234618067741394, + "step": 25350 + }, + { + "epoch": 3.5997161107168205, + "grad_norm": 11.460066795349121, + "learning_rate": 9.640198722498226e-05, + "loss": 0.07255152463912964, + "step": 25360 + }, + { + "epoch": 3.6011355571327184, + "grad_norm": 9.745214462280273, + "learning_rate": 9.640056777856636e-05, + "loss": 0.10036202669143676, + "step": 25370 + }, + { + "epoch": 3.6025550035486162, + "grad_norm": 2.1519269943237305, + "learning_rate": 9.639914833215047e-05, + "loss": 0.06883406043052673, + "step": 25380 + }, + { + "epoch": 3.6039744499645137, + "grad_norm": 2.5749173164367676, + "learning_rate": 9.639772888573457e-05, + "loss": 0.03991932868957519, + "step": 25390 + }, + { + "epoch": 3.6053938963804115, + "grad_norm": 5.585699558258057, + "learning_rate": 9.639630943931867e-05, + "loss": 0.07052261233329774, + "step": 25400 + }, + { + "epoch": 3.6068133427963094, + "grad_norm": 10.144248008728027, + "learning_rate": 9.639488999290277e-05, + "loss": 0.08440894484519959, + "step": 25410 + }, + { + "epoch": 3.6082327892122072, + "grad_norm": 7.000726222991943, + "learning_rate": 9.639347054648688e-05, + "loss": 0.11805384159088135, + "step": 25420 + }, + { + "epoch": 3.609652235628105, + "grad_norm": 4.314553737640381, + "learning_rate": 9.639205110007097e-05, + "loss": 0.04131576418876648, + "step": 25430 + }, + { + "epoch": 3.611071682044003, + "grad_norm": 6.750652313232422, + "learning_rate": 9.639063165365508e-05, + "loss": 0.16327909231185914, + "step": 25440 + }, + { + "epoch": 3.612491128459901, + "grad_norm": 4.056532859802246, + "learning_rate": 9.638921220723918e-05, + "loss": 0.08948258757591247, + "step": 25450 + }, + { + "epoch": 3.6139105748757983, + 
"grad_norm": 1.1540457010269165, + "learning_rate": 9.638779276082328e-05, + "loss": 0.059862494468688965, + "step": 25460 + }, + { + "epoch": 3.615330021291696, + "grad_norm": 0.861678421497345, + "learning_rate": 9.638637331440739e-05, + "loss": 0.056112641096115114, + "step": 25470 + }, + { + "epoch": 3.616749467707594, + "grad_norm": 5.665146827697754, + "learning_rate": 9.638495386799149e-05, + "loss": 0.07238735556602478, + "step": 25480 + }, + { + "epoch": 3.618168914123492, + "grad_norm": 1.3516796827316284, + "learning_rate": 9.63835344215756e-05, + "loss": 0.07645809054374694, + "step": 25490 + }, + { + "epoch": 3.6195883605393897, + "grad_norm": 4.757277011871338, + "learning_rate": 9.638211497515968e-05, + "loss": 0.0696679949760437, + "step": 25500 + }, + { + "epoch": 3.6195883605393897, + "eval_accuracy": 0.9649011254530425, + "eval_loss": 0.10610143095254898, + "eval_runtime": 33.7176, + "eval_samples_per_second": 466.433, + "eval_steps_per_second": 14.592, + "step": 25500 + }, + { + "epoch": 3.6210078069552876, + "grad_norm": 4.6941819190979, + "learning_rate": 9.63806955287438e-05, + "loss": 0.05806577205657959, + "step": 25510 + }, + { + "epoch": 3.6224272533711854, + "grad_norm": 7.179256439208984, + "learning_rate": 9.637927608232789e-05, + "loss": 0.10271693468093872, + "step": 25520 + }, + { + "epoch": 3.623846699787083, + "grad_norm": 2.660531997680664, + "learning_rate": 9.6377856635912e-05, + "loss": 0.11873768568038941, + "step": 25530 + }, + { + "epoch": 3.6252661462029807, + "grad_norm": 2.4319779872894287, + "learning_rate": 9.63764371894961e-05, + "loss": 0.05240858197212219, + "step": 25540 + }, + { + "epoch": 3.6266855926188786, + "grad_norm": 3.6984822750091553, + "learning_rate": 9.63750177430802e-05, + "loss": 0.05370069146156311, + "step": 25550 + }, + { + "epoch": 3.6281050390347764, + "grad_norm": 4.340889930725098, + "learning_rate": 9.637359829666431e-05, + "loss": 0.08285000324249267, + "step": 25560 + }, + { + "epoch": 
3.6295244854506743, + "grad_norm": 9.616756439208984, + "learning_rate": 9.63721788502484e-05, + "loss": 0.07987736463546753, + "step": 25570 + }, + { + "epoch": 3.630943931866572, + "grad_norm": 1.1144945621490479, + "learning_rate": 9.637075940383252e-05, + "loss": 0.08109164237976074, + "step": 25580 + }, + { + "epoch": 3.63236337828247, + "grad_norm": 5.484223365783691, + "learning_rate": 9.636933995741661e-05, + "loss": 0.1028984785079956, + "step": 25590 + }, + { + "epoch": 3.6337828246983674, + "grad_norm": 1.9237008094787598, + "learning_rate": 9.636792051100071e-05, + "loss": 0.11396080255508423, + "step": 25600 + }, + { + "epoch": 3.6352022711142653, + "grad_norm": 4.099696159362793, + "learning_rate": 9.636650106458481e-05, + "loss": 0.10055809020996094, + "step": 25610 + }, + { + "epoch": 3.636621717530163, + "grad_norm": 1.0165332555770874, + "learning_rate": 9.636508161816892e-05, + "loss": 0.03974857628345489, + "step": 25620 + }, + { + "epoch": 3.638041163946061, + "grad_norm": 1.9846147298812866, + "learning_rate": 9.636366217175302e-05, + "loss": 0.11356563568115234, + "step": 25630 + }, + { + "epoch": 3.639460610361959, + "grad_norm": 0.7101534008979797, + "learning_rate": 9.636224272533713e-05, + "loss": 0.043464869260787964, + "step": 25640 + }, + { + "epoch": 3.6408800567778568, + "grad_norm": 10.363176345825195, + "learning_rate": 9.636082327892122e-05, + "loss": 0.10717108249664306, + "step": 25650 + }, + { + "epoch": 3.6422995031937546, + "grad_norm": 9.283759117126465, + "learning_rate": 9.635940383250532e-05, + "loss": 0.11051251888275146, + "step": 25660 + }, + { + "epoch": 3.643718949609652, + "grad_norm": 0.6984942555427551, + "learning_rate": 9.635798438608943e-05, + "loss": 0.06172139048576355, + "step": 25670 + }, + { + "epoch": 3.64513839602555, + "grad_norm": 8.953624725341797, + "learning_rate": 9.635656493967353e-05, + "loss": 0.08708047866821289, + "step": 25680 + }, + { + "epoch": 3.6465578424414478, + "grad_norm": 
3.1702566146850586, + "learning_rate": 9.635514549325764e-05, + "loss": 0.10060502290725708, + "step": 25690 + }, + { + "epoch": 3.6479772888573456, + "grad_norm": 1.4352515935897827, + "learning_rate": 9.635372604684174e-05, + "loss": 0.05796252489089966, + "step": 25700 + }, + { + "epoch": 3.6493967352732435, + "grad_norm": 2.140181303024292, + "learning_rate": 9.635230660042584e-05, + "loss": 0.09123912453651428, + "step": 25710 + }, + { + "epoch": 3.6508161816891413, + "grad_norm": 0.572270929813385, + "learning_rate": 9.635088715400993e-05, + "loss": 0.060646504163742065, + "step": 25720 + }, + { + "epoch": 3.652235628105039, + "grad_norm": 0.5034018158912659, + "learning_rate": 9.634946770759404e-05, + "loss": 0.07777016758918762, + "step": 25730 + }, + { + "epoch": 3.6536550745209366, + "grad_norm": 1.8546146154403687, + "learning_rate": 9.634804826117814e-05, + "loss": 0.1366788387298584, + "step": 25740 + }, + { + "epoch": 3.6550745209368345, + "grad_norm": 1.924846887588501, + "learning_rate": 9.634662881476225e-05, + "loss": 0.0842927873134613, + "step": 25750 + }, + { + "epoch": 3.6564939673527324, + "grad_norm": 4.480966567993164, + "learning_rate": 9.634520936834635e-05, + "loss": 0.051267868280410765, + "step": 25760 + }, + { + "epoch": 3.65791341376863, + "grad_norm": 6.783929347991943, + "learning_rate": 9.634378992193045e-05, + "loss": 0.07230629920959472, + "step": 25770 + }, + { + "epoch": 3.659332860184528, + "grad_norm": 0.09694766253232956, + "learning_rate": 9.634237047551456e-05, + "loss": 0.032146582007408143, + "step": 25780 + }, + { + "epoch": 3.660752306600426, + "grad_norm": 1.94701087474823, + "learning_rate": 9.634095102909866e-05, + "loss": 0.08497151136398315, + "step": 25790 + }, + { + "epoch": 3.662171753016324, + "grad_norm": 4.432292461395264, + "learning_rate": 9.633953158268277e-05, + "loss": 0.06812145113945008, + "step": 25800 + }, + { + "epoch": 3.6635911994322212, + "grad_norm": 1.2834193706512451, + "learning_rate": 
9.633811213626685e-05, + "loss": 0.045406836271286014, + "step": 25810 + }, + { + "epoch": 3.665010645848119, + "grad_norm": 10.543720245361328, + "learning_rate": 9.633669268985096e-05, + "loss": 0.11493253707885742, + "step": 25820 + }, + { + "epoch": 3.666430092264017, + "grad_norm": 8.311552047729492, + "learning_rate": 9.633527324343506e-05, + "loss": 0.12239972352981568, + "step": 25830 + }, + { + "epoch": 3.667849538679915, + "grad_norm": 3.1987102031707764, + "learning_rate": 9.633385379701917e-05, + "loss": 0.0902472972869873, + "step": 25840 + }, + { + "epoch": 3.6692689850958127, + "grad_norm": 3.2787559032440186, + "learning_rate": 9.633243435060327e-05, + "loss": 0.09204012155532837, + "step": 25850 + }, + { + "epoch": 3.6706884315117105, + "grad_norm": 0.667934775352478, + "learning_rate": 9.633101490418736e-05, + "loss": 0.12997353076934814, + "step": 25860 + }, + { + "epoch": 3.6721078779276084, + "grad_norm": 0.9998012185096741, + "learning_rate": 9.632959545777148e-05, + "loss": 0.1022377371788025, + "step": 25870 + }, + { + "epoch": 3.673527324343506, + "grad_norm": 6.183043479919434, + "learning_rate": 9.632817601135557e-05, + "loss": 0.11707621812820435, + "step": 25880 + }, + { + "epoch": 3.6749467707594037, + "grad_norm": 2.924884080886841, + "learning_rate": 9.632675656493968e-05, + "loss": 0.06619247198104858, + "step": 25890 + }, + { + "epoch": 3.6763662171753015, + "grad_norm": 5.365716934204102, + "learning_rate": 9.632533711852378e-05, + "loss": 0.06179612874984741, + "step": 25900 + }, + { + "epoch": 3.6777856635911994, + "grad_norm": 1.3756598234176636, + "learning_rate": 9.632391767210788e-05, + "loss": 0.06040409207344055, + "step": 25910 + }, + { + "epoch": 3.6792051100070973, + "grad_norm": 2.291795015335083, + "learning_rate": 9.632249822569198e-05, + "loss": 0.0542915940284729, + "step": 25920 + }, + { + "epoch": 3.680624556422995, + "grad_norm": 11.811894416809082, + "learning_rate": 9.632107877927609e-05, + "loss": 
0.11134748458862305, + "step": 25930 + }, + { + "epoch": 3.682044002838893, + "grad_norm": 7.955319881439209, + "learning_rate": 9.631965933286018e-05, + "loss": 0.05479052066802979, + "step": 25940 + }, + { + "epoch": 3.6834634492547904, + "grad_norm": 4.885499954223633, + "learning_rate": 9.63182398864443e-05, + "loss": 0.07002484798431396, + "step": 25950 + }, + { + "epoch": 3.6848828956706883, + "grad_norm": 10.359343528747559, + "learning_rate": 9.631682044002839e-05, + "loss": 0.08828185796737671, + "step": 25960 + }, + { + "epoch": 3.686302342086586, + "grad_norm": 0.5266070365905762, + "learning_rate": 9.631540099361249e-05, + "loss": 0.06352581977844238, + "step": 25970 + }, + { + "epoch": 3.687721788502484, + "grad_norm": 4.642971515655518, + "learning_rate": 9.63139815471966e-05, + "loss": 0.10442217588424682, + "step": 25980 + }, + { + "epoch": 3.689141234918382, + "grad_norm": 9.688862800598145, + "learning_rate": 9.63125621007807e-05, + "loss": 0.11465591192245483, + "step": 25990 + }, + { + "epoch": 3.6905606813342797, + "grad_norm": 3.961071252822876, + "learning_rate": 9.631114265436481e-05, + "loss": 0.06878976821899414, + "step": 26000 + }, + { + "epoch": 3.6905606813342797, + "eval_accuracy": 0.9624213136643988, + "eval_loss": 0.10850615799427032, + "eval_runtime": 33.8166, + "eval_samples_per_second": 465.068, + "eval_steps_per_second": 14.549, + "step": 26000 + }, + { + "epoch": 3.6919801277501776, + "grad_norm": 4.773269176483154, + "learning_rate": 9.63097232079489e-05, + "loss": 0.1178863525390625, + "step": 26010 + }, + { + "epoch": 3.693399574166075, + "grad_norm": 2.3169288635253906, + "learning_rate": 9.6308303761533e-05, + "loss": 0.11722581386566162, + "step": 26020 + }, + { + "epoch": 3.694819020581973, + "grad_norm": 0.1308957189321518, + "learning_rate": 9.63068843151171e-05, + "loss": 0.10760715007781982, + "step": 26030 + }, + { + "epoch": 3.6962384669978707, + "grad_norm": 1.4586437940597534, + "learning_rate": 
9.630546486870121e-05, + "loss": 0.05567214488983154, + "step": 26040 + }, + { + "epoch": 3.6976579134137686, + "grad_norm": 2.1539924144744873, + "learning_rate": 9.630404542228532e-05, + "loss": 0.06053451895713806, + "step": 26050 + }, + { + "epoch": 3.6990773598296665, + "grad_norm": 4.069761276245117, + "learning_rate": 9.630262597586942e-05, + "loss": 0.08152814507484436, + "step": 26060 + }, + { + "epoch": 3.7004968062455643, + "grad_norm": 7.161952018737793, + "learning_rate": 9.630120652945352e-05, + "loss": 0.12550090551376342, + "step": 26070 + }, + { + "epoch": 3.701916252661462, + "grad_norm": 2.8434369564056396, + "learning_rate": 9.629978708303761e-05, + "loss": 0.07767623662948608, + "step": 26080 + }, + { + "epoch": 3.7033356990773596, + "grad_norm": 0.979110062122345, + "learning_rate": 9.629836763662173e-05, + "loss": 0.11028465032577514, + "step": 26090 + }, + { + "epoch": 3.704755145493258, + "grad_norm": 0.522153913974762, + "learning_rate": 9.629694819020582e-05, + "loss": 0.04352694153785706, + "step": 26100 + }, + { + "epoch": 3.7061745919091553, + "grad_norm": 12.08892822265625, + "learning_rate": 9.629552874378993e-05, + "loss": 0.0692097783088684, + "step": 26110 + }, + { + "epoch": 3.707594038325053, + "grad_norm": 1.5270307064056396, + "learning_rate": 9.629410929737402e-05, + "loss": 0.07068887948989869, + "step": 26120 + }, + { + "epoch": 3.709013484740951, + "grad_norm": 8.704063415527344, + "learning_rate": 9.629268985095813e-05, + "loss": 0.1482228994369507, + "step": 26130 + }, + { + "epoch": 3.710432931156849, + "grad_norm": 5.276047706604004, + "learning_rate": 9.629127040454224e-05, + "loss": 0.12095144987106324, + "step": 26140 + }, + { + "epoch": 3.7118523775727468, + "grad_norm": 6.4501566886901855, + "learning_rate": 9.628985095812634e-05, + "loss": 0.11314345598220825, + "step": 26150 + }, + { + "epoch": 3.713271823988644, + "grad_norm": 1.444300889968872, + "learning_rate": 9.628843151171045e-05, + "loss": 
0.15359948873519896, + "step": 26160 + }, + { + "epoch": 3.7146912704045425, + "grad_norm": 0.7862725853919983, + "learning_rate": 9.628701206529453e-05, + "loss": 0.04201339781284332, + "step": 26170 + }, + { + "epoch": 3.71611071682044, + "grad_norm": 4.127621650695801, + "learning_rate": 9.628559261887864e-05, + "loss": 0.08473769426345826, + "step": 26180 + }, + { + "epoch": 3.717530163236338, + "grad_norm": 1.7975977659225464, + "learning_rate": 9.628417317246274e-05, + "loss": 0.06359924674034119, + "step": 26190 + }, + { + "epoch": 3.7189496096522356, + "grad_norm": 3.1404027938842773, + "learning_rate": 9.628275372604685e-05, + "loss": 0.10828089714050293, + "step": 26200 + }, + { + "epoch": 3.7203690560681335, + "grad_norm": 10.088000297546387, + "learning_rate": 9.628133427963095e-05, + "loss": 0.0649361789226532, + "step": 26210 + }, + { + "epoch": 3.7217885024840314, + "grad_norm": 4.136114120483398, + "learning_rate": 9.627991483321505e-05, + "loss": 0.09006186723709106, + "step": 26220 + }, + { + "epoch": 3.723207948899929, + "grad_norm": 0.03255090489983559, + "learning_rate": 9.627849538679916e-05, + "loss": 0.11616590023040771, + "step": 26230 + }, + { + "epoch": 3.724627395315827, + "grad_norm": 1.3517063856124878, + "learning_rate": 9.627707594038325e-05, + "loss": 0.05594800710678101, + "step": 26240 + }, + { + "epoch": 3.7260468417317245, + "grad_norm": 0.47422000765800476, + "learning_rate": 9.627565649396737e-05, + "loss": 0.11615034341812133, + "step": 26250 + }, + { + "epoch": 3.7274662881476224, + "grad_norm": 4.764305591583252, + "learning_rate": 9.627423704755146e-05, + "loss": 0.09596173763275147, + "step": 26260 + }, + { + "epoch": 3.7288857345635202, + "grad_norm": 7.1653008460998535, + "learning_rate": 9.627281760113556e-05, + "loss": 0.09863389730453491, + "step": 26270 + }, + { + "epoch": 3.730305180979418, + "grad_norm": 6.6298418045043945, + "learning_rate": 9.627139815471966e-05, + "loss": 0.08772293925285339, + "step": 26280 + 
}, + { + "epoch": 3.731724627395316, + "grad_norm": 2.4701497554779053, + "learning_rate": 9.626997870830377e-05, + "loss": 0.05061078667640686, + "step": 26290 + }, + { + "epoch": 3.7331440738112134, + "grad_norm": 10.637267112731934, + "learning_rate": 9.626855926188787e-05, + "loss": 0.134352707862854, + "step": 26300 + }, + { + "epoch": 3.7345635202271117, + "grad_norm": 1.4602038860321045, + "learning_rate": 9.626713981547198e-05, + "loss": 0.0697918713092804, + "step": 26310 + }, + { + "epoch": 3.735982966643009, + "grad_norm": 2.9405529499053955, + "learning_rate": 9.626572036905607e-05, + "loss": 0.08264508247375488, + "step": 26320 + }, + { + "epoch": 3.737402413058907, + "grad_norm": 1.7835215330123901, + "learning_rate": 9.626430092264017e-05, + "loss": 0.08539316654205323, + "step": 26330 + }, + { + "epoch": 3.738821859474805, + "grad_norm": 4.365605354309082, + "learning_rate": 9.626288147622428e-05, + "loss": 0.07182769775390625, + "step": 26340 + }, + { + "epoch": 3.7402413058907027, + "grad_norm": 0.8774107098579407, + "learning_rate": 9.626146202980838e-05, + "loss": 0.06151903867721557, + "step": 26350 + }, + { + "epoch": 3.7416607523066006, + "grad_norm": 6.543043613433838, + "learning_rate": 9.626004258339249e-05, + "loss": 0.08403420448303223, + "step": 26360 + }, + { + "epoch": 3.743080198722498, + "grad_norm": 8.38376522064209, + "learning_rate": 9.625862313697659e-05, + "loss": 0.09423916339874268, + "step": 26370 + }, + { + "epoch": 3.7444996451383963, + "grad_norm": 0.7524133920669556, + "learning_rate": 9.625720369056069e-05, + "loss": 0.09477906823158264, + "step": 26380 + }, + { + "epoch": 3.7459190915542937, + "grad_norm": 4.932705879211426, + "learning_rate": 9.625578424414478e-05, + "loss": 0.09350728392601013, + "step": 26390 + }, + { + "epoch": 3.7473385379701916, + "grad_norm": 1.0093165636062622, + "learning_rate": 9.62543647977289e-05, + "loss": 0.08254989981651306, + "step": 26400 + }, + { + "epoch": 3.7487579843860894, + 
"grad_norm": 10.72624683380127, + "learning_rate": 9.625294535131299e-05, + "loss": 0.14923367500305176, + "step": 26410 + }, + { + "epoch": 3.7501774308019873, + "grad_norm": 2.8247926235198975, + "learning_rate": 9.62515259048971e-05, + "loss": 0.048303854465484616, + "step": 26420 + }, + { + "epoch": 3.751596877217885, + "grad_norm": 1.634414792060852, + "learning_rate": 9.62501064584812e-05, + "loss": 0.04281752109527588, + "step": 26430 + }, + { + "epoch": 3.7530163236337826, + "grad_norm": 7.190004825592041, + "learning_rate": 9.62486870120653e-05, + "loss": 0.12459969520568848, + "step": 26440 + }, + { + "epoch": 3.754435770049681, + "grad_norm": 0.9193140864372253, + "learning_rate": 9.624726756564941e-05, + "loss": 0.10240849256515502, + "step": 26450 + }, + { + "epoch": 3.7558552164655783, + "grad_norm": 5.948113918304443, + "learning_rate": 9.62458481192335e-05, + "loss": 0.07362242937088012, + "step": 26460 + }, + { + "epoch": 3.757274662881476, + "grad_norm": 6.859321117401123, + "learning_rate": 9.624442867281762e-05, + "loss": 0.06381948590278626, + "step": 26470 + }, + { + "epoch": 3.758694109297374, + "grad_norm": 8.806060791015625, + "learning_rate": 9.62430092264017e-05, + "loss": 0.10844473838806153, + "step": 26480 + }, + { + "epoch": 3.760113555713272, + "grad_norm": 6.027776718139648, + "learning_rate": 9.624158977998581e-05, + "loss": 0.044374221563339235, + "step": 26490 + }, + { + "epoch": 3.7615330021291697, + "grad_norm": 12.30217456817627, + "learning_rate": 9.624017033356991e-05, + "loss": 0.08910216689109803, + "step": 26500 + }, + { + "epoch": 3.7615330021291697, + "eval_accuracy": 0.9674445221593438, + "eval_loss": 0.09445588290691376, + "eval_runtime": 35.0795, + "eval_samples_per_second": 448.325, + "eval_steps_per_second": 14.025, + "step": 26500 + }, + { + "epoch": 3.762952448545067, + "grad_norm": 9.912172317504883, + "learning_rate": 9.623875088715402e-05, + "loss": 0.07598323225975037, + "step": 26510 + }, + { + "epoch": 
3.7643718949609655, + "grad_norm": 6.507425785064697, + "learning_rate": 9.623733144073812e-05, + "loss": 0.10796759128570557, + "step": 26520 + }, + { + "epoch": 3.765791341376863, + "grad_norm": 0.8920461535453796, + "learning_rate": 9.623591199432221e-05, + "loss": 0.0660994827747345, + "step": 26530 + }, + { + "epoch": 3.7672107877927608, + "grad_norm": 0.7246550917625427, + "learning_rate": 9.623449254790632e-05, + "loss": 0.0721944272518158, + "step": 26540 + }, + { + "epoch": 3.7686302342086586, + "grad_norm": 8.431264877319336, + "learning_rate": 9.623307310149042e-05, + "loss": 0.11602548360824586, + "step": 26550 + }, + { + "epoch": 3.7700496806245565, + "grad_norm": 2.8587067127227783, + "learning_rate": 9.623165365507453e-05, + "loss": 0.0831636905670166, + "step": 26560 + }, + { + "epoch": 3.7714691270404543, + "grad_norm": 6.571961402893066, + "learning_rate": 9.623023420865863e-05, + "loss": 0.08520074486732483, + "step": 26570 + }, + { + "epoch": 3.7728885734563518, + "grad_norm": 9.178510665893555, + "learning_rate": 9.622881476224273e-05, + "loss": 0.10961424112319947, + "step": 26580 + }, + { + "epoch": 3.77430801987225, + "grad_norm": 5.433183670043945, + "learning_rate": 9.622739531582682e-05, + "loss": 0.035625565052032473, + "step": 26590 + }, + { + "epoch": 3.7757274662881475, + "grad_norm": 3.318091869354248, + "learning_rate": 9.622597586941094e-05, + "loss": 0.08876525163650513, + "step": 26600 + }, + { + "epoch": 3.7771469127040453, + "grad_norm": 5.857662677764893, + "learning_rate": 9.622455642299503e-05, + "loss": 0.09441637992858887, + "step": 26610 + }, + { + "epoch": 3.778566359119943, + "grad_norm": 0.6901429891586304, + "learning_rate": 9.622313697657914e-05, + "loss": 0.1499311089515686, + "step": 26620 + }, + { + "epoch": 3.779985805535841, + "grad_norm": 4.66740608215332, + "learning_rate": 9.622171753016324e-05, + "loss": 0.08082123398780823, + "step": 26630 + }, + { + "epoch": 3.781405251951739, + "grad_norm": 
1.0753750801086426, + "learning_rate": 9.622029808374734e-05, + "loss": 0.11261454820632935, + "step": 26640 + }, + { + "epoch": 3.7828246983676364, + "grad_norm": 1.8048758506774902, + "learning_rate": 9.621887863733145e-05, + "loss": 0.04800321459770203, + "step": 26650 + }, + { + "epoch": 3.7842441447835347, + "grad_norm": 4.112722396850586, + "learning_rate": 9.621745919091555e-05, + "loss": 0.05345563292503357, + "step": 26660 + }, + { + "epoch": 3.785663591199432, + "grad_norm": 4.393552303314209, + "learning_rate": 9.621603974449966e-05, + "loss": 0.06797432899475098, + "step": 26670 + }, + { + "epoch": 3.78708303761533, + "grad_norm": 5.8782057762146, + "learning_rate": 9.621462029808376e-05, + "loss": 0.10248314142227173, + "step": 26680 + }, + { + "epoch": 3.788502484031228, + "grad_norm": 4.203194618225098, + "learning_rate": 9.621320085166785e-05, + "loss": 0.07673492431640624, + "step": 26690 + }, + { + "epoch": 3.7899219304471257, + "grad_norm": 6.538164138793945, + "learning_rate": 9.621178140525195e-05, + "loss": 0.14795901775360107, + "step": 26700 + }, + { + "epoch": 3.7913413768630235, + "grad_norm": 11.21495532989502, + "learning_rate": 9.621036195883606e-05, + "loss": 0.11105353832244873, + "step": 26710 + }, + { + "epoch": 3.792760823278921, + "grad_norm": 9.616037368774414, + "learning_rate": 9.620894251242016e-05, + "loss": 0.06156564950942993, + "step": 26720 + }, + { + "epoch": 3.7941802696948193, + "grad_norm": 1.3275951147079468, + "learning_rate": 9.620752306600427e-05, + "loss": 0.10542140007019044, + "step": 26730 + }, + { + "epoch": 3.7955997161107167, + "grad_norm": 0.3900807797908783, + "learning_rate": 9.620610361958837e-05, + "loss": 0.07363483309745789, + "step": 26740 + }, + { + "epoch": 3.7970191625266145, + "grad_norm": 0.9365026950836182, + "learning_rate": 9.620468417317246e-05, + "loss": 0.13672546148300171, + "step": 26750 + }, + { + "epoch": 3.7984386089425124, + "grad_norm": 5.348412990570068, + "learning_rate": 
9.620326472675658e-05, + "loss": 0.08223192691802979, + "step": 26760 + }, + { + "epoch": 3.7998580553584103, + "grad_norm": 5.438634872436523, + "learning_rate": 9.620184528034067e-05, + "loss": 0.0626322865486145, + "step": 26770 + }, + { + "epoch": 3.801277501774308, + "grad_norm": 3.877898693084717, + "learning_rate": 9.620042583392478e-05, + "loss": 0.08450109958648681, + "step": 26780 + }, + { + "epoch": 3.802696948190206, + "grad_norm": 1.7447080612182617, + "learning_rate": 9.619900638750887e-05, + "loss": 0.0839974820613861, + "step": 26790 + }, + { + "epoch": 3.804116394606104, + "grad_norm": 3.0330183506011963, + "learning_rate": 9.619758694109298e-05, + "loss": 0.09644001126289367, + "step": 26800 + }, + { + "epoch": 3.8055358410220013, + "grad_norm": 1.270749568939209, + "learning_rate": 9.619616749467708e-05, + "loss": 0.0933254063129425, + "step": 26810 + }, + { + "epoch": 3.806955287437899, + "grad_norm": 7.720582485198975, + "learning_rate": 9.619474804826119e-05, + "loss": 0.10009632110595704, + "step": 26820 + }, + { + "epoch": 3.808374733853797, + "grad_norm": 0.09740854054689407, + "learning_rate": 9.619332860184528e-05, + "loss": 0.055448722839355466, + "step": 26830 + }, + { + "epoch": 3.809794180269695, + "grad_norm": 7.304279327392578, + "learning_rate": 9.619190915542938e-05, + "loss": 0.07708572149276734, + "step": 26840 + }, + { + "epoch": 3.8112136266855927, + "grad_norm": 2.435711622238159, + "learning_rate": 9.619048970901349e-05, + "loss": 0.03736964464187622, + "step": 26850 + }, + { + "epoch": 3.8126330731014906, + "grad_norm": 1.575897455215454, + "learning_rate": 9.618907026259759e-05, + "loss": 0.08769443035125732, + "step": 26860 + }, + { + "epoch": 3.8140525195173884, + "grad_norm": 2.552058458328247, + "learning_rate": 9.61876508161817e-05, + "loss": 0.08449045419692994, + "step": 26870 + }, + { + "epoch": 3.815471965933286, + "grad_norm": 5.736693382263184, + "learning_rate": 9.61862313697658e-05, + "loss": 
0.037884673476219176, + "step": 26880 + }, + { + "epoch": 3.8168914123491837, + "grad_norm": 1.4216078519821167, + "learning_rate": 9.61848119233499e-05, + "loss": 0.08100056648254395, + "step": 26890 + }, + { + "epoch": 3.8183108587650816, + "grad_norm": 2.799546480178833, + "learning_rate": 9.618339247693399e-05, + "loss": 0.06523057818412781, + "step": 26900 + }, + { + "epoch": 3.8197303051809794, + "grad_norm": 1.2117537260055542, + "learning_rate": 9.61819730305181e-05, + "loss": 0.049738320708274844, + "step": 26910 + }, + { + "epoch": 3.8211497515968773, + "grad_norm": 4.893204212188721, + "learning_rate": 9.61805535841022e-05, + "loss": 0.07888569235801697, + "step": 26920 + }, + { + "epoch": 3.822569198012775, + "grad_norm": 7.409163475036621, + "learning_rate": 9.617913413768631e-05, + "loss": 0.07680698037147522, + "step": 26930 + }, + { + "epoch": 3.823988644428673, + "grad_norm": 3.720153331756592, + "learning_rate": 9.617771469127041e-05, + "loss": 0.09198516607284546, + "step": 26940 + }, + { + "epoch": 3.8254080908445705, + "grad_norm": 6.0060224533081055, + "learning_rate": 9.61762952448545e-05, + "loss": 0.07883418202400208, + "step": 26950 + }, + { + "epoch": 3.8268275372604683, + "grad_norm": 1.143011212348938, + "learning_rate": 9.617487579843862e-05, + "loss": 0.04225245714187622, + "step": 26960 + }, + { + "epoch": 3.828246983676366, + "grad_norm": 12.127354621887207, + "learning_rate": 9.617345635202271e-05, + "loss": 0.05237703919410706, + "step": 26970 + }, + { + "epoch": 3.829666430092264, + "grad_norm": 5.843609809875488, + "learning_rate": 9.617203690560683e-05, + "loss": 0.08967744708061218, + "step": 26980 + }, + { + "epoch": 3.831085876508162, + "grad_norm": 2.9347615242004395, + "learning_rate": 9.617061745919091e-05, + "loss": 0.08393052220344543, + "step": 26990 + }, + { + "epoch": 3.8325053229240598, + "grad_norm": 4.1316657066345215, + "learning_rate": 9.616919801277502e-05, + "loss": 0.09489677548408508, + "step": 27000 + }, + 
{ + "epoch": 3.8325053229240598, + "eval_accuracy": 0.9656641444649329, + "eval_loss": 0.10252257436513901, + "eval_runtime": 33.8687, + "eval_samples_per_second": 464.352, + "eval_steps_per_second": 14.527, + "step": 27000 + }, + { + "epoch": 3.8339247693399576, + "grad_norm": 2.383795976638794, + "learning_rate": 9.616777856635912e-05, + "loss": 0.062172305583953855, + "step": 27010 + }, + { + "epoch": 3.835344215755855, + "grad_norm": 4.4788737297058105, + "learning_rate": 9.616635911994323e-05, + "loss": 0.08685917258262635, + "step": 27020 + }, + { + "epoch": 3.836763662171753, + "grad_norm": 4.786855220794678, + "learning_rate": 9.616493967352733e-05, + "loss": 0.04608690142631531, + "step": 27030 + }, + { + "epoch": 3.8381831085876508, + "grad_norm": 5.334591388702393, + "learning_rate": 9.616352022711144e-05, + "loss": 0.08949378728866578, + "step": 27040 + }, + { + "epoch": 3.8396025550035486, + "grad_norm": 6.602955341339111, + "learning_rate": 9.616210078069553e-05, + "loss": 0.08425779342651367, + "step": 27050 + }, + { + "epoch": 3.8410220014194465, + "grad_norm": 6.772454261779785, + "learning_rate": 9.616068133427963e-05, + "loss": 0.08206725120544434, + "step": 27060 + }, + { + "epoch": 3.8424414478353444, + "grad_norm": 2.036407709121704, + "learning_rate": 9.615926188786374e-05, + "loss": 0.12470332384109498, + "step": 27070 + }, + { + "epoch": 3.843860894251242, + "grad_norm": 5.146540641784668, + "learning_rate": 9.615784244144784e-05, + "loss": 0.041440796852111814, + "step": 27080 + }, + { + "epoch": 3.8452803406671396, + "grad_norm": 4.734882831573486, + "learning_rate": 9.615642299503195e-05, + "loss": 0.10296415090560913, + "step": 27090 + }, + { + "epoch": 3.8466997870830375, + "grad_norm": 4.153848171234131, + "learning_rate": 9.615500354861604e-05, + "loss": 0.10323355197906495, + "step": 27100 + }, + { + "epoch": 3.8481192334989354, + "grad_norm": 3.4101874828338623, + "learning_rate": 9.615358410220015e-05, + "loss": 
0.05214914083480835, + "step": 27110 + }, + { + "epoch": 3.8495386799148332, + "grad_norm": 2.5987448692321777, + "learning_rate": 9.615216465578424e-05, + "loss": 0.06731126308441163, + "step": 27120 + }, + { + "epoch": 3.850958126330731, + "grad_norm": 0.9972496628761292, + "learning_rate": 9.615074520936835e-05, + "loss": 0.09721655249595643, + "step": 27130 + }, + { + "epoch": 3.852377572746629, + "grad_norm": 2.8008410930633545, + "learning_rate": 9.614932576295245e-05, + "loss": 0.09973838925361633, + "step": 27140 + }, + { + "epoch": 3.853797019162527, + "grad_norm": 6.48881721496582, + "learning_rate": 9.614790631653655e-05, + "loss": 0.09506261944770814, + "step": 27150 + }, + { + "epoch": 3.8552164655784242, + "grad_norm": 4.826530933380127, + "learning_rate": 9.614648687012066e-05, + "loss": 0.07496817111968994, + "step": 27160 + }, + { + "epoch": 3.856635911994322, + "grad_norm": 1.649975061416626, + "learning_rate": 9.614506742370476e-05, + "loss": 0.07133917212486267, + "step": 27170 + }, + { + "epoch": 3.85805535841022, + "grad_norm": 5.264161586761475, + "learning_rate": 9.614364797728887e-05, + "loss": 0.0537201464176178, + "step": 27180 + }, + { + "epoch": 3.859474804826118, + "grad_norm": 4.296278953552246, + "learning_rate": 9.614222853087297e-05, + "loss": 0.08852277994155884, + "step": 27190 + }, + { + "epoch": 3.8608942512420157, + "grad_norm": 3.727269172668457, + "learning_rate": 9.614080908445706e-05, + "loss": 0.12326380014419555, + "step": 27200 + }, + { + "epoch": 3.8623136976579135, + "grad_norm": 1.0287762880325317, + "learning_rate": 9.613938963804116e-05, + "loss": 0.08480355739593506, + "step": 27210 + }, + { + "epoch": 3.8637331440738114, + "grad_norm": 2.53420090675354, + "learning_rate": 9.613797019162527e-05, + "loss": 0.08443946838378906, + "step": 27220 + }, + { + "epoch": 3.865152590489709, + "grad_norm": 2.7054736614227295, + "learning_rate": 9.613655074520937e-05, + "loss": 0.12175383567810058, + "step": 27230 + }, + { + 
"epoch": 3.8665720369056067, + "grad_norm": 0.3844822943210602, + "learning_rate": 9.613513129879348e-05, + "loss": 0.05505117177963257, + "step": 27240 + }, + { + "epoch": 3.8679914833215046, + "grad_norm": 1.9096373319625854, + "learning_rate": 9.613371185237758e-05, + "loss": 0.08993933200836182, + "step": 27250 + }, + { + "epoch": 3.8694109297374024, + "grad_norm": 9.668591499328613, + "learning_rate": 9.613229240596167e-05, + "loss": 0.09555359482765198, + "step": 27260 + }, + { + "epoch": 3.8708303761533003, + "grad_norm": 6.2174787521362305, + "learning_rate": 9.613087295954579e-05, + "loss": 0.07780954241752625, + "step": 27270 + }, + { + "epoch": 3.872249822569198, + "grad_norm": 6.8190741539001465, + "learning_rate": 9.612945351312988e-05, + "loss": 0.06228452920913696, + "step": 27280 + }, + { + "epoch": 3.873669268985096, + "grad_norm": 5.052826881408691, + "learning_rate": 9.6128034066714e-05, + "loss": 0.05573546886444092, + "step": 27290 + }, + { + "epoch": 3.8750887154009934, + "grad_norm": 2.147706985473633, + "learning_rate": 9.612661462029808e-05, + "loss": 0.030921798944473267, + "step": 27300 + }, + { + "epoch": 3.8765081618168913, + "grad_norm": 1.844710350036621, + "learning_rate": 9.612519517388219e-05, + "loss": 0.02291310876607895, + "step": 27310 + }, + { + "epoch": 3.877927608232789, + "grad_norm": 4.277228355407715, + "learning_rate": 9.612377572746629e-05, + "loss": 0.09191072583198548, + "step": 27320 + }, + { + "epoch": 3.879347054648687, + "grad_norm": 0.5011488199234009, + "learning_rate": 9.61223562810504e-05, + "loss": 0.09018791913986206, + "step": 27330 + }, + { + "epoch": 3.880766501064585, + "grad_norm": 7.728168964385986, + "learning_rate": 9.61209368346345e-05, + "loss": 0.17614935636520385, + "step": 27340 + }, + { + "epoch": 3.8821859474804827, + "grad_norm": 7.640387058258057, + "learning_rate": 9.611951738821859e-05, + "loss": 0.053128105401992795, + "step": 27350 + }, + { + "epoch": 3.8836053938963806, + "grad_norm": 
4.586857795715332, + "learning_rate": 9.61180979418027e-05, + "loss": 0.07566349506378174, + "step": 27360 + }, + { + "epoch": 3.885024840312278, + "grad_norm": 9.557601928710938, + "learning_rate": 9.61166784953868e-05, + "loss": 0.16053431034088134, + "step": 27370 + }, + { + "epoch": 3.886444286728176, + "grad_norm": 2.9451968669891357, + "learning_rate": 9.61154009936125e-05, + "loss": 0.10314490795135497, + "step": 27380 + }, + { + "epoch": 3.8878637331440737, + "grad_norm": 2.856229305267334, + "learning_rate": 9.61139815471966e-05, + "loss": 0.03152236342430115, + "step": 27390 + }, + { + "epoch": 3.8892831795599716, + "grad_norm": 10.189471244812012, + "learning_rate": 9.61125621007807e-05, + "loss": 0.07608083486557007, + "step": 27400 + }, + { + "epoch": 3.8907026259758695, + "grad_norm": 4.597877025604248, + "learning_rate": 9.61111426543648e-05, + "loss": 0.09993425607681275, + "step": 27410 + }, + { + "epoch": 3.8921220723917673, + "grad_norm": 4.780374050140381, + "learning_rate": 9.610972320794891e-05, + "loss": 0.12308070659637452, + "step": 27420 + }, + { + "epoch": 3.893541518807665, + "grad_norm": 3.518791675567627, + "learning_rate": 9.6108303761533e-05, + "loss": 0.06599195003509521, + "step": 27430 + }, + { + "epoch": 3.8949609652235626, + "grad_norm": 0.6286087036132812, + "learning_rate": 9.610688431511711e-05, + "loss": 0.0612760066986084, + "step": 27440 + }, + { + "epoch": 3.8963804116394605, + "grad_norm": 4.189844608306885, + "learning_rate": 9.610546486870121e-05, + "loss": 0.09225243330001831, + "step": 27450 + }, + { + "epoch": 3.8977998580553583, + "grad_norm": 0.4884074032306671, + "learning_rate": 9.610404542228532e-05, + "loss": 0.09044739603996277, + "step": 27460 + }, + { + "epoch": 3.899219304471256, + "grad_norm": 4.86144495010376, + "learning_rate": 9.610262597586942e-05, + "loss": 0.09649211764335633, + "step": 27470 + }, + { + "epoch": 3.900638750887154, + "grad_norm": 0.6673234701156616, + "learning_rate": 
9.610120652945351e-05, + "loss": 0.06789471507072449, + "step": 27480 + }, + { + "epoch": 3.902058197303052, + "grad_norm": 5.458202362060547, + "learning_rate": 9.609978708303762e-05, + "loss": 0.06328256726264954, + "step": 27490 + }, + { + "epoch": 3.90347764371895, + "grad_norm": 3.2472307682037354, + "learning_rate": 9.609836763662172e-05, + "loss": 0.03994499444961548, + "step": 27500 + }, + { + "epoch": 3.90347764371895, + "eval_accuracy": 0.9700515037833026, + "eval_loss": 0.08563879877328873, + "eval_runtime": 35.5026, + "eval_samples_per_second": 442.982, + "eval_steps_per_second": 13.858, + "step": 27500 + }, + { + "epoch": 3.904897090134847, + "grad_norm": 6.66304874420166, + "learning_rate": 9.609694819020583e-05, + "loss": 0.10047676563262939, + "step": 27510 + }, + { + "epoch": 3.906316536550745, + "grad_norm": 4.747247219085693, + "learning_rate": 9.609552874378993e-05, + "loss": 0.0649915337562561, + "step": 27520 + }, + { + "epoch": 3.907735982966643, + "grad_norm": 6.686343669891357, + "learning_rate": 9.609410929737403e-05, + "loss": 0.07071633338928222, + "step": 27530 + }, + { + "epoch": 3.909155429382541, + "grad_norm": 6.8346266746521, + "learning_rate": 9.609268985095812e-05, + "loss": 0.08757308721542359, + "step": 27540 + }, + { + "epoch": 3.9105748757984387, + "grad_norm": 4.510685443878174, + "learning_rate": 9.609127040454223e-05, + "loss": 0.057412338256835935, + "step": 27550 + }, + { + "epoch": 3.9119943222143365, + "grad_norm": 3.572941541671753, + "learning_rate": 9.608985095812633e-05, + "loss": 0.11244267225265503, + "step": 27560 + }, + { + "epoch": 3.9134137686302344, + "grad_norm": 1.7603306770324707, + "learning_rate": 9.608843151171044e-05, + "loss": 0.08775643706321716, + "step": 27570 + }, + { + "epoch": 3.914833215046132, + "grad_norm": 1.7416644096374512, + "learning_rate": 9.608701206529454e-05, + "loss": 0.04108691513538361, + "step": 27580 + }, + { + "epoch": 3.9162526614620297, + "grad_norm": 11.181812286376953, + 
"learning_rate": 9.608559261887864e-05, + "loss": 0.09978156089782715, + "step": 27590 + }, + { + "epoch": 3.9176721078779275, + "grad_norm": 13.193679809570312, + "learning_rate": 9.608417317246275e-05, + "loss": 0.0903174638748169, + "step": 27600 + }, + { + "epoch": 3.9190915542938254, + "grad_norm": 4.218955993652344, + "learning_rate": 9.608275372604685e-05, + "loss": 0.0756367027759552, + "step": 27610 + }, + { + "epoch": 3.9205110007097232, + "grad_norm": 3.685628652572632, + "learning_rate": 9.608133427963096e-05, + "loss": 0.15287163257598876, + "step": 27620 + }, + { + "epoch": 3.921930447125621, + "grad_norm": 3.4429805278778076, + "learning_rate": 9.607991483321504e-05, + "loss": 0.06430520415306092, + "step": 27630 + }, + { + "epoch": 3.923349893541519, + "grad_norm": 2.0270931720733643, + "learning_rate": 9.607849538679915e-05, + "loss": 0.10801482200622559, + "step": 27640 + }, + { + "epoch": 3.9247693399574164, + "grad_norm": 6.879272937774658, + "learning_rate": 9.607707594038325e-05, + "loss": 0.06597111821174621, + "step": 27650 + }, + { + "epoch": 3.9261887863733143, + "grad_norm": 1.249678611755371, + "learning_rate": 9.607565649396736e-05, + "loss": 0.03797664046287537, + "step": 27660 + }, + { + "epoch": 3.927608232789212, + "grad_norm": 0.24534237384796143, + "learning_rate": 9.607423704755146e-05, + "loss": 0.02991785407066345, + "step": 27670 + }, + { + "epoch": 3.92902767920511, + "grad_norm": 0.6622792482376099, + "learning_rate": 9.607281760113555e-05, + "loss": 0.02254675328731537, + "step": 27680 + }, + { + "epoch": 3.930447125621008, + "grad_norm": 0.3865921199321747, + "learning_rate": 9.607139815471967e-05, + "loss": 0.05718799233436585, + "step": 27690 + }, + { + "epoch": 3.9318665720369057, + "grad_norm": 4.140193462371826, + "learning_rate": 9.606997870830376e-05, + "loss": 0.08032118082046509, + "step": 27700 + }, + { + "epoch": 3.9332860184528036, + "grad_norm": 4.738871097564697, + "learning_rate": 9.606855926188787e-05, + 
"loss": 0.04987369179725647, + "step": 27710 + }, + { + "epoch": 3.934705464868701, + "grad_norm": 9.0142183303833, + "learning_rate": 9.606713981547197e-05, + "loss": 0.08996903896331787, + "step": 27720 + }, + { + "epoch": 3.936124911284599, + "grad_norm": 0.1371709704399109, + "learning_rate": 9.606572036905608e-05, + "loss": 0.05264982581138611, + "step": 27730 + }, + { + "epoch": 3.9375443577004967, + "grad_norm": 4.215709209442139, + "learning_rate": 9.606430092264017e-05, + "loss": 0.09554726481437684, + "step": 27740 + }, + { + "epoch": 3.9389638041163946, + "grad_norm": 1.1737656593322754, + "learning_rate": 9.606288147622428e-05, + "loss": 0.06068928241729736, + "step": 27750 + }, + { + "epoch": 3.9403832505322924, + "grad_norm": 1.0329593420028687, + "learning_rate": 9.606146202980837e-05, + "loss": 0.0832507848739624, + "step": 27760 + }, + { + "epoch": 3.9418026969481903, + "grad_norm": 1.4225534200668335, + "learning_rate": 9.606004258339249e-05, + "loss": 0.10741715431213379, + "step": 27770 + }, + { + "epoch": 3.943222143364088, + "grad_norm": 2.706094741821289, + "learning_rate": 9.605862313697658e-05, + "loss": 0.09146757125854492, + "step": 27780 + }, + { + "epoch": 3.9446415897799856, + "grad_norm": 7.178990364074707, + "learning_rate": 9.605720369056068e-05, + "loss": 0.09984794855117798, + "step": 27790 + }, + { + "epoch": 3.9460610361958834, + "grad_norm": 3.8521437644958496, + "learning_rate": 9.605578424414479e-05, + "loss": 0.1205756664276123, + "step": 27800 + }, + { + "epoch": 3.9474804826117813, + "grad_norm": 5.163641929626465, + "learning_rate": 9.605436479772889e-05, + "loss": 0.029975688457489012, + "step": 27810 + }, + { + "epoch": 3.948899929027679, + "grad_norm": 2.2490506172180176, + "learning_rate": 9.605308729595457e-05, + "loss": 0.1587289094924927, + "step": 27820 + }, + { + "epoch": 3.950319375443577, + "grad_norm": 6.972119331359863, + "learning_rate": 9.605166784953868e-05, + "loss": 0.09115952849388123, + "step": 27830 + 
}, + { + "epoch": 3.951738821859475, + "grad_norm": 0.4664672017097473, + "learning_rate": 9.60502484031228e-05, + "loss": 0.07655965089797974, + "step": 27840 + }, + { + "epoch": 3.9531582682753728, + "grad_norm": 4.678897380828857, + "learning_rate": 9.604882895670689e-05, + "loss": 0.052338707447052005, + "step": 27850 + }, + { + "epoch": 3.95457771469127, + "grad_norm": 4.429093837738037, + "learning_rate": 9.604740951029099e-05, + "loss": 0.054144054651260376, + "step": 27860 + }, + { + "epoch": 3.9559971611071685, + "grad_norm": 2.022493600845337, + "learning_rate": 9.604599006387509e-05, + "loss": 0.012421280145645142, + "step": 27870 + }, + { + "epoch": 3.957416607523066, + "grad_norm": 8.385174751281738, + "learning_rate": 9.60445706174592e-05, + "loss": 0.0690504789352417, + "step": 27880 + }, + { + "epoch": 3.9588360539389638, + "grad_norm": 1.3558812141418457, + "learning_rate": 9.60431511710433e-05, + "loss": 0.051670241355896, + "step": 27890 + }, + { + "epoch": 3.9602555003548616, + "grad_norm": 7.869537830352783, + "learning_rate": 9.60417317246274e-05, + "loss": 0.14572601318359374, + "step": 27900 + }, + { + "epoch": 3.9616749467707595, + "grad_norm": 3.734588623046875, + "learning_rate": 9.604031227821149e-05, + "loss": 0.10068864822387695, + "step": 27910 + }, + { + "epoch": 3.9630943931866573, + "grad_norm": 6.552592754364014, + "learning_rate": 9.60388928317956e-05, + "loss": 0.06942902803421021, + "step": 27920 + }, + { + "epoch": 3.9645138396025548, + "grad_norm": 1.7775979042053223, + "learning_rate": 9.603747338537971e-05, + "loss": 0.07231731414794922, + "step": 27930 + }, + { + "epoch": 3.965933286018453, + "grad_norm": 8.738762855529785, + "learning_rate": 9.603605393896381e-05, + "loss": 0.07564049959182739, + "step": 27940 + }, + { + "epoch": 3.9673527324343505, + "grad_norm": 3.3592703342437744, + "learning_rate": 9.603463449254792e-05, + "loss": 0.09379636645317077, + "step": 27950 + }, + { + "epoch": 3.9687721788502484, + 
"grad_norm": 7.767439842224121, + "learning_rate": 9.6033215046132e-05, + "loss": 0.09403069615364075, + "step": 27960 + }, + { + "epoch": 3.970191625266146, + "grad_norm": 11.272348403930664, + "learning_rate": 9.603179559971612e-05, + "loss": 0.18936721086502076, + "step": 27970 + }, + { + "epoch": 3.971611071682044, + "grad_norm": 5.963432312011719, + "learning_rate": 9.603037615330021e-05, + "loss": 0.12643344402313234, + "step": 27980 + }, + { + "epoch": 3.973030518097942, + "grad_norm": 6.23541784286499, + "learning_rate": 9.602895670688432e-05, + "loss": 0.13420791625976564, + "step": 27990 + }, + { + "epoch": 3.9744499645138394, + "grad_norm": 6.839860439300537, + "learning_rate": 9.602753726046842e-05, + "loss": 0.07564538717269897, + "step": 28000 + }, + { + "epoch": 3.9744499645138394, + "eval_accuracy": 0.9512939530743307, + "eval_loss": 0.14611348509788513, + "eval_runtime": 35.5209, + "eval_samples_per_second": 442.754, + "eval_steps_per_second": 13.851, + "step": 28000 + }, + { + "epoch": 3.9758694109297377, + "grad_norm": 5.825727939605713, + "learning_rate": 9.602611781405252e-05, + "loss": 0.1116330862045288, + "step": 28010 + }, + { + "epoch": 3.977288857345635, + "grad_norm": 2.08404541015625, + "learning_rate": 9.602469836763663e-05, + "loss": 0.046886110305786134, + "step": 28020 + }, + { + "epoch": 3.978708303761533, + "grad_norm": 16.22827911376953, + "learning_rate": 9.602327892122073e-05, + "loss": 0.0687228798866272, + "step": 28030 + }, + { + "epoch": 3.980127750177431, + "grad_norm": 4.588597774505615, + "learning_rate": 9.602185947480484e-05, + "loss": 0.09896028637886048, + "step": 28040 + }, + { + "epoch": 3.9815471965933287, + "grad_norm": 0.9230074882507324, + "learning_rate": 9.602044002838894e-05, + "loss": 0.07626842856407165, + "step": 28050 + }, + { + "epoch": 3.9829666430092265, + "grad_norm": 5.006918907165527, + "learning_rate": 9.601902058197305e-05, + "loss": 0.06256378293037415, + "step": 28060 + }, + { + "epoch": 
3.984386089425124, + "grad_norm": 0.970280647277832, + "learning_rate": 9.601760113555713e-05, + "loss": 0.05959609746932983, + "step": 28070 + }, + { + "epoch": 3.9858055358410223, + "grad_norm": 1.083361268043518, + "learning_rate": 9.601618168914124e-05, + "loss": 0.10017790794372558, + "step": 28080 + }, + { + "epoch": 3.9872249822569197, + "grad_norm": 1.0272605419158936, + "learning_rate": 9.601476224272534e-05, + "loss": 0.10394766330718994, + "step": 28090 + }, + { + "epoch": 3.9886444286728175, + "grad_norm": 2.8747732639312744, + "learning_rate": 9.601334279630945e-05, + "loss": 0.08298658728599548, + "step": 28100 + }, + { + "epoch": 3.9900638750887154, + "grad_norm": 1.3981420993804932, + "learning_rate": 9.601192334989355e-05, + "loss": 0.09550594687461852, + "step": 28110 + }, + { + "epoch": 3.9914833215046133, + "grad_norm": 2.6726458072662354, + "learning_rate": 9.601050390347764e-05, + "loss": 0.06938835978507996, + "step": 28120 + }, + { + "epoch": 3.992902767920511, + "grad_norm": 8.076497077941895, + "learning_rate": 9.600908445706175e-05, + "loss": 0.12478160858154297, + "step": 28130 + }, + { + "epoch": 3.9943222143364085, + "grad_norm": 4.511760234832764, + "learning_rate": 9.600766501064585e-05, + "loss": 0.040727350115776065, + "step": 28140 + }, + { + "epoch": 3.995741660752307, + "grad_norm": 5.094786167144775, + "learning_rate": 9.600624556422996e-05, + "loss": 0.046387788653373715, + "step": 28150 + }, + { + "epoch": 3.9971611071682043, + "grad_norm": 1.6641865968704224, + "learning_rate": 9.600482611781406e-05, + "loss": 0.09861783981323242, + "step": 28160 + }, + { + "epoch": 3.998580553584102, + "grad_norm": 1.3645977973937988, + "learning_rate": 9.600340667139816e-05, + "loss": 0.046815165877342226, + "step": 28170 + }, + { + "epoch": 4.0, + "grad_norm": 3.4103214740753174, + "learning_rate": 9.600198722498226e-05, + "loss": 0.07011319398880005, + "step": 28180 + }, + { + "epoch": 4.001419446415897, + "grad_norm": 4.563676834106445, 
+ "learning_rate": 9.600056777856637e-05, + "loss": 0.08069857358932495, + "step": 28190 + }, + { + "epoch": 4.002838892831796, + "grad_norm": 7.2024407386779785, + "learning_rate": 9.599914833215046e-05, + "loss": 0.07496459484100342, + "step": 28200 + }, + { + "epoch": 4.004258339247693, + "grad_norm": 6.962199687957764, + "learning_rate": 9.599772888573457e-05, + "loss": 0.04304870367050171, + "step": 28210 + }, + { + "epoch": 4.0056777856635915, + "grad_norm": 3.1831023693084717, + "learning_rate": 9.599630943931867e-05, + "loss": 0.03946310579776764, + "step": 28220 + }, + { + "epoch": 4.007097232079489, + "grad_norm": 6.079113006591797, + "learning_rate": 9.599488999290277e-05, + "loss": 0.11042921543121338, + "step": 28230 + }, + { + "epoch": 4.008516678495387, + "grad_norm": 4.9173688888549805, + "learning_rate": 9.599347054648688e-05, + "loss": 0.09752132892608642, + "step": 28240 + }, + { + "epoch": 4.009936124911285, + "grad_norm": 7.992283821105957, + "learning_rate": 9.599205110007098e-05, + "loss": 0.08114546537399292, + "step": 28250 + }, + { + "epoch": 4.011355571327182, + "grad_norm": 10.832806587219238, + "learning_rate": 9.599063165365509e-05, + "loss": 0.0714793860912323, + "step": 28260 + }, + { + "epoch": 4.01277501774308, + "grad_norm": 8.594995498657227, + "learning_rate": 9.598921220723917e-05, + "loss": 0.1365175724029541, + "step": 28270 + }, + { + "epoch": 4.014194464158978, + "grad_norm": 3.8112146854400635, + "learning_rate": 9.598779276082328e-05, + "loss": 0.09683020114898681, + "step": 28280 + }, + { + "epoch": 4.015613910574876, + "grad_norm": 3.5960965156555176, + "learning_rate": 9.598637331440738e-05, + "loss": 0.07476208806037903, + "step": 28290 + }, + { + "epoch": 4.0170333569907735, + "grad_norm": 7.426855564117432, + "learning_rate": 9.598495386799149e-05, + "loss": 0.04781417846679688, + "step": 28300 + }, + { + "epoch": 4.018452803406672, + "grad_norm": 3.124751091003418, + "learning_rate": 9.598353442157559e-05, + 
"loss": 0.020880359411239623, + "step": 28310 + }, + { + "epoch": 4.019872249822569, + "grad_norm": 0.4350399672985077, + "learning_rate": 9.598211497515969e-05, + "loss": 0.031051820516586302, + "step": 28320 + }, + { + "epoch": 4.021291696238467, + "grad_norm": 4.507806301116943, + "learning_rate": 9.59806955287438e-05, + "loss": 0.07177828550338745, + "step": 28330 + }, + { + "epoch": 4.022711142654365, + "grad_norm": 3.546592950820923, + "learning_rate": 9.59792760823279e-05, + "loss": 0.07378742098808289, + "step": 28340 + }, + { + "epoch": 4.024130589070262, + "grad_norm": 0.4405544102191925, + "learning_rate": 9.5977856635912e-05, + "loss": 0.056821930408477786, + "step": 28350 + }, + { + "epoch": 4.025550035486161, + "grad_norm": 5.474564075469971, + "learning_rate": 9.59764371894961e-05, + "loss": 0.07283146381378174, + "step": 28360 + }, + { + "epoch": 4.026969481902058, + "grad_norm": 0.8094128966331482, + "learning_rate": 9.59750177430802e-05, + "loss": 0.05205709934234619, + "step": 28370 + }, + { + "epoch": 4.028388928317956, + "grad_norm": 1.3830199241638184, + "learning_rate": 9.59735982966643e-05, + "loss": 0.035252746939659116, + "step": 28380 + }, + { + "epoch": 4.029808374733854, + "grad_norm": 0.34760722517967224, + "learning_rate": 9.597217885024841e-05, + "loss": 0.06989213228225707, + "step": 28390 + }, + { + "epoch": 4.031227821149751, + "grad_norm": 2.2948784828186035, + "learning_rate": 9.59707594038325e-05, + "loss": 0.09394473433494568, + "step": 28400 + }, + { + "epoch": 4.0326472675656495, + "grad_norm": 0.513525664806366, + "learning_rate": 9.596933995741662e-05, + "loss": 0.07552390694618225, + "step": 28410 + }, + { + "epoch": 4.034066713981547, + "grad_norm": 10.444016456604004, + "learning_rate": 9.596792051100071e-05, + "loss": 0.09061986804008484, + "step": 28420 + }, + { + "epoch": 4.035486160397445, + "grad_norm": 1.8548258543014526, + "learning_rate": 9.596650106458481e-05, + "loss": 0.05329904556274414, + "step": 28430 + }, 
+ { + "epoch": 4.036905606813343, + "grad_norm": 6.657663345336914, + "learning_rate": 9.596508161816892e-05, + "loss": 0.09772533774375916, + "step": 28440 + }, + { + "epoch": 4.038325053229241, + "grad_norm": 0.976019561290741, + "learning_rate": 9.596366217175302e-05, + "loss": 0.06526297330856323, + "step": 28450 + }, + { + "epoch": 4.039744499645138, + "grad_norm": 0.7490736246109009, + "learning_rate": 9.596224272533713e-05, + "loss": 0.057075291872024536, + "step": 28460 + }, + { + "epoch": 4.041163946061036, + "grad_norm": 0.49151191115379333, + "learning_rate": 9.596082327892123e-05, + "loss": 0.03589344024658203, + "step": 28470 + }, + { + "epoch": 4.042583392476934, + "grad_norm": 1.0588443279266357, + "learning_rate": 9.595940383250533e-05, + "loss": 0.06447117924690246, + "step": 28480 + }, + { + "epoch": 4.0440028388928315, + "grad_norm": 0.5945330262184143, + "learning_rate": 9.595798438608942e-05, + "loss": 0.031689152121543884, + "step": 28490 + }, + { + "epoch": 4.04542228530873, + "grad_norm": 2.2356131076812744, + "learning_rate": 9.595656493967353e-05, + "loss": 0.042199242115020755, + "step": 28500 + }, + { + "epoch": 4.04542228530873, + "eval_accuracy": 0.9701150887009601, + "eval_loss": 0.09118034690618515, + "eval_runtime": 34.2314, + "eval_samples_per_second": 459.432, + "eval_steps_per_second": 14.373, + "step": 28500 + }, + { + "epoch": 4.046841731724627, + "grad_norm": 7.246707439422607, + "learning_rate": 9.595514549325763e-05, + "loss": 0.07293486595153809, + "step": 28510 + }, + { + "epoch": 4.0482611781405256, + "grad_norm": 7.857562065124512, + "learning_rate": 9.595372604684174e-05, + "loss": 0.07997156977653504, + "step": 28520 + }, + { + "epoch": 4.049680624556423, + "grad_norm": 6.5758795738220215, + "learning_rate": 9.595230660042584e-05, + "loss": 0.03434431552886963, + "step": 28530 + }, + { + "epoch": 4.05110007097232, + "grad_norm": 4.938292980194092, + "learning_rate": 9.595088715400994e-05, + "loss": 0.03129143714904785, 
+ "step": 28540 + }, + { + "epoch": 4.052519517388219, + "grad_norm": 4.353482246398926, + "learning_rate": 9.594946770759405e-05, + "loss": 0.05996691584587097, + "step": 28550 + }, + { + "epoch": 4.053938963804116, + "grad_norm": 2.334413528442383, + "learning_rate": 9.594804826117815e-05, + "loss": 0.08024400472640991, + "step": 28560 + }, + { + "epoch": 4.055358410220014, + "grad_norm": 8.563901901245117, + "learning_rate": 9.594662881476226e-05, + "loss": 0.10906833410263062, + "step": 28570 + }, + { + "epoch": 4.056777856635912, + "grad_norm": 6.423247814178467, + "learning_rate": 9.594520936834634e-05, + "loss": 0.058406251668930056, + "step": 28580 + }, + { + "epoch": 4.05819730305181, + "grad_norm": 7.802096843719482, + "learning_rate": 9.594378992193045e-05, + "loss": 0.07235055565834045, + "step": 28590 + }, + { + "epoch": 4.059616749467708, + "grad_norm": 2.2815730571746826, + "learning_rate": 9.594237047551455e-05, + "loss": 0.09480493068695069, + "step": 28600 + }, + { + "epoch": 4.061036195883605, + "grad_norm": 0.11063522100448608, + "learning_rate": 9.594095102909866e-05, + "loss": 0.028444766998291016, + "step": 28610 + }, + { + "epoch": 4.062455642299503, + "grad_norm": 4.243860244750977, + "learning_rate": 9.593953158268276e-05, + "loss": 0.07847819924354553, + "step": 28620 + }, + { + "epoch": 4.063875088715401, + "grad_norm": 0.3890465795993805, + "learning_rate": 9.593811213626685e-05, + "loss": 0.09325323104858399, + "step": 28630 + }, + { + "epoch": 4.065294535131299, + "grad_norm": 6.783263206481934, + "learning_rate": 9.593669268985096e-05, + "loss": 0.052077090740203856, + "step": 28640 + }, + { + "epoch": 4.066713981547196, + "grad_norm": 0.21521657705307007, + "learning_rate": 9.593527324343506e-05, + "loss": 0.03206589519977569, + "step": 28650 + }, + { + "epoch": 4.068133427963095, + "grad_norm": 2.6312716007232666, + "learning_rate": 9.593385379701917e-05, + "loss": 0.049369195103645326, + "step": 28660 + }, + { + "epoch": 
4.069552874378992, + "grad_norm": 3.080799102783203, + "learning_rate": 9.593243435060327e-05, + "loss": 0.0701833188533783, + "step": 28670 + }, + { + "epoch": 4.07097232079489, + "grad_norm": 3.443047285079956, + "learning_rate": 9.593101490418737e-05, + "loss": 0.06615790724754333, + "step": 28680 + }, + { + "epoch": 4.072391767210788, + "grad_norm": 0.24704548716545105, + "learning_rate": 9.592959545777147e-05, + "loss": 0.08545000553131103, + "step": 28690 + }, + { + "epoch": 4.073811213626685, + "grad_norm": 6.151279449462891, + "learning_rate": 9.592817601135558e-05, + "loss": 0.034337058663368225, + "step": 28700 + }, + { + "epoch": 4.075230660042584, + "grad_norm": 3.560441017150879, + "learning_rate": 9.592675656493967e-05, + "loss": 0.051172637939453126, + "step": 28710 + }, + { + "epoch": 4.076650106458481, + "grad_norm": 2.4762368202209473, + "learning_rate": 9.592533711852378e-05, + "loss": 0.07133134603500366, + "step": 28720 + }, + { + "epoch": 4.078069552874379, + "grad_norm": 6.441526889801025, + "learning_rate": 9.592391767210788e-05, + "loss": 0.07345221638679504, + "step": 28730 + }, + { + "epoch": 4.079488999290277, + "grad_norm": 2.405494213104248, + "learning_rate": 9.592249822569198e-05, + "loss": 0.048948812484741214, + "step": 28740 + }, + { + "epoch": 4.080908445706174, + "grad_norm": 0.5363373160362244, + "learning_rate": 9.592107877927609e-05, + "loss": 0.07454119324684143, + "step": 28750 + }, + { + "epoch": 4.0823278921220725, + "grad_norm": 0.1072138175368309, + "learning_rate": 9.591965933286019e-05, + "loss": 0.05140770673751831, + "step": 28760 + }, + { + "epoch": 4.08374733853797, + "grad_norm": 0.17401359975337982, + "learning_rate": 9.59182398864443e-05, + "loss": 0.08340293169021606, + "step": 28770 + }, + { + "epoch": 4.085166784953868, + "grad_norm": 3.663013458251953, + "learning_rate": 9.59168204400284e-05, + "loss": 0.06304811835289001, + "step": 28780 + }, + { + "epoch": 4.086586231369766, + "grad_norm": 
6.273396968841553, + "learning_rate": 9.59154009936125e-05, + "loss": 0.12202763557434082, + "step": 28790 + }, + { + "epoch": 4.088005677785664, + "grad_norm": 9.233271598815918, + "learning_rate": 9.591398154719659e-05, + "loss": 0.13653677701950073, + "step": 28800 + }, + { + "epoch": 4.089425124201561, + "grad_norm": 1.3640882968902588, + "learning_rate": 9.59125621007807e-05, + "loss": 0.06013572812080383, + "step": 28810 + }, + { + "epoch": 4.090844570617459, + "grad_norm": 10.00602912902832, + "learning_rate": 9.59111426543648e-05, + "loss": 0.10673294067382813, + "step": 28820 + }, + { + "epoch": 4.092264017033357, + "grad_norm": 4.630044460296631, + "learning_rate": 9.590972320794891e-05, + "loss": 0.13828219175338746, + "step": 28830 + }, + { + "epoch": 4.0936834634492545, + "grad_norm": 5.386570930480957, + "learning_rate": 9.590830376153301e-05, + "loss": 0.05448143482208252, + "step": 28840 + }, + { + "epoch": 4.095102909865153, + "grad_norm": 1.139168620109558, + "learning_rate": 9.59068843151171e-05, + "loss": 0.04355934262275696, + "step": 28850 + }, + { + "epoch": 4.09652235628105, + "grad_norm": 3.865908145904541, + "learning_rate": 9.590546486870122e-05, + "loss": 0.025386843085289, + "step": 28860 + }, + { + "epoch": 4.0979418026969485, + "grad_norm": 0.1300564855337143, + "learning_rate": 9.590404542228531e-05, + "loss": 0.08789493441581726, + "step": 28870 + }, + { + "epoch": 4.099361249112846, + "grad_norm": 9.505627632141113, + "learning_rate": 9.590262597586942e-05, + "loss": 0.06938197016716004, + "step": 28880 + }, + { + "epoch": 4.100780695528743, + "grad_norm": 5.77639102935791, + "learning_rate": 9.590120652945351e-05, + "loss": 0.08774335384368896, + "step": 28890 + }, + { + "epoch": 4.102200141944642, + "grad_norm": 7.798095703125, + "learning_rate": 9.589978708303762e-05, + "loss": 0.049100473523139954, + "step": 28900 + }, + { + "epoch": 4.103619588360539, + "grad_norm": 4.503985404968262, + "learning_rate": 9.589836763662172e-05, 
+ "loss": 0.12068946361541748, + "step": 28910 + }, + { + "epoch": 4.105039034776437, + "grad_norm": 0.352927565574646, + "learning_rate": 9.589694819020583e-05, + "loss": 0.09867768883705139, + "step": 28920 + }, + { + "epoch": 4.106458481192335, + "grad_norm": 8.994272232055664, + "learning_rate": 9.589552874378992e-05, + "loss": 0.0767949104309082, + "step": 28930 + }, + { + "epoch": 4.107877927608233, + "grad_norm": 10.914958953857422, + "learning_rate": 9.589410929737402e-05, + "loss": 0.11344790458679199, + "step": 28940 + }, + { + "epoch": 4.1092973740241305, + "grad_norm": 4.673663139343262, + "learning_rate": 9.589268985095813e-05, + "loss": 0.09324707984924316, + "step": 28950 + }, + { + "epoch": 4.110716820440028, + "grad_norm": 0.63322913646698, + "learning_rate": 9.589127040454223e-05, + "loss": 0.09688594937324524, + "step": 28960 + }, + { + "epoch": 4.112136266855926, + "grad_norm": 1.9962878227233887, + "learning_rate": 9.588985095812634e-05, + "loss": 0.10838373899459838, + "step": 28970 + }, + { + "epoch": 4.113555713271824, + "grad_norm": 0.5679922699928284, + "learning_rate": 9.588843151171044e-05, + "loss": 0.04196344316005707, + "step": 28980 + }, + { + "epoch": 4.114975159687722, + "grad_norm": 1.6682865619659424, + "learning_rate": 9.588701206529454e-05, + "loss": 0.0966885507106781, + "step": 28990 + }, + { + "epoch": 4.116394606103619, + "grad_norm": 1.2570011615753174, + "learning_rate": 9.588559261887863e-05, + "loss": 0.0969314455986023, + "step": 29000 + }, + { + "epoch": 4.116394606103619, + "eval_accuracy": 0.9755198067018503, + "eval_loss": 0.06681427359580994, + "eval_runtime": 34.1546, + "eval_samples_per_second": 460.466, + "eval_steps_per_second": 14.405, + "step": 29000 + }, + { + "epoch": 4.117814052519518, + "grad_norm": 1.8310188055038452, + "learning_rate": 9.588417317246274e-05, + "loss": 0.03523149788379669, + "step": 29010 + }, + { + "epoch": 4.119233498935415, + "grad_norm": 2.215496778488159, + "learning_rate": 
9.588275372604684e-05, + "loss": 0.06970539093017578, + "step": 29020 + }, + { + "epoch": 4.120652945351313, + "grad_norm": 1.687964677810669, + "learning_rate": 9.588133427963095e-05, + "loss": 0.07161665558815003, + "step": 29030 + }, + { + "epoch": 4.122072391767211, + "grad_norm": 4.007593154907227, + "learning_rate": 9.587991483321505e-05, + "loss": 0.02978900372982025, + "step": 29040 + }, + { + "epoch": 4.123491838183108, + "grad_norm": 0.38011181354522705, + "learning_rate": 9.587849538679915e-05, + "loss": 0.06865530610084533, + "step": 29050 + }, + { + "epoch": 4.124911284599007, + "grad_norm": 3.137512683868408, + "learning_rate": 9.587707594038326e-05, + "loss": 0.03670695722103119, + "step": 29060 + }, + { + "epoch": 4.126330731014904, + "grad_norm": 7.8316874504089355, + "learning_rate": 9.587565649396736e-05, + "loss": 0.08162494897842407, + "step": 29070 + }, + { + "epoch": 4.127750177430802, + "grad_norm": 0.1684914380311966, + "learning_rate": 9.587423704755147e-05, + "loss": 0.07897425889968872, + "step": 29080 + }, + { + "epoch": 4.1291696238467, + "grad_norm": 2.3186583518981934, + "learning_rate": 9.587281760113556e-05, + "loss": 0.07271428108215332, + "step": 29090 + }, + { + "epoch": 4.130589070262598, + "grad_norm": 0.6711585521697998, + "learning_rate": 9.587139815471966e-05, + "loss": 0.12461971044540406, + "step": 29100 + }, + { + "epoch": 4.1320085166784954, + "grad_norm": 2.4304239749908447, + "learning_rate": 9.586997870830376e-05, + "loss": 0.0756750762462616, + "step": 29110 + }, + { + "epoch": 4.133427963094393, + "grad_norm": 3.95882248878479, + "learning_rate": 9.586855926188787e-05, + "loss": 0.1011937141418457, + "step": 29120 + }, + { + "epoch": 4.134847409510291, + "grad_norm": 1.7454277276992798, + "learning_rate": 9.586713981547198e-05, + "loss": 0.06007967591285705, + "step": 29130 + }, + { + "epoch": 4.136266855926189, + "grad_norm": 2.151155710220337, + "learning_rate": 9.586572036905608e-05, + "loss": 
0.07004793882369995, + "step": 29140 + }, + { + "epoch": 4.137686302342087, + "grad_norm": 3.816464900970459, + "learning_rate": 9.586430092264018e-05, + "loss": 0.11677614450454712, + "step": 29150 + }, + { + "epoch": 4.139105748757984, + "grad_norm": 1.8618497848510742, + "learning_rate": 9.586288147622427e-05, + "loss": 0.03786920607089996, + "step": 29160 + }, + { + "epoch": 4.140525195173883, + "grad_norm": 5.731049060821533, + "learning_rate": 9.586146202980838e-05, + "loss": 0.07805479168891907, + "step": 29170 + }, + { + "epoch": 4.14194464158978, + "grad_norm": 5.20023775100708, + "learning_rate": 9.586004258339248e-05, + "loss": 0.047267353534698485, + "step": 29180 + }, + { + "epoch": 4.1433640880056775, + "grad_norm": 2.555107355117798, + "learning_rate": 9.585862313697659e-05, + "loss": 0.052921724319458005, + "step": 29190 + }, + { + "epoch": 4.144783534421576, + "grad_norm": 6.4647650718688965, + "learning_rate": 9.585720369056068e-05, + "loss": 0.05531458258628845, + "step": 29200 + }, + { + "epoch": 4.146202980837473, + "grad_norm": 2.4424901008605957, + "learning_rate": 9.585578424414479e-05, + "loss": 0.06641347408294677, + "step": 29210 + }, + { + "epoch": 4.1476224272533715, + "grad_norm": 3.6472041606903076, + "learning_rate": 9.58543647977289e-05, + "loss": 0.0659880816936493, + "step": 29220 + }, + { + "epoch": 4.149041873669269, + "grad_norm": 7.028792381286621, + "learning_rate": 9.5852945351313e-05, + "loss": 0.07326681613922119, + "step": 29230 + }, + { + "epoch": 4.150461320085167, + "grad_norm": 5.249692916870117, + "learning_rate": 9.58515259048971e-05, + "loss": 0.0626168668270111, + "step": 29240 + }, + { + "epoch": 4.151880766501065, + "grad_norm": 1.918457269668579, + "learning_rate": 9.585010645848119e-05, + "loss": 0.0697676658630371, + "step": 29250 + }, + { + "epoch": 4.153300212916962, + "grad_norm": 3.7116684913635254, + "learning_rate": 9.58486870120653e-05, + "loss": 0.05905971527099609, + "step": 29260 + }, + { + "epoch": 
4.15471965933286, + "grad_norm": 2.3320891857147217, + "learning_rate": 9.58472675656494e-05, + "loss": 0.04564954340457916, + "step": 29270 + }, + { + "epoch": 4.156139105748758, + "grad_norm": 0.35267460346221924, + "learning_rate": 9.584584811923351e-05, + "loss": 0.07400223016738891, + "step": 29280 + }, + { + "epoch": 4.157558552164656, + "grad_norm": 0.4503205120563507, + "learning_rate": 9.58444286728176e-05, + "loss": 0.04973529279232025, + "step": 29290 + }, + { + "epoch": 4.1589779985805535, + "grad_norm": 1.6949446201324463, + "learning_rate": 9.58430092264017e-05, + "loss": 0.0649847149848938, + "step": 29300 + }, + { + "epoch": 4.160397444996452, + "grad_norm": 3.4858663082122803, + "learning_rate": 9.58415897799858e-05, + "loss": 0.045249027013778684, + "step": 29310 + }, + { + "epoch": 4.161816891412349, + "grad_norm": 1.861283302307129, + "learning_rate": 9.584017033356991e-05, + "loss": 0.04219701588153839, + "step": 29320 + }, + { + "epoch": 4.163236337828247, + "grad_norm": 1.1282949447631836, + "learning_rate": 9.583875088715402e-05, + "loss": 0.06556923389434814, + "step": 29330 + }, + { + "epoch": 4.164655784244145, + "grad_norm": 3.5685386657714844, + "learning_rate": 9.583733144073812e-05, + "loss": 0.07912471294403076, + "step": 29340 + }, + { + "epoch": 4.166075230660042, + "grad_norm": 0.43480032682418823, + "learning_rate": 9.583591199432222e-05, + "loss": 0.06162703037261963, + "step": 29350 + }, + { + "epoch": 4.167494677075941, + "grad_norm": 14.023776054382324, + "learning_rate": 9.583449254790631e-05, + "loss": 0.06618826985359191, + "step": 29360 + }, + { + "epoch": 4.168914123491838, + "grad_norm": 5.220623016357422, + "learning_rate": 9.583307310149043e-05, + "loss": 0.03890301287174225, + "step": 29370 + }, + { + "epoch": 4.170333569907736, + "grad_norm": 2.791459560394287, + "learning_rate": 9.583165365507452e-05, + "loss": 0.0788652777671814, + "step": 29380 + }, + { + "epoch": 4.171753016323634, + "grad_norm": 
6.876852035522461, + "learning_rate": 9.583023420865863e-05, + "loss": 0.05101839303970337, + "step": 29390 + }, + { + "epoch": 4.173172462739531, + "grad_norm": 3.1492698192596436, + "learning_rate": 9.582881476224272e-05, + "loss": 0.02205464094877243, + "step": 29400 + }, + { + "epoch": 4.1745919091554295, + "grad_norm": 1.2465417385101318, + "learning_rate": 9.582739531582683e-05, + "loss": 0.07565943002700806, + "step": 29410 + }, + { + "epoch": 4.176011355571327, + "grad_norm": 3.89546275138855, + "learning_rate": 9.582597586941094e-05, + "loss": 0.048539546132087705, + "step": 29420 + }, + { + "epoch": 4.177430801987225, + "grad_norm": 6.265598773956299, + "learning_rate": 9.582455642299504e-05, + "loss": 0.03970089554786682, + "step": 29430 + }, + { + "epoch": 4.178850248403123, + "grad_norm": 3.304530143737793, + "learning_rate": 9.582313697657915e-05, + "loss": 0.0281475305557251, + "step": 29440 + }, + { + "epoch": 4.180269694819021, + "grad_norm": 3.612251043319702, + "learning_rate": 9.582171753016325e-05, + "loss": 0.040489718317985535, + "step": 29450 + }, + { + "epoch": 4.181689141234918, + "grad_norm": 5.561727523803711, + "learning_rate": 9.582029808374734e-05, + "loss": 0.028734493255615234, + "step": 29460 + }, + { + "epoch": 4.183108587650816, + "grad_norm": 8.159090042114258, + "learning_rate": 9.581887863733144e-05, + "loss": 0.06577027440071107, + "step": 29470 + }, + { + "epoch": 4.184528034066714, + "grad_norm": 9.581611633300781, + "learning_rate": 9.581745919091555e-05, + "loss": 0.10467482805252075, + "step": 29480 + }, + { + "epoch": 4.185947480482612, + "grad_norm": 0.0830698311328888, + "learning_rate": 9.581603974449965e-05, + "loss": 0.06047796607017517, + "step": 29490 + }, + { + "epoch": 4.18736692689851, + "grad_norm": 10.371894836425781, + "learning_rate": 9.581462029808376e-05, + "loss": 0.08939828872680664, + "step": 29500 + }, + { + "epoch": 4.18736692689851, + "eval_accuracy": 0.9636294270998919, + "eval_loss": 
0.1139441579580307, + "eval_runtime": 32.7881, + "eval_samples_per_second": 479.656, + "eval_steps_per_second": 15.005, + "step": 29500 + }, + { + "epoch": 4.188786373314407, + "grad_norm": 3.3707029819488525, + "learning_rate": 9.581320085166786e-05, + "loss": 0.13373112678527832, + "step": 29510 + }, + { + "epoch": 4.190205819730306, + "grad_norm": 8.710700988769531, + "learning_rate": 9.581178140525195e-05, + "loss": 0.08344519138336182, + "step": 29520 + }, + { + "epoch": 4.191625266146203, + "grad_norm": 4.869105339050293, + "learning_rate": 9.581036195883607e-05, + "loss": 0.027658921480178834, + "step": 29530 + }, + { + "epoch": 4.1930447125621, + "grad_norm": 5.363608360290527, + "learning_rate": 9.580894251242016e-05, + "loss": 0.10114845037460327, + "step": 29540 + }, + { + "epoch": 4.194464158977999, + "grad_norm": 5.194397926330566, + "learning_rate": 9.580752306600427e-05, + "loss": 0.06530258655548096, + "step": 29550 + }, + { + "epoch": 4.195883605393896, + "grad_norm": 7.159872531890869, + "learning_rate": 9.580610361958836e-05, + "loss": 0.049175983667373656, + "step": 29560 + }, + { + "epoch": 4.1973030518097945, + "grad_norm": 1.903594970703125, + "learning_rate": 9.580468417317247e-05, + "loss": 0.04481973648071289, + "step": 29570 + }, + { + "epoch": 4.198722498225692, + "grad_norm": 0.6888973116874695, + "learning_rate": 9.580326472675657e-05, + "loss": 0.05650795102119446, + "step": 29580 + }, + { + "epoch": 4.20014194464159, + "grad_norm": 1.0285824537277222, + "learning_rate": 9.580184528034068e-05, + "loss": 0.05506778359413147, + "step": 29590 + }, + { + "epoch": 4.201561391057488, + "grad_norm": 0.3579859733581543, + "learning_rate": 9.580042583392477e-05, + "loss": 0.08136498928070068, + "step": 29600 + }, + { + "epoch": 4.202980837473385, + "grad_norm": 7.7916483879089355, + "learning_rate": 9.579900638750887e-05, + "loss": 0.11614023447036743, + "step": 29610 + }, + { + "epoch": 4.204400283889283, + "grad_norm": 2.5529000759124756, + 
"learning_rate": 9.579758694109298e-05, + "loss": 0.06429266929626465, + "step": 29620 + }, + { + "epoch": 4.205819730305181, + "grad_norm": 2.441297769546509, + "learning_rate": 9.579616749467708e-05, + "loss": 0.06408776044845581, + "step": 29630 + }, + { + "epoch": 4.207239176721079, + "grad_norm": 0.708437979221344, + "learning_rate": 9.579474804826119e-05, + "loss": 0.042554271221160886, + "step": 29640 + }, + { + "epoch": 4.2086586231369765, + "grad_norm": 2.543992757797241, + "learning_rate": 9.579332860184529e-05, + "loss": 0.03778640329837799, + "step": 29650 + }, + { + "epoch": 4.210078069552875, + "grad_norm": 2.1270813941955566, + "learning_rate": 9.579190915542939e-05, + "loss": 0.059389245510101316, + "step": 29660 + }, + { + "epoch": 4.211497515968772, + "grad_norm": 5.076752662658691, + "learning_rate": 9.579048970901348e-05, + "loss": 0.097792249917984, + "step": 29670 + }, + { + "epoch": 4.21291696238467, + "grad_norm": 0.8046549558639526, + "learning_rate": 9.57890702625976e-05, + "loss": 0.08327121138572693, + "step": 29680 + }, + { + "epoch": 4.214336408800568, + "grad_norm": 5.163249969482422, + "learning_rate": 9.578765081618169e-05, + "loss": 0.0650902271270752, + "step": 29690 + }, + { + "epoch": 4.215755855216465, + "grad_norm": 6.078125, + "learning_rate": 9.57862313697658e-05, + "loss": 0.02894149422645569, + "step": 29700 + }, + { + "epoch": 4.217175301632364, + "grad_norm": 2.5657360553741455, + "learning_rate": 9.57848119233499e-05, + "loss": 0.0827182650566101, + "step": 29710 + }, + { + "epoch": 4.218594748048261, + "grad_norm": 11.177501678466797, + "learning_rate": 9.5783392476934e-05, + "loss": 0.10848512649536132, + "step": 29720 + }, + { + "epoch": 4.220014194464159, + "grad_norm": 0.30279305577278137, + "learning_rate": 9.578197303051811e-05, + "loss": 0.04139010012149811, + "step": 29730 + }, + { + "epoch": 4.221433640880057, + "grad_norm": 5.513749599456787, + "learning_rate": 9.57805535841022e-05, + "loss": 
0.07512015104293823, + "step": 29740 + }, + { + "epoch": 4.222853087295954, + "grad_norm": 0.548879086971283, + "learning_rate": 9.577913413768632e-05, + "loss": 0.0479542076587677, + "step": 29750 + }, + { + "epoch": 4.2242725337118525, + "grad_norm": 4.692379951477051, + "learning_rate": 9.57777146912704e-05, + "loss": 0.050175410509109494, + "step": 29760 + }, + { + "epoch": 4.22569198012775, + "grad_norm": 5.73818302154541, + "learning_rate": 9.577629524485451e-05, + "loss": 0.04169095158576965, + "step": 29770 + }, + { + "epoch": 4.227111426543648, + "grad_norm": 2.4854674339294434, + "learning_rate": 9.577487579843861e-05, + "loss": 0.11341255903244019, + "step": 29780 + }, + { + "epoch": 4.228530872959546, + "grad_norm": 5.217370510101318, + "learning_rate": 9.577345635202272e-05, + "loss": 0.08126076459884643, + "step": 29790 + }, + { + "epoch": 4.229950319375444, + "grad_norm": 4.68181037902832, + "learning_rate": 9.577203690560682e-05, + "loss": 0.07262782454490661, + "step": 29800 + }, + { + "epoch": 4.231369765791341, + "grad_norm": 5.919559955596924, + "learning_rate": 9.577061745919093e-05, + "loss": 0.0595442533493042, + "step": 29810 + }, + { + "epoch": 4.232789212207239, + "grad_norm": 2.606187582015991, + "learning_rate": 9.576919801277502e-05, + "loss": 0.07997668385505677, + "step": 29820 + }, + { + "epoch": 4.234208658623137, + "grad_norm": 8.676187515258789, + "learning_rate": 9.576777856635912e-05, + "loss": 0.09488874673843384, + "step": 29830 + }, + { + "epoch": 4.2356281050390345, + "grad_norm": 0.4818950295448303, + "learning_rate": 9.576635911994323e-05, + "loss": 0.04851639866828918, + "step": 29840 + }, + { + "epoch": 4.237047551454933, + "grad_norm": 2.5943143367767334, + "learning_rate": 9.576493967352733e-05, + "loss": 0.07941715121269226, + "step": 29850 + }, + { + "epoch": 4.23846699787083, + "grad_norm": 2.3599934577941895, + "learning_rate": 9.576352022711144e-05, + "loss": 0.048725086450576785, + "step": 29860 + }, + { + 
"epoch": 4.239886444286729, + "grad_norm": 3.0058441162109375, + "learning_rate": 9.576210078069552e-05, + "loss": 0.057837444543838504, + "step": 29870 + }, + { + "epoch": 4.241305890702626, + "grad_norm": 6.7187700271606445, + "learning_rate": 9.576068133427964e-05, + "loss": 0.0750263512134552, + "step": 29880 + }, + { + "epoch": 4.242725337118523, + "grad_norm": 8.70337200164795, + "learning_rate": 9.575926188786373e-05, + "loss": 0.10019056797027588, + "step": 29890 + }, + { + "epoch": 4.244144783534422, + "grad_norm": 6.477367401123047, + "learning_rate": 9.575784244144784e-05, + "loss": 0.0946097731590271, + "step": 29900 + }, + { + "epoch": 4.245564229950319, + "grad_norm": 2.6386449337005615, + "learning_rate": 9.575642299503194e-05, + "loss": 0.04361373782157898, + "step": 29910 + }, + { + "epoch": 4.246983676366217, + "grad_norm": 1.9583840370178223, + "learning_rate": 9.575500354861604e-05, + "loss": 0.08449437022209168, + "step": 29920 + }, + { + "epoch": 4.248403122782115, + "grad_norm": 3.64117693901062, + "learning_rate": 9.575358410220015e-05, + "loss": 0.05209539532661438, + "step": 29930 + }, + { + "epoch": 4.249822569198013, + "grad_norm": 0.24194148182868958, + "learning_rate": 9.575216465578425e-05, + "loss": 0.06707227230072021, + "step": 29940 + }, + { + "epoch": 4.251242015613911, + "grad_norm": 0.11548645794391632, + "learning_rate": 9.575074520936836e-05, + "loss": 0.04010304808616638, + "step": 29950 + }, + { + "epoch": 4.252661462029808, + "grad_norm": 5.685769557952881, + "learning_rate": 9.574932576295246e-05, + "loss": 0.049138793349266054, + "step": 29960 + }, + { + "epoch": 4.254080908445706, + "grad_norm": 4.332538604736328, + "learning_rate": 9.574790631653655e-05, + "loss": 0.05995388031005859, + "step": 29970 + }, + { + "epoch": 4.255500354861604, + "grad_norm": 7.861778259277344, + "learning_rate": 9.574648687012065e-05, + "loss": 0.10826575756072998, + "step": 29980 + }, + { + "epoch": 4.256919801277502, + "grad_norm": 
3.978131055831909, + "learning_rate": 9.574506742370476e-05, + "loss": 0.03147943317890167, + "step": 29990 + }, + { + "epoch": 4.258339247693399, + "grad_norm": 2.1925299167633057, + "learning_rate": 9.574364797728886e-05, + "loss": 0.025776213407516478, + "step": 30000 + }, + { + "epoch": 4.258339247693399, + "eval_accuracy": 0.9660456539708782, + "eval_loss": 0.09928029775619507, + "eval_runtime": 35.5615, + "eval_samples_per_second": 442.248, + "eval_steps_per_second": 13.835, + "step": 30000 + }, + { + "epoch": 4.259758694109298, + "grad_norm": 0.1545456051826477, + "learning_rate": 9.574222853087297e-05, + "loss": 0.04762240052223206, + "step": 30010 + }, + { + "epoch": 4.261178140525195, + "grad_norm": 1.7185860872268677, + "learning_rate": 9.574080908445707e-05, + "loss": 0.05865336060523987, + "step": 30020 + }, + { + "epoch": 4.262597586941093, + "grad_norm": 8.793628692626953, + "learning_rate": 9.573938963804116e-05, + "loss": 0.0683655321598053, + "step": 30030 + }, + { + "epoch": 4.264017033356991, + "grad_norm": 0.2601637840270996, + "learning_rate": 9.573797019162528e-05, + "loss": 0.06883899569511413, + "step": 30040 + }, + { + "epoch": 4.265436479772888, + "grad_norm": 3.019463062286377, + "learning_rate": 9.573655074520937e-05, + "loss": 0.061128252744674684, + "step": 30050 + }, + { + "epoch": 4.266855926188787, + "grad_norm": 5.62221622467041, + "learning_rate": 9.573513129879348e-05, + "loss": 0.04778895676136017, + "step": 30060 + }, + { + "epoch": 4.268275372604684, + "grad_norm": 3.3797640800476074, + "learning_rate": 9.573371185237757e-05, + "loss": 0.03307014107704163, + "step": 30070 + }, + { + "epoch": 4.269694819020582, + "grad_norm": 1.279465675354004, + "learning_rate": 9.573229240596168e-05, + "loss": 0.06789458394050599, + "step": 30080 + }, + { + "epoch": 4.27111426543648, + "grad_norm": 7.307963848114014, + "learning_rate": 9.573087295954578e-05, + "loss": 0.060098963975906375, + "step": 30090 + }, + { + "epoch": 
4.272533711852377, + "grad_norm": 1.0080957412719727, + "learning_rate": 9.572945351312989e-05, + "loss": 0.048102378845214844, + "step": 30100 + }, + { + "epoch": 4.2739531582682755, + "grad_norm": 3.0274055004119873, + "learning_rate": 9.572803406671398e-05, + "loss": 0.026803615689277648, + "step": 30110 + }, + { + "epoch": 4.275372604684173, + "grad_norm": 10.846437454223633, + "learning_rate": 9.572661462029808e-05, + "loss": 0.06357570886611938, + "step": 30120 + }, + { + "epoch": 4.276792051100071, + "grad_norm": 4.810959815979004, + "learning_rate": 9.572519517388219e-05, + "loss": 0.13068749904632568, + "step": 30130 + }, + { + "epoch": 4.278211497515969, + "grad_norm": 9.625565528869629, + "learning_rate": 9.572377572746629e-05, + "loss": 0.12544605731964112, + "step": 30140 + }, + { + "epoch": 4.279630943931867, + "grad_norm": 6.736867904663086, + "learning_rate": 9.57223562810504e-05, + "loss": 0.060517168045043944, + "step": 30150 + }, + { + "epoch": 4.281050390347764, + "grad_norm": 7.1672868728637695, + "learning_rate": 9.57209368346345e-05, + "loss": 0.10456200838088989, + "step": 30160 + }, + { + "epoch": 4.282469836763662, + "grad_norm": 2.916855812072754, + "learning_rate": 9.571951738821861e-05, + "loss": 0.09013462662696839, + "step": 30170 + }, + { + "epoch": 4.28388928317956, + "grad_norm": 0.9611710906028748, + "learning_rate": 9.571809794180269e-05, + "loss": 0.052769911289215085, + "step": 30180 + }, + { + "epoch": 4.2853087295954575, + "grad_norm": 8.947700500488281, + "learning_rate": 9.57166784953868e-05, + "loss": 0.07875499725341797, + "step": 30190 + }, + { + "epoch": 4.286728176011356, + "grad_norm": 11.802430152893066, + "learning_rate": 9.57152590489709e-05, + "loss": 0.08428643941879273, + "step": 30200 + }, + { + "epoch": 4.288147622427253, + "grad_norm": 8.532755851745605, + "learning_rate": 9.571383960255501e-05, + "loss": 0.061253076791763304, + "step": 30210 + }, + { + "epoch": 4.2895670688431515, + "grad_norm": 
7.606026649475098, + "learning_rate": 9.571242015613911e-05, + "loss": 0.04498938620090485, + "step": 30220 + }, + { + "epoch": 4.290986515259049, + "grad_norm": 5.194207191467285, + "learning_rate": 9.57110007097232e-05, + "loss": 0.053921067714691163, + "step": 30230 + }, + { + "epoch": 4.292405961674946, + "grad_norm": 2.436835527420044, + "learning_rate": 9.570958126330732e-05, + "loss": 0.0879688322544098, + "step": 30240 + }, + { + "epoch": 4.293825408090845, + "grad_norm": 5.799166202545166, + "learning_rate": 9.570816181689141e-05, + "loss": 0.027118155360221864, + "step": 30250 + }, + { + "epoch": 4.295244854506742, + "grad_norm": 7.482603073120117, + "learning_rate": 9.570674237047553e-05, + "loss": 0.07994829416275025, + "step": 30260 + }, + { + "epoch": 4.29666430092264, + "grad_norm": 0.9970318675041199, + "learning_rate": 9.570532292405962e-05, + "loss": 0.10803250074386597, + "step": 30270 + }, + { + "epoch": 4.298083747338538, + "grad_norm": 8.933618545532227, + "learning_rate": 9.570390347764372e-05, + "loss": 0.07788187265396118, + "step": 30280 + }, + { + "epoch": 4.299503193754436, + "grad_norm": 3.686373472213745, + "learning_rate": 9.570248403122782e-05, + "loss": 0.05900847911834717, + "step": 30290 + }, + { + "epoch": 4.3009226401703335, + "grad_norm": 5.501690864562988, + "learning_rate": 9.570106458481193e-05, + "loss": 0.09661505222320557, + "step": 30300 + }, + { + "epoch": 4.302342086586231, + "grad_norm": 6.418631553649902, + "learning_rate": 9.569964513839603e-05, + "loss": 0.07957556247711181, + "step": 30310 + }, + { + "epoch": 4.303761533002129, + "grad_norm": 6.4076032638549805, + "learning_rate": 9.569822569198014e-05, + "loss": 0.07458949685096741, + "step": 30320 + }, + { + "epoch": 4.305180979418027, + "grad_norm": 0.4291207790374756, + "learning_rate": 9.569680624556423e-05, + "loss": 0.0629243791103363, + "step": 30330 + }, + { + "epoch": 4.306600425833925, + "grad_norm": 1.803011178970337, + "learning_rate": 
9.569538679914833e-05, + "loss": 0.07003772854804993, + "step": 30340 + }, + { + "epoch": 4.308019872249822, + "grad_norm": 2.3013916015625, + "learning_rate": 9.569396735273244e-05, + "loss": 0.058683961629867554, + "step": 30350 + }, + { + "epoch": 4.309439318665721, + "grad_norm": 1.1245123147964478, + "learning_rate": 9.569254790631654e-05, + "loss": 0.06056886911392212, + "step": 30360 + }, + { + "epoch": 4.310858765081618, + "grad_norm": 0.9130068421363831, + "learning_rate": 9.569112845990065e-05, + "loss": 0.03365835845470429, + "step": 30370 + }, + { + "epoch": 4.312278211497516, + "grad_norm": 14.019575119018555, + "learning_rate": 9.568970901348473e-05, + "loss": 0.08868294954299927, + "step": 30380 + }, + { + "epoch": 4.313697657913414, + "grad_norm": 5.392879009246826, + "learning_rate": 9.568828956706885e-05, + "loss": 0.09387872219085694, + "step": 30390 + }, + { + "epoch": 4.315117104329311, + "grad_norm": 1.6492738723754883, + "learning_rate": 9.568687012065294e-05, + "loss": 0.041343361139297485, + "step": 30400 + }, + { + "epoch": 4.31653655074521, + "grad_norm": 12.153800010681152, + "learning_rate": 9.568545067423705e-05, + "loss": 0.12785712480545045, + "step": 30410 + }, + { + "epoch": 4.317955997161107, + "grad_norm": 0.7066358923912048, + "learning_rate": 9.568403122782115e-05, + "loss": 0.048977088928222653, + "step": 30420 + }, + { + "epoch": 4.319375443577005, + "grad_norm": 4.7776055335998535, + "learning_rate": 9.568261178140525e-05, + "loss": 0.13408685922622682, + "step": 30430 + }, + { + "epoch": 4.320794889992903, + "grad_norm": 0.4777391850948334, + "learning_rate": 9.568119233498936e-05, + "loss": 0.03871457576751709, + "step": 30440 + }, + { + "epoch": 4.3222143364088, + "grad_norm": 1.3897401094436646, + "learning_rate": 9.567977288857346e-05, + "loss": 0.08946848511695862, + "step": 30450 + }, + { + "epoch": 4.3236337828246985, + "grad_norm": 4.937811851501465, + "learning_rate": 9.567835344215757e-05, + "loss": 
0.06665077805519104, + "step": 30460 + }, + { + "epoch": 4.325053229240596, + "grad_norm": 6.3543195724487305, + "learning_rate": 9.567693399574167e-05, + "loss": 0.05886696577072144, + "step": 30470 + }, + { + "epoch": 4.326472675656494, + "grad_norm": 0.3217252194881439, + "learning_rate": 9.567551454932576e-05, + "loss": 0.06166156530380249, + "step": 30480 + }, + { + "epoch": 4.327892122072392, + "grad_norm": 7.3612494468688965, + "learning_rate": 9.567409510290986e-05, + "loss": 0.06469403505325318, + "step": 30490 + }, + { + "epoch": 4.32931156848829, + "grad_norm": 5.631328582763672, + "learning_rate": 9.567267565649397e-05, + "loss": 0.05832824110984802, + "step": 30500 + }, + { + "epoch": 4.32931156848829, + "eval_accuracy": 0.9713232021364532, + "eval_loss": 0.08604324609041214, + "eval_runtime": 34.5081, + "eval_samples_per_second": 455.749, + "eval_steps_per_second": 14.258, + "step": 30500 + }, + { + "epoch": 4.330731014904187, + "grad_norm": 2.4201743602752686, + "learning_rate": 9.567125621007807e-05, + "loss": 0.052916485071182254, + "step": 30510 + }, + { + "epoch": 4.332150461320085, + "grad_norm": 4.678896427154541, + "learning_rate": 9.566983676366218e-05, + "loss": 0.06698215603828431, + "step": 30520 + }, + { + "epoch": 4.333569907735983, + "grad_norm": 8.06701946258545, + "learning_rate": 9.566841731724629e-05, + "loss": 0.09104944467544555, + "step": 30530 + }, + { + "epoch": 4.3349893541518805, + "grad_norm": 3.8929500579833984, + "learning_rate": 9.566699787083037e-05, + "loss": 0.0670436143875122, + "step": 30540 + }, + { + "epoch": 4.336408800567779, + "grad_norm": 7.440217018127441, + "learning_rate": 9.566557842441449e-05, + "loss": 0.08899651765823365, + "step": 30550 + }, + { + "epoch": 4.337828246983676, + "grad_norm": 0.19266436994075775, + "learning_rate": 9.566415897799858e-05, + "loss": 0.1018686056137085, + "step": 30560 + }, + { + "epoch": 4.3392476933995745, + "grad_norm": 6.425552845001221, + "learning_rate": 
9.56627395315827e-05, + "loss": 0.06009323000907898, + "step": 30570 + }, + { + "epoch": 4.340667139815472, + "grad_norm": 1.528925895690918, + "learning_rate": 9.566132008516679e-05, + "loss": 0.0655640721321106, + "step": 30580 + }, + { + "epoch": 4.342086586231369, + "grad_norm": 4.253498077392578, + "learning_rate": 9.565990063875089e-05, + "loss": 0.039850252866745, + "step": 30590 + }, + { + "epoch": 4.343506032647268, + "grad_norm": 3.8556296825408936, + "learning_rate": 9.565848119233499e-05, + "loss": 0.09143298268318176, + "step": 30600 + }, + { + "epoch": 4.344925479063165, + "grad_norm": 3.4295029640197754, + "learning_rate": 9.56570617459191e-05, + "loss": 0.06332918405532836, + "step": 30610 + }, + { + "epoch": 4.346344925479063, + "grad_norm": 6.733371734619141, + "learning_rate": 9.565564229950321e-05, + "loss": 0.054583460092544556, + "step": 30620 + }, + { + "epoch": 4.347764371894961, + "grad_norm": 6.199682235717773, + "learning_rate": 9.56542228530873e-05, + "loss": 0.04864825010299682, + "step": 30630 + }, + { + "epoch": 4.349183818310859, + "grad_norm": 1.4338998794555664, + "learning_rate": 9.56528034066714e-05, + "loss": 0.0877656877040863, + "step": 30640 + }, + { + "epoch": 4.3506032647267565, + "grad_norm": 0.36631831526756287, + "learning_rate": 9.56513839602555e-05, + "loss": 0.06484183073043823, + "step": 30650 + }, + { + "epoch": 4.352022711142654, + "grad_norm": 1.5596531629562378, + "learning_rate": 9.564996451383961e-05, + "loss": 0.08244015574455262, + "step": 30660 + }, + { + "epoch": 4.353442157558552, + "grad_norm": 3.677886962890625, + "learning_rate": 9.564854506742371e-05, + "loss": 0.09137284755706787, + "step": 30670 + }, + { + "epoch": 4.35486160397445, + "grad_norm": 0.5668706297874451, + "learning_rate": 9.564712562100782e-05, + "loss": 0.08606833815574647, + "step": 30680 + }, + { + "epoch": 4.356281050390348, + "grad_norm": 2.088705062866211, + "learning_rate": 9.56457061745919e-05, + "loss": 0.053562283515930176, + 
"step": 30690 + }, + { + "epoch": 4.357700496806245, + "grad_norm": 6.448215484619141, + "learning_rate": 9.564428672817601e-05, + "loss": 0.06233048439025879, + "step": 30700 + }, + { + "epoch": 4.359119943222144, + "grad_norm": 1.7031103372573853, + "learning_rate": 9.564286728176012e-05, + "loss": 0.035029086470603946, + "step": 30710 + }, + { + "epoch": 4.360539389638041, + "grad_norm": 7.886801719665527, + "learning_rate": 9.564144783534422e-05, + "loss": 0.13027390241622924, + "step": 30720 + }, + { + "epoch": 4.3619588360539385, + "grad_norm": 3.4515767097473145, + "learning_rate": 9.564002838892833e-05, + "loss": 0.08356902599334717, + "step": 30730 + }, + { + "epoch": 4.363378282469837, + "grad_norm": 2.99210786819458, + "learning_rate": 9.563860894251242e-05, + "loss": 0.09152278900146485, + "step": 30740 + }, + { + "epoch": 4.364797728885734, + "grad_norm": 2.528543710708618, + "learning_rate": 9.563718949609653e-05, + "loss": 0.029031559824943542, + "step": 30750 + }, + { + "epoch": 4.366217175301633, + "grad_norm": 1.8579683303833008, + "learning_rate": 9.563577004968063e-05, + "loss": 0.0629112720489502, + "step": 30760 + }, + { + "epoch": 4.36763662171753, + "grad_norm": 8.009541511535645, + "learning_rate": 9.563435060326474e-05, + "loss": 0.06843248009681702, + "step": 30770 + }, + { + "epoch": 4.369056068133428, + "grad_norm": 9.022368431091309, + "learning_rate": 9.563293115684883e-05, + "loss": 0.07185935974121094, + "step": 30780 + }, + { + "epoch": 4.370475514549326, + "grad_norm": 10.9826021194458, + "learning_rate": 9.563151171043293e-05, + "loss": 0.05564273595809936, + "step": 30790 + }, + { + "epoch": 4.371894960965223, + "grad_norm": 4.034165859222412, + "learning_rate": 9.563009226401704e-05, + "loss": 0.07726762294769288, + "step": 30800 + }, + { + "epoch": 4.373314407381121, + "grad_norm": 1.4694591760635376, + "learning_rate": 9.562867281760114e-05, + "loss": 0.044573986530303956, + "step": 30810 + }, + { + "epoch": 
4.374733853797019, + "grad_norm": 8.172507286071777, + "learning_rate": 9.562725337118525e-05, + "loss": 0.06269959807395935, + "step": 30820 + }, + { + "epoch": 4.376153300212917, + "grad_norm": 2.3326151371002197, + "learning_rate": 9.562583392476935e-05, + "loss": 0.03975181579589844, + "step": 30830 + }, + { + "epoch": 4.377572746628815, + "grad_norm": 1.3543591499328613, + "learning_rate": 9.562441447835344e-05, + "loss": 0.062531578540802, + "step": 30840 + }, + { + "epoch": 4.378992193044713, + "grad_norm": 0.6076768636703491, + "learning_rate": 9.562299503193754e-05, + "loss": 0.06527051329612732, + "step": 30850 + }, + { + "epoch": 4.38041163946061, + "grad_norm": 2.1021621227264404, + "learning_rate": 9.562157558552165e-05, + "loss": 0.03416385054588318, + "step": 30860 + }, + { + "epoch": 4.381831085876508, + "grad_norm": 0.2831692099571228, + "learning_rate": 9.562015613910575e-05, + "loss": 0.051375854015350345, + "step": 30870 + }, + { + "epoch": 4.383250532292406, + "grad_norm": 4.700906753540039, + "learning_rate": 9.561873669268986e-05, + "loss": 0.055076032876968384, + "step": 30880 + }, + { + "epoch": 4.384669978708303, + "grad_norm": 8.541457176208496, + "learning_rate": 9.561731724627396e-05, + "loss": 0.09593217372894287, + "step": 30890 + }, + { + "epoch": 4.386089425124202, + "grad_norm": 3.044889450073242, + "learning_rate": 9.561589779985806e-05, + "loss": 0.08643177151679993, + "step": 30900 + }, + { + "epoch": 4.387508871540099, + "grad_norm": 5.985223293304443, + "learning_rate": 9.561447835344217e-05, + "loss": 0.0780254065990448, + "step": 30910 + }, + { + "epoch": 4.3889283179559975, + "grad_norm": 10.373763084411621, + "learning_rate": 9.561305890702626e-05, + "loss": 0.06224080324172974, + "step": 30920 + }, + { + "epoch": 4.390347764371895, + "grad_norm": 0.9211413264274597, + "learning_rate": 9.561163946061038e-05, + "loss": 0.049274078011512755, + "step": 30930 + }, + { + "epoch": 4.391767210787792, + "grad_norm": 
6.857527732849121, + "learning_rate": 9.561022001419447e-05, + "loss": 0.08677098751068116, + "step": 30940 + }, + { + "epoch": 4.393186657203691, + "grad_norm": 4.365863800048828, + "learning_rate": 9.560880056777857e-05, + "loss": 0.09109071493148804, + "step": 30950 + }, + { + "epoch": 4.394606103619588, + "grad_norm": 0.203486829996109, + "learning_rate": 9.560738112136267e-05, + "loss": 0.044797495007514954, + "step": 30960 + }, + { + "epoch": 4.396025550035486, + "grad_norm": 0.31215134263038635, + "learning_rate": 9.560596167494678e-05, + "loss": 0.0890379548072815, + "step": 30970 + }, + { + "epoch": 4.397444996451384, + "grad_norm": 0.34271034598350525, + "learning_rate": 9.560454222853088e-05, + "loss": 0.07275915145874023, + "step": 30980 + }, + { + "epoch": 4.398864442867282, + "grad_norm": 4.180636882781982, + "learning_rate": 9.560312278211499e-05, + "loss": 0.03852737843990326, + "step": 30990 + }, + { + "epoch": 4.4002838892831795, + "grad_norm": 0.41002199053764343, + "learning_rate": 9.560170333569908e-05, + "loss": 0.07415003180503846, + "step": 31000 + }, + { + "epoch": 4.4002838892831795, + "eval_accuracy": 0.9713232021364532, + "eval_loss": 0.0869520977139473, + "eval_runtime": 34.925, + "eval_samples_per_second": 450.307, + "eval_steps_per_second": 14.087, + "step": 31000 + }, + { + "epoch": 4.401703335699077, + "grad_norm": 0.16246958076953888, + "learning_rate": 9.560028388928318e-05, + "loss": 0.04245265126228333, + "step": 31010 + }, + { + "epoch": 4.403122782114975, + "grad_norm": 3.3791964054107666, + "learning_rate": 9.559886444286729e-05, + "loss": 0.08227800130844116, + "step": 31020 + }, + { + "epoch": 4.404542228530873, + "grad_norm": 0.25004979968070984, + "learning_rate": 9.559744499645139e-05, + "loss": 0.05259315967559815, + "step": 31030 + }, + { + "epoch": 4.405961674946771, + "grad_norm": 5.847822666168213, + "learning_rate": 9.55960255500355e-05, + "loss": 0.08936032056808471, + "step": 31040 + }, + { + "epoch": 
4.407381121362668, + "grad_norm": 0.43625226616859436, + "learning_rate": 9.559460610361958e-05, + "loss": 0.06444858312606812, + "step": 31050 + }, + { + "epoch": 4.408800567778567, + "grad_norm": 6.678699493408203, + "learning_rate": 9.55931866572037e-05, + "loss": 0.09537352323532104, + "step": 31060 + }, + { + "epoch": 4.410220014194464, + "grad_norm": 11.696027755737305, + "learning_rate": 9.559176721078779e-05, + "loss": 0.11680700778961181, + "step": 31070 + }, + { + "epoch": 4.4116394606103615, + "grad_norm": 6.337532043457031, + "learning_rate": 9.55903477643719e-05, + "loss": 0.04228192269802093, + "step": 31080 + }, + { + "epoch": 4.41305890702626, + "grad_norm": 1.4329833984375, + "learning_rate": 9.5588928317956e-05, + "loss": 0.03295584321022034, + "step": 31090 + }, + { + "epoch": 4.414478353442157, + "grad_norm": 7.40330171585083, + "learning_rate": 9.55875088715401e-05, + "loss": 0.07505521774291993, + "step": 31100 + }, + { + "epoch": 4.4158977998580555, + "grad_norm": 0.8913317918777466, + "learning_rate": 9.558608942512421e-05, + "loss": 0.11474095582962036, + "step": 31110 + }, + { + "epoch": 4.417317246273953, + "grad_norm": 2.8797833919525146, + "learning_rate": 9.55846699787083e-05, + "loss": 0.04045186638832092, + "step": 31120 + }, + { + "epoch": 4.418736692689851, + "grad_norm": 7.083196640014648, + "learning_rate": 9.558325053229242e-05, + "loss": 0.08086020350456238, + "step": 31130 + }, + { + "epoch": 4.420156139105749, + "grad_norm": 7.3477349281311035, + "learning_rate": 9.558183108587652e-05, + "loss": 0.09126662015914917, + "step": 31140 + }, + { + "epoch": 4.421575585521646, + "grad_norm": 2.6615936756134033, + "learning_rate": 9.558041163946061e-05, + "loss": 0.11547311544418334, + "step": 31150 + }, + { + "epoch": 4.422995031937544, + "grad_norm": 0.38098084926605225, + "learning_rate": 9.557899219304471e-05, + "loss": 0.09409705996513366, + "step": 31160 + }, + { + "epoch": 4.424414478353442, + "grad_norm": 4.258413314819336, + 
"learning_rate": 9.557757274662882e-05, + "loss": 0.07088276147842407, + "step": 31170 + }, + { + "epoch": 4.42583392476934, + "grad_norm": 0.6552639603614807, + "learning_rate": 9.557615330021292e-05, + "loss": 0.0480211079120636, + "step": 31180 + }, + { + "epoch": 4.4272533711852375, + "grad_norm": 0.469436377286911, + "learning_rate": 9.557473385379703e-05, + "loss": 0.03103916049003601, + "step": 31190 + }, + { + "epoch": 4.428672817601136, + "grad_norm": 0.1175021231174469, + "learning_rate": 9.557331440738113e-05, + "loss": 0.050581169128417966, + "step": 31200 + }, + { + "epoch": 4.430092264017033, + "grad_norm": 2.0325958728790283, + "learning_rate": 9.557189496096522e-05, + "loss": 0.08802063465118408, + "step": 31210 + }, + { + "epoch": 4.431511710432932, + "grad_norm": 3.3236446380615234, + "learning_rate": 9.557047551454933e-05, + "loss": 0.040520912408828734, + "step": 31220 + }, + { + "epoch": 4.432931156848829, + "grad_norm": 6.1703596115112305, + "learning_rate": 9.556905606813343e-05, + "loss": 0.051608556509017946, + "step": 31230 + }, + { + "epoch": 4.434350603264726, + "grad_norm": 4.604761123657227, + "learning_rate": 9.556763662171754e-05, + "loss": 0.06144062280654907, + "step": 31240 + }, + { + "epoch": 4.435770049680625, + "grad_norm": 11.01284408569336, + "learning_rate": 9.556621717530164e-05, + "loss": 0.16597810983657837, + "step": 31250 + }, + { + "epoch": 4.437189496096522, + "grad_norm": 8.154959678649902, + "learning_rate": 9.556479772888574e-05, + "loss": 0.048260855674743655, + "step": 31260 + }, + { + "epoch": 4.43860894251242, + "grad_norm": 8.059030532836914, + "learning_rate": 9.556337828246984e-05, + "loss": 0.1349133014678955, + "step": 31270 + }, + { + "epoch": 4.440028388928318, + "grad_norm": 3.50076961517334, + "learning_rate": 9.556195883605395e-05, + "loss": 0.1305612087249756, + "step": 31280 + }, + { + "epoch": 4.441447835344216, + "grad_norm": 2.289435386657715, + "learning_rate": 9.556053938963804e-05, + "loss": 
0.05613391995429993, + "step": 31290 + }, + { + "epoch": 4.442867281760114, + "grad_norm": 1.1918927431106567, + "learning_rate": 9.555911994322215e-05, + "loss": 0.03883382380008697, + "step": 31300 + }, + { + "epoch": 4.444286728176011, + "grad_norm": 2.9775784015655518, + "learning_rate": 9.555770049680625e-05, + "loss": 0.06229335069656372, + "step": 31310 + }, + { + "epoch": 4.445706174591909, + "grad_norm": 11.473634719848633, + "learning_rate": 9.555628105039035e-05, + "loss": 0.10125520229339599, + "step": 31320 + }, + { + "epoch": 4.447125621007807, + "grad_norm": 0.14798550307750702, + "learning_rate": 9.555486160397446e-05, + "loss": 0.042588675022125246, + "step": 31330 + }, + { + "epoch": 4.448545067423705, + "grad_norm": 0.5757963061332703, + "learning_rate": 9.555344215755856e-05, + "loss": 0.04882889091968536, + "step": 31340 + }, + { + "epoch": 4.4499645138396025, + "grad_norm": 3.5108330249786377, + "learning_rate": 9.555202271114267e-05, + "loss": 0.11374999284744262, + "step": 31350 + }, + { + "epoch": 4.451383960255501, + "grad_norm": 0.21131829917430878, + "learning_rate": 9.555060326472675e-05, + "loss": 0.06701189875602723, + "step": 31360 + }, + { + "epoch": 4.452803406671398, + "grad_norm": 0.859306812286377, + "learning_rate": 9.554918381831086e-05, + "loss": 0.059961211681365964, + "step": 31370 + }, + { + "epoch": 4.454222853087296, + "grad_norm": 10.532944679260254, + "learning_rate": 9.554776437189496e-05, + "loss": 0.07974779605865479, + "step": 31380 + }, + { + "epoch": 4.455642299503194, + "grad_norm": 16.47031593322754, + "learning_rate": 9.554634492547907e-05, + "loss": 0.0871815800666809, + "step": 31390 + }, + { + "epoch": 4.457061745919091, + "grad_norm": 1.5741618871688843, + "learning_rate": 9.554492547906317e-05, + "loss": 0.036674460768699645, + "step": 31400 + }, + { + "epoch": 4.45848119233499, + "grad_norm": 0.6788744330406189, + "learning_rate": 9.554350603264727e-05, + "loss": 0.06158609390258789, + "step": 31410 + }, 
+ { + "epoch": 4.459900638750887, + "grad_norm": 8.353523254394531, + "learning_rate": 9.554208658623138e-05, + "loss": 0.037455737590789795, + "step": 31420 + }, + { + "epoch": 4.461320085166785, + "grad_norm": 5.952844142913818, + "learning_rate": 9.554066713981547e-05, + "loss": 0.06949056386947632, + "step": 31430 + }, + { + "epoch": 4.462739531582683, + "grad_norm": 6.8104143142700195, + "learning_rate": 9.553924769339959e-05, + "loss": 0.04634210467338562, + "step": 31440 + }, + { + "epoch": 4.46415897799858, + "grad_norm": 1.9237818717956543, + "learning_rate": 9.553782824698368e-05, + "loss": 0.045291933417320254, + "step": 31450 + }, + { + "epoch": 4.4655784244144785, + "grad_norm": 1.631089448928833, + "learning_rate": 9.553640880056778e-05, + "loss": 0.05780811309814453, + "step": 31460 + }, + { + "epoch": 4.466997870830376, + "grad_norm": 1.2360000610351562, + "learning_rate": 9.553498935415188e-05, + "loss": 0.047595706582069394, + "step": 31470 + }, + { + "epoch": 4.468417317246274, + "grad_norm": 4.273767471313477, + "learning_rate": 9.553356990773599e-05, + "loss": 0.029149368405342102, + "step": 31480 + }, + { + "epoch": 4.469836763662172, + "grad_norm": 5.198413848876953, + "learning_rate": 9.553215046132009e-05, + "loss": 0.08129512071609497, + "step": 31490 + }, + { + "epoch": 4.47125621007807, + "grad_norm": 1.2532944679260254, + "learning_rate": 9.55307310149042e-05, + "loss": 0.06522347927093505, + "step": 31500 + }, + { + "epoch": 4.47125621007807, + "eval_accuracy": 0.9718318814777135, + "eval_loss": 0.08366374671459198, + "eval_runtime": 34.8059, + "eval_samples_per_second": 451.849, + "eval_steps_per_second": 14.136, + "step": 31500 + }, + { + "epoch": 4.472675656493967, + "grad_norm": 2.549074411392212, + "learning_rate": 9.55293115684883e-05, + "loss": 0.07184516191482544, + "step": 31510 + }, + { + "epoch": 4.474095102909865, + "grad_norm": 0.25022396445274353, + "learning_rate": 9.552789212207239e-05, + "loss": 0.10292150974273681, + 
"step": 31520 + }, + { + "epoch": 4.475514549325763, + "grad_norm": 3.1270029544830322, + "learning_rate": 9.55264726756565e-05, + "loss": 0.04953309595584869, + "step": 31530 + }, + { + "epoch": 4.4769339957416605, + "grad_norm": 0.2376168668270111, + "learning_rate": 9.55250532292406e-05, + "loss": 0.029788446426391602, + "step": 31540 + }, + { + "epoch": 4.478353442157559, + "grad_norm": 0.9615975618362427, + "learning_rate": 9.552363378282471e-05, + "loss": 0.04785667657852173, + "step": 31550 + }, + { + "epoch": 4.479772888573456, + "grad_norm": 3.677119255065918, + "learning_rate": 9.552221433640881e-05, + "loss": 0.04802153706550598, + "step": 31560 + }, + { + "epoch": 4.4811923349893545, + "grad_norm": 5.188876152038574, + "learning_rate": 9.55207948899929e-05, + "loss": 0.054722541570663454, + "step": 31570 + }, + { + "epoch": 4.482611781405252, + "grad_norm": 4.769455909729004, + "learning_rate": 9.5519375443577e-05, + "loss": 0.116557776927948, + "step": 31580 + }, + { + "epoch": 4.484031227821149, + "grad_norm": 9.027771949768066, + "learning_rate": 9.551795599716111e-05, + "loss": 0.13705774545669555, + "step": 31590 + }, + { + "epoch": 4.485450674237048, + "grad_norm": 5.028712749481201, + "learning_rate": 9.551653655074521e-05, + "loss": 0.07994774580001832, + "step": 31600 + }, + { + "epoch": 4.486870120652945, + "grad_norm": 3.692356824874878, + "learning_rate": 9.551511710432932e-05, + "loss": 0.049330982565879825, + "step": 31610 + }, + { + "epoch": 4.488289567068843, + "grad_norm": 3.6784050464630127, + "learning_rate": 9.551369765791342e-05, + "loss": 0.03654472827911377, + "step": 31620 + }, + { + "epoch": 4.489709013484741, + "grad_norm": 2.1743340492248535, + "learning_rate": 9.551227821149752e-05, + "loss": 0.11464802026748658, + "step": 31630 + }, + { + "epoch": 4.491128459900639, + "grad_norm": 9.176986694335938, + "learning_rate": 9.551085876508163e-05, + "loss": 0.11709423065185547, + "step": 31640 + }, + { + "epoch": 
4.4925479063165366, + "grad_norm": 4.8150811195373535, + "learning_rate": 9.550943931866573e-05, + "loss": 0.06142100691795349, + "step": 31650 + }, + { + "epoch": 4.493967352732434, + "grad_norm": 1.1660048961639404, + "learning_rate": 9.550801987224984e-05, + "loss": 0.05719171762466431, + "step": 31660 + }, + { + "epoch": 4.495386799148332, + "grad_norm": 0.7825644016265869, + "learning_rate": 9.550660042583392e-05, + "loss": 0.08388499021530152, + "step": 31670 + }, + { + "epoch": 4.49680624556423, + "grad_norm": 2.4386682510375977, + "learning_rate": 9.550518097941803e-05, + "loss": 0.08802780508995056, + "step": 31680 + }, + { + "epoch": 4.498225691980128, + "grad_norm": 2.295851230621338, + "learning_rate": 9.550376153300213e-05, + "loss": 0.055353093147277835, + "step": 31690 + }, + { + "epoch": 4.499645138396025, + "grad_norm": 6.695101737976074, + "learning_rate": 9.550234208658624e-05, + "loss": 0.11167588233947753, + "step": 31700 + }, + { + "epoch": 4.501064584811924, + "grad_norm": 2.08304500579834, + "learning_rate": 9.550092264017034e-05, + "loss": 0.07536173462867737, + "step": 31710 + }, + { + "epoch": 4.502484031227821, + "grad_norm": 6.953207969665527, + "learning_rate": 9.549950319375443e-05, + "loss": 0.06825330853462219, + "step": 31720 + }, + { + "epoch": 4.503903477643719, + "grad_norm": 2.6377944946289062, + "learning_rate": 9.549808374733854e-05, + "loss": 0.06334355473518372, + "step": 31730 + }, + { + "epoch": 4.505322924059617, + "grad_norm": 3.363705635070801, + "learning_rate": 9.549666430092264e-05, + "loss": 0.18799341917037965, + "step": 31740 + }, + { + "epoch": 4.506742370475514, + "grad_norm": 1.8969464302062988, + "learning_rate": 9.549524485450675e-05, + "loss": 0.04989268779754639, + "step": 31750 + }, + { + "epoch": 4.508161816891413, + "grad_norm": 1.185655951499939, + "learning_rate": 9.549382540809085e-05, + "loss": 0.08766244649887085, + "step": 31760 + }, + { + "epoch": 4.50958126330731, + "grad_norm": 
2.129251003265381, + "learning_rate": 9.549240596167495e-05, + "loss": 0.06428298950195313, + "step": 31770 + }, + { + "epoch": 4.511000709723208, + "grad_norm": 1.2739909887313843, + "learning_rate": 9.549098651525905e-05, + "loss": 0.08616969585418702, + "step": 31780 + }, + { + "epoch": 4.512420156139106, + "grad_norm": 7.885573863983154, + "learning_rate": 9.548956706884316e-05, + "loss": 0.07006771564483642, + "step": 31790 + }, + { + "epoch": 4.513839602555003, + "grad_norm": 4.777563095092773, + "learning_rate": 9.548814762242725e-05, + "loss": 0.05692494511604309, + "step": 31800 + }, + { + "epoch": 4.5152590489709015, + "grad_norm": 1.3167340755462646, + "learning_rate": 9.548672817601136e-05, + "loss": 0.07449572682380676, + "step": 31810 + }, + { + "epoch": 4.516678495386799, + "grad_norm": 5.693206310272217, + "learning_rate": 9.548530872959546e-05, + "loss": 0.05288561582565308, + "step": 31820 + }, + { + "epoch": 4.518097941802697, + "grad_norm": 5.757571220397949, + "learning_rate": 9.548403122782116e-05, + "loss": 0.07998919486999512, + "step": 31830 + }, + { + "epoch": 4.519517388218595, + "grad_norm": 4.328059196472168, + "learning_rate": 9.548261178140526e-05, + "loss": 0.05517913699150086, + "step": 31840 + }, + { + "epoch": 4.520936834634493, + "grad_norm": 0.796028733253479, + "learning_rate": 9.548119233498936e-05, + "loss": 0.047811633348464964, + "step": 31850 + }, + { + "epoch": 4.52235628105039, + "grad_norm": 5.320539951324463, + "learning_rate": 9.547977288857347e-05, + "loss": 0.06639637351036072, + "step": 31860 + }, + { + "epoch": 4.523775727466289, + "grad_norm": 3.544234275817871, + "learning_rate": 9.547835344215756e-05, + "loss": 0.11271839141845703, + "step": 31870 + }, + { + "epoch": 4.525195173882186, + "grad_norm": 3.894627094268799, + "learning_rate": 9.547693399574167e-05, + "loss": 0.0395465612411499, + "step": 31880 + }, + { + "epoch": 4.5266146202980835, + "grad_norm": 4.7489752769470215, + "learning_rate": 
9.547551454932577e-05, + "loss": 0.15020207166671753, + "step": 31890 + }, + { + "epoch": 4.528034066713982, + "grad_norm": 2.853905200958252, + "learning_rate": 9.547409510290987e-05, + "loss": 0.08104996681213379, + "step": 31900 + }, + { + "epoch": 4.529453513129879, + "grad_norm": 2.5537586212158203, + "learning_rate": 9.547267565649397e-05, + "loss": 0.08797919750213623, + "step": 31910 + }, + { + "epoch": 4.5308729595457775, + "grad_norm": 3.720608711242676, + "learning_rate": 9.547125621007808e-05, + "loss": 0.12364702224731446, + "step": 31920 + }, + { + "epoch": 4.532292405961675, + "grad_norm": 5.040463447570801, + "learning_rate": 9.546983676366217e-05, + "loss": 0.05389441847801209, + "step": 31930 + }, + { + "epoch": 4.533711852377573, + "grad_norm": 9.118657112121582, + "learning_rate": 9.546841731724629e-05, + "loss": 0.10287294387817383, + "step": 31940 + }, + { + "epoch": 4.535131298793471, + "grad_norm": 9.760089874267578, + "learning_rate": 9.546699787083038e-05, + "loss": 0.10598251819610596, + "step": 31950 + }, + { + "epoch": 4.536550745209368, + "grad_norm": 5.44436502456665, + "learning_rate": 9.546557842441448e-05, + "loss": 0.062908136844635, + "step": 31960 + }, + { + "epoch": 4.537970191625266, + "grad_norm": 7.934990882873535, + "learning_rate": 9.546415897799859e-05, + "loss": 0.06037212014198303, + "step": 31970 + }, + { + "epoch": 4.539389638041164, + "grad_norm": 6.5772480964660645, + "learning_rate": 9.546273953158269e-05, + "loss": 0.07325562834739685, + "step": 31980 + }, + { + "epoch": 4.540809084457062, + "grad_norm": 6.199497699737549, + "learning_rate": 9.54613200851668e-05, + "loss": 0.06814472675323487, + "step": 31990 + }, + { + "epoch": 4.5422285308729595, + "grad_norm": 1.220271348953247, + "learning_rate": 9.545990063875088e-05, + "loss": 0.11154254674911498, + "step": 32000 + }, + { + "epoch": 4.5422285308729595, + "eval_accuracy": 0.9696699942773574, + "eval_loss": 0.09096228331327438, + "eval_runtime": 35.0606, + 
"eval_samples_per_second": 448.566, + "eval_steps_per_second": 14.033, + "step": 32000 + }, + { + "epoch": 4.543647977288858, + "grad_norm": 1.179792046546936, + "learning_rate": 9.5458481192335e-05, + "loss": 0.033092746138572694, + "step": 32010 + }, + { + "epoch": 4.545067423704755, + "grad_norm": 4.7662553787231445, + "learning_rate": 9.545706174591909e-05, + "loss": 0.05603760480880737, + "step": 32020 + }, + { + "epoch": 4.546486870120653, + "grad_norm": 4.585840225219727, + "learning_rate": 9.54556422995032e-05, + "loss": 0.07328543066978455, + "step": 32030 + }, + { + "epoch": 4.547906316536551, + "grad_norm": 8.541946411132812, + "learning_rate": 9.54542228530873e-05, + "loss": 0.06177548766136169, + "step": 32040 + }, + { + "epoch": 4.549325762952448, + "grad_norm": 6.825544834136963, + "learning_rate": 9.54528034066714e-05, + "loss": 0.03726685047149658, + "step": 32050 + }, + { + "epoch": 4.550745209368347, + "grad_norm": 6.649820804595947, + "learning_rate": 9.545138396025551e-05, + "loss": 0.05233697891235352, + "step": 32060 + }, + { + "epoch": 4.552164655784244, + "grad_norm": 5.592628479003906, + "learning_rate": 9.54499645138396e-05, + "loss": 0.09716169834136963, + "step": 32070 + }, + { + "epoch": 4.553584102200142, + "grad_norm": 7.0894975662231445, + "learning_rate": 9.544854506742372e-05, + "loss": 0.08125547766685486, + "step": 32080 + }, + { + "epoch": 4.55500354861604, + "grad_norm": 8.739372253417969, + "learning_rate": 9.544712562100781e-05, + "loss": 0.13002804517745972, + "step": 32090 + }, + { + "epoch": 4.556422995031937, + "grad_norm": 6.633364677429199, + "learning_rate": 9.544570617459191e-05, + "loss": 0.0681275725364685, + "step": 32100 + }, + { + "epoch": 4.557842441447836, + "grad_norm": 6.804000377655029, + "learning_rate": 9.544428672817601e-05, + "loss": 0.04156455099582672, + "step": 32110 + }, + { + "epoch": 4.559261887863733, + "grad_norm": 0.5662475228309631, + "learning_rate": 9.544286728176012e-05, + "loss": 
0.02463839203119278, + "step": 32120 + }, + { + "epoch": 4.560681334279631, + "grad_norm": 0.864768385887146, + "learning_rate": 9.544144783534422e-05, + "loss": 0.0701153576374054, + "step": 32130 + }, + { + "epoch": 4.562100780695529, + "grad_norm": 1.1445688009262085, + "learning_rate": 9.544002838892833e-05, + "loss": 0.023342382907867432, + "step": 32140 + }, + { + "epoch": 4.563520227111427, + "grad_norm": 1.1171879768371582, + "learning_rate": 9.543860894251243e-05, + "loss": 0.04527752697467804, + "step": 32150 + }, + { + "epoch": 4.564939673527324, + "grad_norm": 5.467402458190918, + "learning_rate": 9.543718949609652e-05, + "loss": 0.05509437322616577, + "step": 32160 + }, + { + "epoch": 4.566359119943222, + "grad_norm": 2.1214330196380615, + "learning_rate": 9.543577004968063e-05, + "loss": 0.03169718384742737, + "step": 32170 + }, + { + "epoch": 4.56777856635912, + "grad_norm": 6.447783946990967, + "learning_rate": 9.543435060326473e-05, + "loss": 0.07171815037727355, + "step": 32180 + }, + { + "epoch": 4.569198012775018, + "grad_norm": 1.2676538228988647, + "learning_rate": 9.543293115684884e-05, + "loss": 0.06441798210144042, + "step": 32190 + }, + { + "epoch": 4.570617459190916, + "grad_norm": 3.4596071243286133, + "learning_rate": 9.543151171043293e-05, + "loss": 0.07644802331924438, + "step": 32200 + }, + { + "epoch": 4.572036905606813, + "grad_norm": 4.619117259979248, + "learning_rate": 9.543009226401704e-05, + "loss": 0.04569814503192902, + "step": 32210 + }, + { + "epoch": 4.573456352022712, + "grad_norm": 0.3759884834289551, + "learning_rate": 9.542867281760113e-05, + "loss": 0.11063082218170166, + "step": 32220 + }, + { + "epoch": 4.574875798438609, + "grad_norm": 0.39691707491874695, + "learning_rate": 9.542725337118525e-05, + "loss": 0.06378236413002014, + "step": 32230 + }, + { + "epoch": 4.5762952448545064, + "grad_norm": 2.744389533996582, + "learning_rate": 9.542583392476934e-05, + "loss": 0.11052513122558594, + "step": 32240 + }, + { + 
"epoch": 4.577714691270405, + "grad_norm": 7.465341091156006, + "learning_rate": 9.542441447835345e-05, + "loss": 0.07533521056175232, + "step": 32250 + }, + { + "epoch": 4.579134137686302, + "grad_norm": 1.2778067588806152, + "learning_rate": 9.542299503193755e-05, + "loss": 0.05032462477684021, + "step": 32260 + }, + { + "epoch": 4.5805535841022005, + "grad_norm": 5.098127365112305, + "learning_rate": 9.542157558552165e-05, + "loss": 0.061452174186706544, + "step": 32270 + }, + { + "epoch": 4.581973030518098, + "grad_norm": 11.057262420654297, + "learning_rate": 9.542015613910576e-05, + "loss": 0.08994705080986024, + "step": 32280 + }, + { + "epoch": 4.583392476933996, + "grad_norm": 0.9384304881095886, + "learning_rate": 9.541873669268986e-05, + "loss": 0.05673693418502808, + "step": 32290 + }, + { + "epoch": 4.584811923349894, + "grad_norm": 7.694385528564453, + "learning_rate": 9.541731724627397e-05, + "loss": 0.1050719141960144, + "step": 32300 + }, + { + "epoch": 4.586231369765791, + "grad_norm": 1.4772675037384033, + "learning_rate": 9.541589779985805e-05, + "loss": 0.0497598260641098, + "step": 32310 + }, + { + "epoch": 4.587650816181689, + "grad_norm": 0.905949056148529, + "learning_rate": 9.541447835344216e-05, + "loss": 0.05204020738601685, + "step": 32320 + }, + { + "epoch": 4.589070262597587, + "grad_norm": 5.82963228225708, + "learning_rate": 9.541305890702626e-05, + "loss": 0.04697149097919464, + "step": 32330 + }, + { + "epoch": 4.590489709013485, + "grad_norm": 5.241878032684326, + "learning_rate": 9.541163946061037e-05, + "loss": 0.06778921484947205, + "step": 32340 + }, + { + "epoch": 4.5919091554293825, + "grad_norm": 0.09300513565540314, + "learning_rate": 9.541022001419447e-05, + "loss": 0.09513026475906372, + "step": 32350 + }, + { + "epoch": 4.593328601845281, + "grad_norm": 4.091070175170898, + "learning_rate": 9.540880056777857e-05, + "loss": 0.048994243144989014, + "step": 32360 + }, + { + "epoch": 4.594748048261178, + "grad_norm": 
7.305341720581055, + "learning_rate": 9.540738112136268e-05, + "loss": 0.11231958866119385, + "step": 32370 + }, + { + "epoch": 4.596167494677076, + "grad_norm": 1.8714683055877686, + "learning_rate": 9.540596167494677e-05, + "loss": 0.10454151630401612, + "step": 32380 + }, + { + "epoch": 4.597586941092974, + "grad_norm": 7.5027618408203125, + "learning_rate": 9.540454222853088e-05, + "loss": 0.0959784746170044, + "step": 32390 + }, + { + "epoch": 4.599006387508871, + "grad_norm": 11.03409481048584, + "learning_rate": 9.540312278211498e-05, + "loss": 0.05968649983406067, + "step": 32400 + }, + { + "epoch": 4.60042583392477, + "grad_norm": 4.425593852996826, + "learning_rate": 9.540170333569908e-05, + "loss": 0.05584191679954529, + "step": 32410 + }, + { + "epoch": 4.601845280340667, + "grad_norm": 0.5005938410758972, + "learning_rate": 9.540028388928318e-05, + "loss": 0.07238389253616333, + "step": 32420 + }, + { + "epoch": 4.603264726756565, + "grad_norm": 9.244565963745117, + "learning_rate": 9.539886444286729e-05, + "loss": 0.05424014329910278, + "step": 32430 + }, + { + "epoch": 4.604684173172463, + "grad_norm": 5.8151445388793945, + "learning_rate": 9.539744499645138e-05, + "loss": 0.08108783960342407, + "step": 32440 + }, + { + "epoch": 4.60610361958836, + "grad_norm": 0.5076402425765991, + "learning_rate": 9.53960255500355e-05, + "loss": 0.11353771686553955, + "step": 32450 + }, + { + "epoch": 4.6075230660042585, + "grad_norm": 7.542196750640869, + "learning_rate": 9.539460610361959e-05, + "loss": 0.07132243514060974, + "step": 32460 + }, + { + "epoch": 4.608942512420156, + "grad_norm": 1.686920404434204, + "learning_rate": 9.539318665720369e-05, + "loss": 0.054832571744918825, + "step": 32470 + }, + { + "epoch": 4.610361958836054, + "grad_norm": 8.699249267578125, + "learning_rate": 9.53917672107878e-05, + "loss": 0.08153939843177796, + "step": 32480 + }, + { + "epoch": 4.611781405251952, + "grad_norm": 0.41042953729629517, + "learning_rate": 
9.53903477643719e-05, + "loss": 0.054275840520858765, + "step": 32490 + }, + { + "epoch": 4.61320085166785, + "grad_norm": 10.090995788574219, + "learning_rate": 9.538892831795601e-05, + "loss": 0.13572399616241454, + "step": 32500 + }, + { + "epoch": 4.61320085166785, + "eval_accuracy": 0.9771094296432886, + "eval_loss": 0.06673765182495117, + "eval_runtime": 36.3248, + "eval_samples_per_second": 432.955, + "eval_steps_per_second": 13.544, + "step": 32500 + }, + { + "epoch": 4.614620298083747, + "grad_norm": 4.7882819175720215, + "learning_rate": 9.53875088715401e-05, + "loss": 0.05213452577590942, + "step": 32510 + }, + { + "epoch": 4.616039744499645, + "grad_norm": 3.912700653076172, + "learning_rate": 9.53860894251242e-05, + "loss": 0.05222201347351074, + "step": 32520 + }, + { + "epoch": 4.617459190915543, + "grad_norm": 4.201282024383545, + "learning_rate": 9.53846699787083e-05, + "loss": 0.07659255862236022, + "step": 32530 + }, + { + "epoch": 4.6188786373314406, + "grad_norm": 0.3415137529373169, + "learning_rate": 9.538325053229241e-05, + "loss": 0.07810273170471191, + "step": 32540 + }, + { + "epoch": 4.620298083747339, + "grad_norm": 2.1429755687713623, + "learning_rate": 9.538183108587651e-05, + "loss": 0.10537341833114625, + "step": 32550 + }, + { + "epoch": 4.621717530163236, + "grad_norm": 5.316372394561768, + "learning_rate": 9.538041163946061e-05, + "loss": 0.04490730464458466, + "step": 32560 + }, + { + "epoch": 4.623136976579135, + "grad_norm": 11.811970710754395, + "learning_rate": 9.537899219304472e-05, + "loss": 0.08929653763771057, + "step": 32570 + }, + { + "epoch": 4.624556422995032, + "grad_norm": 1.3656327724456787, + "learning_rate": 9.537757274662882e-05, + "loss": 0.028151947259902953, + "step": 32580 + }, + { + "epoch": 4.625975869410929, + "grad_norm": 4.044888973236084, + "learning_rate": 9.537615330021293e-05, + "loss": 0.09239206910133362, + "step": 32590 + }, + { + "epoch": 4.627395315826828, + "grad_norm": 0.7978225350379944, + 
"learning_rate": 9.537473385379702e-05, + "loss": 0.030372977256774902, + "step": 32600 + }, + { + "epoch": 4.628814762242725, + "grad_norm": 0.3269311785697937, + "learning_rate": 9.537331440738114e-05, + "loss": 0.06625695824623108, + "step": 32610 + }, + { + "epoch": 4.6302342086586235, + "grad_norm": 0.9531834721565247, + "learning_rate": 9.537189496096522e-05, + "loss": 0.02768568992614746, + "step": 32620 + }, + { + "epoch": 4.631653655074521, + "grad_norm": 11.829211235046387, + "learning_rate": 9.537047551454933e-05, + "loss": 0.07944384813308716, + "step": 32630 + }, + { + "epoch": 4.633073101490419, + "grad_norm": 1.9724338054656982, + "learning_rate": 9.536905606813343e-05, + "loss": 0.05105016231536865, + "step": 32640 + }, + { + "epoch": 4.634492547906317, + "grad_norm": 5.076968669891357, + "learning_rate": 9.536763662171754e-05, + "loss": 0.03969843983650208, + "step": 32650 + }, + { + "epoch": 4.635911994322214, + "grad_norm": 5.4567437171936035, + "learning_rate": 9.536621717530164e-05, + "loss": 0.07335436344146729, + "step": 32660 + }, + { + "epoch": 4.637331440738112, + "grad_norm": 0.4687504470348358, + "learning_rate": 9.536479772888573e-05, + "loss": 0.08072584271430969, + "step": 32670 + }, + { + "epoch": 4.63875088715401, + "grad_norm": 10.989912986755371, + "learning_rate": 9.536337828246984e-05, + "loss": 0.13782428503036498, + "step": 32680 + }, + { + "epoch": 4.640170333569908, + "grad_norm": 10.409384727478027, + "learning_rate": 9.536195883605394e-05, + "loss": 0.07241227626800537, + "step": 32690 + }, + { + "epoch": 4.6415897799858055, + "grad_norm": 5.019614219665527, + "learning_rate": 9.536053938963805e-05, + "loss": 0.053052467107772824, + "step": 32700 + }, + { + "epoch": 4.643009226401704, + "grad_norm": 5.739016056060791, + "learning_rate": 9.535911994322215e-05, + "loss": 0.09582682847976684, + "step": 32710 + }, + { + "epoch": 4.644428672817601, + "grad_norm": 7.529049396514893, + "learning_rate": 9.535770049680625e-05, + 
"loss": 0.025439244508743287, + "step": 32720 + }, + { + "epoch": 4.645848119233499, + "grad_norm": 4.5299835205078125, + "learning_rate": 9.535628105039034e-05, + "loss": 0.04896810054779053, + "step": 32730 + }, + { + "epoch": 4.647267565649397, + "grad_norm": 1.1995518207550049, + "learning_rate": 9.535486160397446e-05, + "loss": 0.05114521980285645, + "step": 32740 + }, + { + "epoch": 4.648687012065294, + "grad_norm": 5.880154609680176, + "learning_rate": 9.535344215755855e-05, + "loss": 0.09556569457054138, + "step": 32750 + }, + { + "epoch": 4.650106458481193, + "grad_norm": 5.677839279174805, + "learning_rate": 9.535202271114266e-05, + "loss": 0.0534260630607605, + "step": 32760 + }, + { + "epoch": 4.65152590489709, + "grad_norm": 0.137897327542305, + "learning_rate": 9.535060326472676e-05, + "loss": 0.07294431924819947, + "step": 32770 + }, + { + "epoch": 4.652945351312988, + "grad_norm": 2.987269639968872, + "learning_rate": 9.534918381831086e-05, + "loss": 0.05872194766998291, + "step": 32780 + }, + { + "epoch": 4.654364797728886, + "grad_norm": 9.59643840789795, + "learning_rate": 9.534776437189497e-05, + "loss": 0.09373766779899598, + "step": 32790 + }, + { + "epoch": 4.655784244144783, + "grad_norm": 5.86336088180542, + "learning_rate": 9.534634492547907e-05, + "loss": 0.09089065194129944, + "step": 32800 + }, + { + "epoch": 4.6572036905606815, + "grad_norm": 9.171552658081055, + "learning_rate": 9.534492547906318e-05, + "loss": 0.04677242934703827, + "step": 32810 + }, + { + "epoch": 4.658623136976579, + "grad_norm": 8.410407066345215, + "learning_rate": 9.534350603264726e-05, + "loss": 0.045021438598632814, + "step": 32820 + }, + { + "epoch": 4.660042583392477, + "grad_norm": 8.053443908691406, + "learning_rate": 9.534208658623137e-05, + "loss": 0.06634845733642578, + "step": 32830 + }, + { + "epoch": 4.661462029808375, + "grad_norm": 10.485276222229004, + "learning_rate": 9.534066713981547e-05, + "loss": 0.092935448884964, + "step": 32840 + }, + { + 
"epoch": 4.662881476224273, + "grad_norm": 0.24282492697238922, + "learning_rate": 9.533924769339958e-05, + "loss": 0.046061572432518, + "step": 32850 + }, + { + "epoch": 4.66430092264017, + "grad_norm": 6.648067474365234, + "learning_rate": 9.533782824698369e-05, + "loss": 0.039595258235931394, + "step": 32860 + }, + { + "epoch": 4.665720369056068, + "grad_norm": 0.3748902678489685, + "learning_rate": 9.533640880056778e-05, + "loss": 0.06567533612251282, + "step": 32870 + }, + { + "epoch": 4.667139815471966, + "grad_norm": 1.109091877937317, + "learning_rate": 9.533498935415189e-05, + "loss": 0.07075968980789185, + "step": 32880 + }, + { + "epoch": 4.6685592618878635, + "grad_norm": 0.4307088255882263, + "learning_rate": 9.533356990773598e-05, + "loss": 0.03807471692562103, + "step": 32890 + }, + { + "epoch": 4.669978708303762, + "grad_norm": 11.682064056396484, + "learning_rate": 9.53321504613201e-05, + "loss": 0.06912864446640014, + "step": 32900 + }, + { + "epoch": 4.671398154719659, + "grad_norm": 4.182003974914551, + "learning_rate": 9.533073101490419e-05, + "loss": 0.04695388674736023, + "step": 32910 + }, + { + "epoch": 4.6728176011355576, + "grad_norm": 12.598423957824707, + "learning_rate": 9.532931156848829e-05, + "loss": 0.1024010419845581, + "step": 32920 + }, + { + "epoch": 4.674237047551455, + "grad_norm": 1.3875524997711182, + "learning_rate": 9.532789212207239e-05, + "loss": 0.06368948221206665, + "step": 32930 + }, + { + "epoch": 4.675656493967352, + "grad_norm": 2.8007404804229736, + "learning_rate": 9.53264726756565e-05, + "loss": 0.1000246286392212, + "step": 32940 + }, + { + "epoch": 4.677075940383251, + "grad_norm": 1.2782186269760132, + "learning_rate": 9.532505322924061e-05, + "loss": 0.10685174465179444, + "step": 32950 + }, + { + "epoch": 4.678495386799148, + "grad_norm": 1.8826717138290405, + "learning_rate": 9.53236337828247e-05, + "loss": 0.08655181527137756, + "step": 32960 + }, + { + "epoch": 4.679914833215046, + "grad_norm": 
4.107776641845703, + "learning_rate": 9.532221433640882e-05, + "loss": 0.0805110514163971, + "step": 32970 + }, + { + "epoch": 4.681334279630944, + "grad_norm": 5.16588830947876, + "learning_rate": 9.53207948899929e-05, + "loss": 0.0815092146396637, + "step": 32980 + }, + { + "epoch": 4.682753726046842, + "grad_norm": 2.872464179992676, + "learning_rate": 9.531937544357701e-05, + "loss": 0.03411953449249268, + "step": 32990 + }, + { + "epoch": 4.68417317246274, + "grad_norm": 0.06958237290382385, + "learning_rate": 9.531795599716111e-05, + "loss": 0.02626045048236847, + "step": 33000 + }, + { + "epoch": 4.68417317246274, + "eval_accuracy": 0.9627392382526865, + "eval_loss": 0.1083284318447113, + "eval_runtime": 36.6163, + "eval_samples_per_second": 429.509, + "eval_steps_per_second": 13.437, + "step": 33000 + }, + { + "epoch": 4.685592618878637, + "grad_norm": 3.2690742015838623, + "learning_rate": 9.531653655074522e-05, + "loss": 0.08817251324653626, + "step": 33010 + }, + { + "epoch": 4.687012065294535, + "grad_norm": 5.778581619262695, + "learning_rate": 9.531511710432932e-05, + "loss": 0.0841533899307251, + "step": 33020 + }, + { + "epoch": 4.688431511710433, + "grad_norm": 0.5700046420097351, + "learning_rate": 9.531369765791341e-05, + "loss": 0.08917995095252991, + "step": 33030 + }, + { + "epoch": 4.689850958126331, + "grad_norm": 1.3010187149047852, + "learning_rate": 9.531227821149753e-05, + "loss": 0.03966775238513946, + "step": 33040 + }, + { + "epoch": 4.691270404542228, + "grad_norm": 1.8215919733047485, + "learning_rate": 9.531085876508162e-05, + "loss": 0.029767876863479613, + "step": 33050 + }, + { + "epoch": 4.692689850958127, + "grad_norm": 7.450717926025391, + "learning_rate": 9.530943931866573e-05, + "loss": 0.07962870597839355, + "step": 33060 + }, + { + "epoch": 4.694109297374024, + "grad_norm": 4.49182653427124, + "learning_rate": 9.530801987224983e-05, + "loss": 0.09771084785461426, + "step": 33070 + }, + { + "epoch": 4.695528743789922, + 
"grad_norm": 2.3871476650238037, + "learning_rate": 9.530660042583393e-05, + "loss": 0.03243844509124756, + "step": 33080 + }, + { + "epoch": 4.69694819020582, + "grad_norm": 6.932815074920654, + "learning_rate": 9.530518097941803e-05, + "loss": 0.10589628219604492, + "step": 33090 + }, + { + "epoch": 4.698367636621717, + "grad_norm": 5.599147796630859, + "learning_rate": 9.530376153300214e-05, + "loss": 0.0905950427055359, + "step": 33100 + }, + { + "epoch": 4.699787083037616, + "grad_norm": 4.64316463470459, + "learning_rate": 9.530234208658623e-05, + "loss": 0.06359080076217652, + "step": 33110 + }, + { + "epoch": 4.701206529453513, + "grad_norm": 6.096071243286133, + "learning_rate": 9.530092264017035e-05, + "loss": 0.048346295952796936, + "step": 33120 + }, + { + "epoch": 4.702625975869411, + "grad_norm": 1.707812786102295, + "learning_rate": 9.529950319375444e-05, + "loss": 0.044125860929489134, + "step": 33130 + }, + { + "epoch": 4.704045422285309, + "grad_norm": 2.5328729152679443, + "learning_rate": 9.529808374733854e-05, + "loss": 0.0620121955871582, + "step": 33140 + }, + { + "epoch": 4.705464868701206, + "grad_norm": 2.9145994186401367, + "learning_rate": 9.529666430092265e-05, + "loss": 0.047010570764541626, + "step": 33150 + }, + { + "epoch": 4.7068843151171045, + "grad_norm": 4.588191509246826, + "learning_rate": 9.529524485450675e-05, + "loss": 0.037869596481323244, + "step": 33160 + }, + { + "epoch": 4.708303761533002, + "grad_norm": 8.48548698425293, + "learning_rate": 9.529382540809086e-05, + "loss": 0.08703064918518066, + "step": 33170 + }, + { + "epoch": 4.7097232079489, + "grad_norm": 0.5679498910903931, + "learning_rate": 9.529240596167494e-05, + "loss": 0.0526730477809906, + "step": 33180 + }, + { + "epoch": 4.711142654364798, + "grad_norm": 3.060013771057129, + "learning_rate": 9.529098651525905e-05, + "loss": 0.08811714053153992, + "step": 33190 + }, + { + "epoch": 4.712562100780696, + "grad_norm": 0.47054776549339294, + "learning_rate": 
9.528956706884315e-05, + "loss": 0.06622061729431153, + "step": 33200 + }, + { + "epoch": 4.713981547196593, + "grad_norm": 1.364973783493042, + "learning_rate": 9.528814762242726e-05, + "loss": 0.03104795217514038, + "step": 33210 + }, + { + "epoch": 4.715400993612491, + "grad_norm": 1.5248595476150513, + "learning_rate": 9.528672817601136e-05, + "loss": 0.06729813814163207, + "step": 33220 + }, + { + "epoch": 4.716820440028389, + "grad_norm": 0.41258639097213745, + "learning_rate": 9.528530872959546e-05, + "loss": 0.05668038725852966, + "step": 33230 + }, + { + "epoch": 4.7182398864442865, + "grad_norm": 3.3833577632904053, + "learning_rate": 9.528388928317957e-05, + "loss": 0.0714464783668518, + "step": 33240 + }, + { + "epoch": 4.719659332860185, + "grad_norm": 5.934215545654297, + "learning_rate": 9.528246983676367e-05, + "loss": 0.06380202770233154, + "step": 33250 + }, + { + "epoch": 4.721078779276082, + "grad_norm": 0.9173101186752319, + "learning_rate": 9.528119233498935e-05, + "loss": 0.064630788564682, + "step": 33260 + }, + { + "epoch": 4.7224982256919805, + "grad_norm": 4.4683709144592285, + "learning_rate": 9.527977288857346e-05, + "loss": 0.030341708660125734, + "step": 33270 + }, + { + "epoch": 4.723917672107878, + "grad_norm": 4.554975509643555, + "learning_rate": 9.527835344215756e-05, + "loss": 0.058397090435028075, + "step": 33280 + }, + { + "epoch": 4.725337118523775, + "grad_norm": 2.262747049331665, + "learning_rate": 9.527693399574167e-05, + "loss": 0.06394088268280029, + "step": 33290 + }, + { + "epoch": 4.726756564939674, + "grad_norm": 8.058440208435059, + "learning_rate": 9.527551454932577e-05, + "loss": 0.05340535044670105, + "step": 33300 + }, + { + "epoch": 4.728176011355571, + "grad_norm": 0.3765454888343811, + "learning_rate": 9.527409510290986e-05, + "loss": 0.0275752991437912, + "step": 33310 + }, + { + "epoch": 4.729595457771469, + "grad_norm": 5.589868545532227, + "learning_rate": 9.527267565649398e-05, + "loss": 
0.08953121900558472, + "step": 33320 + }, + { + "epoch": 4.731014904187367, + "grad_norm": 7.95912504196167, + "learning_rate": 9.527125621007807e-05, + "loss": 0.06377414464950562, + "step": 33330 + }, + { + "epoch": 4.732434350603265, + "grad_norm": 1.7751505374908447, + "learning_rate": 9.526983676366218e-05, + "loss": 0.12168240547180176, + "step": 33340 + }, + { + "epoch": 4.7338537970191625, + "grad_norm": 0.1998690515756607, + "learning_rate": 9.526841731724628e-05, + "loss": 0.07634644508361817, + "step": 33350 + }, + { + "epoch": 4.73527324343506, + "grad_norm": 6.665310382843018, + "learning_rate": 9.526699787083038e-05, + "loss": 0.0845515251159668, + "step": 33360 + }, + { + "epoch": 4.736692689850958, + "grad_norm": 6.774913311004639, + "learning_rate": 9.526557842441448e-05, + "loss": 0.07210742831230163, + "step": 33370 + }, + { + "epoch": 4.738112136266856, + "grad_norm": 10.878942489624023, + "learning_rate": 9.526415897799859e-05, + "loss": 0.09000579714775085, + "step": 33380 + }, + { + "epoch": 4.739531582682754, + "grad_norm": 11.169490814208984, + "learning_rate": 9.526273953158268e-05, + "loss": 0.09856179356575012, + "step": 33390 + }, + { + "epoch": 4.740951029098651, + "grad_norm": 3.1584696769714355, + "learning_rate": 9.52613200851668e-05, + "loss": 0.06054343581199646, + "step": 33400 + }, + { + "epoch": 4.74237047551455, + "grad_norm": 2.1179540157318115, + "learning_rate": 9.525990063875089e-05, + "loss": 0.08376701474189759, + "step": 33410 + }, + { + "epoch": 4.743789921930447, + "grad_norm": 7.283944606781006, + "learning_rate": 9.525848119233499e-05, + "loss": 0.0680974006652832, + "step": 33420 + }, + { + "epoch": 4.7452093683463445, + "grad_norm": 1.1434763669967651, + "learning_rate": 9.52570617459191e-05, + "loss": 0.04679420590400696, + "step": 33430 + }, + { + "epoch": 4.746628814762243, + "grad_norm": 2.1488308906555176, + "learning_rate": 9.52556422995032e-05, + "loss": 0.02475724071264267, + "step": 33440 + }, + { + 
"epoch": 4.74804826117814, + "grad_norm": 1.1518933773040771, + "learning_rate": 9.525422285308731e-05, + "loss": 0.0383320152759552, + "step": 33450 + }, + { + "epoch": 4.749467707594039, + "grad_norm": 3.2653729915618896, + "learning_rate": 9.525280340667139e-05, + "loss": 0.04667494595050812, + "step": 33460 + }, + { + "epoch": 4.750887154009936, + "grad_norm": 4.017123699188232, + "learning_rate": 9.52513839602555e-05, + "loss": 0.05600497722625732, + "step": 33470 + }, + { + "epoch": 4.752306600425834, + "grad_norm": 7.016024589538574, + "learning_rate": 9.52499645138396e-05, + "loss": 0.08545212745666504, + "step": 33480 + }, + { + "epoch": 4.753726046841732, + "grad_norm": 3.9017140865325928, + "learning_rate": 9.524854506742371e-05, + "loss": 0.042384487390518186, + "step": 33490 + }, + { + "epoch": 4.755145493257629, + "grad_norm": 4.046632289886475, + "learning_rate": 9.524712562100781e-05, + "loss": 0.10414813756942749, + "step": 33500 + }, + { + "epoch": 4.755145493257629, + "eval_accuracy": 0.9612767851465632, + "eval_loss": 0.12837225198745728, + "eval_runtime": 35.9316, + "eval_samples_per_second": 437.693, + "eval_steps_per_second": 13.693, + "step": 33500 + }, + { + "epoch": 4.7565649396735274, + "grad_norm": 4.732762813568115, + "learning_rate": 9.52457061745919e-05, + "loss": 0.07615302801132202, + "step": 33510 + }, + { + "epoch": 4.757984386089425, + "grad_norm": 1.0631474256515503, + "learning_rate": 9.524428672817602e-05, + "loss": 0.06459469199180604, + "step": 33520 + }, + { + "epoch": 4.759403832505323, + "grad_norm": 10.640356063842773, + "learning_rate": 9.524286728176011e-05, + "loss": 0.12269865274429322, + "step": 33530 + }, + { + "epoch": 4.760823278921221, + "grad_norm": 4.477070331573486, + "learning_rate": 9.524144783534423e-05, + "loss": 0.0742661714553833, + "step": 33540 + }, + { + "epoch": 4.762242725337119, + "grad_norm": 2.8761069774627686, + "learning_rate": 9.524002838892832e-05, + "loss": 0.07635018825531006, + "step": 
33550 + }, + { + "epoch": 4.763662171753016, + "grad_norm": 2.838202476501465, + "learning_rate": 9.523860894251242e-05, + "loss": 0.030411535501480104, + "step": 33560 + }, + { + "epoch": 4.765081618168914, + "grad_norm": 5.472594261169434, + "learning_rate": 9.523718949609652e-05, + "loss": 0.07412365674972535, + "step": 33570 + }, + { + "epoch": 4.766501064584812, + "grad_norm": 5.381545543670654, + "learning_rate": 9.523577004968063e-05, + "loss": 0.059850108623504636, + "step": 33580 + }, + { + "epoch": 4.7679205110007095, + "grad_norm": 3.6640396118164062, + "learning_rate": 9.523435060326473e-05, + "loss": 0.08601389527320862, + "step": 33590 + }, + { + "epoch": 4.769339957416608, + "grad_norm": 1.533301830291748, + "learning_rate": 9.523293115684884e-05, + "loss": 0.05381173491477966, + "step": 33600 + }, + { + "epoch": 4.770759403832505, + "grad_norm": 4.691380023956299, + "learning_rate": 9.523151171043293e-05, + "loss": 0.08660387992858887, + "step": 33610 + }, + { + "epoch": 4.7721788502484035, + "grad_norm": 0.0661584809422493, + "learning_rate": 9.523009226401703e-05, + "loss": 0.0433504581451416, + "step": 33620 + }, + { + "epoch": 4.773598296664301, + "grad_norm": 5.291693687438965, + "learning_rate": 9.522867281760114e-05, + "loss": 0.08284536600112916, + "step": 33630 + }, + { + "epoch": 4.775017743080198, + "grad_norm": 4.733127117156982, + "learning_rate": 9.522725337118524e-05, + "loss": 0.08289542198181152, + "step": 33640 + }, + { + "epoch": 4.776437189496097, + "grad_norm": 10.965649604797363, + "learning_rate": 9.522583392476935e-05, + "loss": 0.09481858015060425, + "step": 33650 + }, + { + "epoch": 4.777856635911994, + "grad_norm": 1.8173820972442627, + "learning_rate": 9.522441447835345e-05, + "loss": 0.06447114944458007, + "step": 33660 + }, + { + "epoch": 4.779276082327892, + "grad_norm": 9.157655715942383, + "learning_rate": 9.522299503193755e-05, + "loss": 0.10042716264724731, + "step": 33670 + }, + { + "epoch": 4.78069552874379, + 
"grad_norm": 1.8306244611740112, + "learning_rate": 9.522157558552164e-05, + "loss": 0.06390793323516845, + "step": 33680 + }, + { + "epoch": 4.782114975159688, + "grad_norm": 8.975295066833496, + "learning_rate": 9.522015613910575e-05, + "loss": 0.07697048783302307, + "step": 33690 + }, + { + "epoch": 4.7835344215755855, + "grad_norm": 6.4800543785095215, + "learning_rate": 9.521873669268985e-05, + "loss": 0.04765602946281433, + "step": 33700 + }, + { + "epoch": 4.784953867991483, + "grad_norm": 0.37081560492515564, + "learning_rate": 9.521731724627396e-05, + "loss": 0.055331355333328246, + "step": 33710 + }, + { + "epoch": 4.786373314407381, + "grad_norm": 0.17576156556606293, + "learning_rate": 9.521589779985806e-05, + "loss": 0.042742720246315, + "step": 33720 + }, + { + "epoch": 4.787792760823279, + "grad_norm": 5.604384422302246, + "learning_rate": 9.521447835344216e-05, + "loss": 0.1510116219520569, + "step": 33730 + }, + { + "epoch": 4.789212207239177, + "grad_norm": 1.1139140129089355, + "learning_rate": 9.521305890702627e-05, + "loss": 0.08178800940513611, + "step": 33740 + }, + { + "epoch": 4.790631653655074, + "grad_norm": 6.628493785858154, + "learning_rate": 9.521163946061037e-05, + "loss": 0.105251944065094, + "step": 33750 + }, + { + "epoch": 4.792051100070973, + "grad_norm": 6.405097484588623, + "learning_rate": 9.521022001419448e-05, + "loss": 0.03591142892837525, + "step": 33760 + }, + { + "epoch": 4.79347054648687, + "grad_norm": 1.633231282234192, + "learning_rate": 9.520880056777856e-05, + "loss": 0.04583961963653564, + "step": 33770 + }, + { + "epoch": 4.7948899929027675, + "grad_norm": 2.0780718326568604, + "learning_rate": 9.520738112136267e-05, + "loss": 0.06329174041748047, + "step": 33780 + }, + { + "epoch": 4.796309439318666, + "grad_norm": 7.537708759307861, + "learning_rate": 9.520596167494677e-05, + "loss": 0.07685714960098267, + "step": 33790 + }, + { + "epoch": 4.797728885734563, + "grad_norm": 0.6330421566963196, + 
"learning_rate": 9.520454222853088e-05, + "loss": 0.025817760825157167, + "step": 33800 + }, + { + "epoch": 4.7991483321504615, + "grad_norm": 1.9811617136001587, + "learning_rate": 9.520312278211499e-05, + "loss": 0.12827333211898803, + "step": 33810 + }, + { + "epoch": 4.800567778566359, + "grad_norm": 0.6081200838088989, + "learning_rate": 9.520170333569907e-05, + "loss": 0.033173537254333495, + "step": 33820 + }, + { + "epoch": 4.801987224982257, + "grad_norm": 4.308277130126953, + "learning_rate": 9.520028388928319e-05, + "loss": 0.03841128945350647, + "step": 33830 + }, + { + "epoch": 4.803406671398155, + "grad_norm": 3.689964532852173, + "learning_rate": 9.519886444286728e-05, + "loss": 0.06301182508468628, + "step": 33840 + }, + { + "epoch": 4.804826117814052, + "grad_norm": 3.0603621006011963, + "learning_rate": 9.51974449964514e-05, + "loss": 0.029316383600234985, + "step": 33850 + }, + { + "epoch": 4.80624556422995, + "grad_norm": 0.7296545505523682, + "learning_rate": 9.519602555003549e-05, + "loss": 0.07017461061477662, + "step": 33860 + }, + { + "epoch": 4.807665010645848, + "grad_norm": 0.6221994161605835, + "learning_rate": 9.519460610361959e-05, + "loss": 0.06105865836143494, + "step": 33870 + }, + { + "epoch": 4.809084457061746, + "grad_norm": 0.613722026348114, + "learning_rate": 9.519318665720369e-05, + "loss": 0.04575749039649964, + "step": 33880 + }, + { + "epoch": 4.810503903477644, + "grad_norm": 0.2880522310733795, + "learning_rate": 9.51917672107878e-05, + "loss": 0.035099685192108154, + "step": 33890 + }, + { + "epoch": 4.811923349893542, + "grad_norm": 0.7631796598434448, + "learning_rate": 9.519034776437191e-05, + "loss": 0.09773088097572327, + "step": 33900 + }, + { + "epoch": 4.813342796309439, + "grad_norm": 5.633825778961182, + "learning_rate": 9.5188928317956e-05, + "loss": 0.07228602170944214, + "step": 33910 + }, + { + "epoch": 4.814762242725337, + "grad_norm": 1.5585920810699463, + "learning_rate": 9.51875088715401e-05, + 
"loss": 0.04053950607776642, + "step": 33920 + }, + { + "epoch": 4.816181689141235, + "grad_norm": 1.0823962688446045, + "learning_rate": 9.51860894251242e-05, + "loss": 0.04932558238506317, + "step": 33930 + }, + { + "epoch": 4.817601135557132, + "grad_norm": 7.6794586181640625, + "learning_rate": 9.518466997870831e-05, + "loss": 0.049499303102493286, + "step": 33940 + }, + { + "epoch": 4.819020581973031, + "grad_norm": 2.137556314468384, + "learning_rate": 9.518325053229241e-05, + "loss": 0.048341837525367734, + "step": 33950 + }, + { + "epoch": 4.820440028388928, + "grad_norm": 3.8483757972717285, + "learning_rate": 9.518183108587652e-05, + "loss": 0.060046988725662234, + "step": 33960 + }, + { + "epoch": 4.8218594748048265, + "grad_norm": 6.301821708679199, + "learning_rate": 9.518041163946062e-05, + "loss": 0.10900142192840576, + "step": 33970 + }, + { + "epoch": 4.823278921220724, + "grad_norm": 8.898664474487305, + "learning_rate": 9.517899219304471e-05, + "loss": 0.05514363646507263, + "step": 33980 + }, + { + "epoch": 4.824698367636621, + "grad_norm": 12.41129207611084, + "learning_rate": 9.517757274662882e-05, + "loss": 0.12840945720672609, + "step": 33990 + }, + { + "epoch": 4.82611781405252, + "grad_norm": 2.5299203395843506, + "learning_rate": 9.517615330021292e-05, + "loss": 0.05092512369155884, + "step": 34000 + }, + { + "epoch": 4.82611781405252, + "eval_accuracy": 0.9595599923698099, + "eval_loss": 0.12192901968955994, + "eval_runtime": 35.0097, + "eval_samples_per_second": 449.218, + "eval_steps_per_second": 14.053, + "step": 34000 + }, + { + "epoch": 4.827537260468417, + "grad_norm": 0.23498234152793884, + "learning_rate": 9.517473385379703e-05, + "loss": 0.07249274253845214, + "step": 34010 + }, + { + "epoch": 4.828956706884315, + "grad_norm": 3.2055423259735107, + "learning_rate": 9.517331440738113e-05, + "loss": 0.05897048711776733, + "step": 34020 + }, + { + "epoch": 4.830376153300213, + "grad_norm": 1.8760758638381958, + "learning_rate": 
9.517189496096523e-05, + "loss": 0.05109198689460755, + "step": 34030 + }, + { + "epoch": 4.831795599716111, + "grad_norm": 6.842248916625977, + "learning_rate": 9.517047551454932e-05, + "loss": 0.07260550260543823, + "step": 34040 + }, + { + "epoch": 4.8332150461320085, + "grad_norm": 3.05704927444458, + "learning_rate": 9.516905606813344e-05, + "loss": 0.06829613447189331, + "step": 34050 + }, + { + "epoch": 4.834634492547906, + "grad_norm": 9.268196105957031, + "learning_rate": 9.516763662171753e-05, + "loss": 0.10247148275375366, + "step": 34060 + }, + { + "epoch": 4.836053938963804, + "grad_norm": 8.238924980163574, + "learning_rate": 9.516621717530164e-05, + "loss": 0.09007470607757569, + "step": 34070 + }, + { + "epoch": 4.837473385379702, + "grad_norm": 7.473556995391846, + "learning_rate": 9.516479772888574e-05, + "loss": 0.09785959720611573, + "step": 34080 + }, + { + "epoch": 4.8388928317956, + "grad_norm": 6.157594680786133, + "learning_rate": 9.516337828246984e-05, + "loss": 0.06533307433128357, + "step": 34090 + }, + { + "epoch": 4.840312278211497, + "grad_norm": 7.398861885070801, + "learning_rate": 9.516195883605395e-05, + "loss": 0.03795735538005829, + "step": 34100 + }, + { + "epoch": 4.841731724627396, + "grad_norm": 4.990109920501709, + "learning_rate": 9.516053938963805e-05, + "loss": 0.061510467529296876, + "step": 34110 + }, + { + "epoch": 4.843151171043293, + "grad_norm": 6.985716819763184, + "learning_rate": 9.515911994322216e-05, + "loss": 0.08015224933624268, + "step": 34120 + }, + { + "epoch": 4.8445706174591905, + "grad_norm": 0.4228348135948181, + "learning_rate": 9.515770049680624e-05, + "loss": 0.07228795886039734, + "step": 34130 + }, + { + "epoch": 4.845990063875089, + "grad_norm": 2.4188504219055176, + "learning_rate": 9.515628105039035e-05, + "loss": 0.038450679183006285, + "step": 34140 + }, + { + "epoch": 4.847409510290986, + "grad_norm": 0.691714882850647, + "learning_rate": 9.515486160397445e-05, + "loss": 
0.04819165468215943, + "step": 34150 + }, + { + "epoch": 4.8488289567068845, + "grad_norm": 4.017334938049316, + "learning_rate": 9.515344215755856e-05, + "loss": 0.03329123556613922, + "step": 34160 + }, + { + "epoch": 4.850248403122782, + "grad_norm": 5.272017478942871, + "learning_rate": 9.515202271114266e-05, + "loss": 0.07376419901847839, + "step": 34170 + }, + { + "epoch": 4.85166784953868, + "grad_norm": 4.051192760467529, + "learning_rate": 9.515060326472676e-05, + "loss": 0.06695409417152405, + "step": 34180 + }, + { + "epoch": 4.853087295954578, + "grad_norm": 1.7410699129104614, + "learning_rate": 9.514918381831087e-05, + "loss": 0.1089176893234253, + "step": 34190 + }, + { + "epoch": 4.854506742370475, + "grad_norm": 1.8117938041687012, + "learning_rate": 9.514776437189496e-05, + "loss": 0.10838677883148193, + "step": 34200 + }, + { + "epoch": 4.855926188786373, + "grad_norm": 2.6073532104492188, + "learning_rate": 9.514634492547908e-05, + "loss": 0.05355033278465271, + "step": 34210 + }, + { + "epoch": 4.857345635202271, + "grad_norm": 5.410396099090576, + "learning_rate": 9.514492547906317e-05, + "loss": 0.06793102025985717, + "step": 34220 + }, + { + "epoch": 4.858765081618169, + "grad_norm": 6.987936019897461, + "learning_rate": 9.514350603264727e-05, + "loss": 0.07279337644577026, + "step": 34230 + }, + { + "epoch": 4.8601845280340665, + "grad_norm": 8.075188636779785, + "learning_rate": 9.514208658623137e-05, + "loss": 0.040511229634284975, + "step": 34240 + }, + { + "epoch": 4.861603974449965, + "grad_norm": 1.9225119352340698, + "learning_rate": 9.514066713981548e-05, + "loss": 0.0797190546989441, + "step": 34250 + }, + { + "epoch": 4.863023420865862, + "grad_norm": 4.796272277832031, + "learning_rate": 9.513924769339958e-05, + "loss": 0.03872755765914917, + "step": 34260 + }, + { + "epoch": 4.86444286728176, + "grad_norm": 12.368243217468262, + "learning_rate": 9.513782824698369e-05, + "loss": 0.05745645761489868, + "step": 34270 + }, + { + 
"epoch": 4.865862313697658, + "grad_norm": 0.11512895673513412, + "learning_rate": 9.513640880056778e-05, + "loss": 0.040886858105659486, + "step": 34280 + }, + { + "epoch": 4.867281760113555, + "grad_norm": 8.322936058044434, + "learning_rate": 9.513498935415188e-05, + "loss": 0.0884483814239502, + "step": 34290 + }, + { + "epoch": 4.868701206529454, + "grad_norm": 7.160366535186768, + "learning_rate": 9.513356990773599e-05, + "loss": 0.09933829307556152, + "step": 34300 + }, + { + "epoch": 4.870120652945351, + "grad_norm": 0.4381280243396759, + "learning_rate": 9.513215046132009e-05, + "loss": 0.049053031206130984, + "step": 34310 + }, + { + "epoch": 4.871540099361249, + "grad_norm": 3.679687738418579, + "learning_rate": 9.51307310149042e-05, + "loss": 0.08237858414649964, + "step": 34320 + }, + { + "epoch": 4.872959545777147, + "grad_norm": 0.3392900228500366, + "learning_rate": 9.51293115684883e-05, + "loss": 0.07077882885932922, + "step": 34330 + }, + { + "epoch": 4.874378992193044, + "grad_norm": 14.285609245300293, + "learning_rate": 9.51278921220724e-05, + "loss": 0.13571418523788453, + "step": 34340 + }, + { + "epoch": 4.875798438608943, + "grad_norm": 5.558465957641602, + "learning_rate": 9.512647267565649e-05, + "loss": 0.11963750123977661, + "step": 34350 + }, + { + "epoch": 4.87721788502484, + "grad_norm": 5.402983665466309, + "learning_rate": 9.51250532292406e-05, + "loss": 0.06695018410682678, + "step": 34360 + }, + { + "epoch": 4.878637331440738, + "grad_norm": 4.293505668640137, + "learning_rate": 9.51236337828247e-05, + "loss": 0.04254850745201111, + "step": 34370 + }, + { + "epoch": 4.880056777856636, + "grad_norm": 6.603433609008789, + "learning_rate": 9.512221433640881e-05, + "loss": 0.06627193689346314, + "step": 34380 + }, + { + "epoch": 4.881476224272534, + "grad_norm": 2.177635431289673, + "learning_rate": 9.512079488999291e-05, + "loss": 0.03508804738521576, + "step": 34390 + }, + { + "epoch": 4.8828956706884314, + "grad_norm": 
0.21414713561534882, + "learning_rate": 9.5119375443577e-05, + "loss": 0.07482952475547791, + "step": 34400 + }, + { + "epoch": 4.884315117104329, + "grad_norm": 1.777644157409668, + "learning_rate": 9.511795599716112e-05, + "loss": 0.03625571131706238, + "step": 34410 + }, + { + "epoch": 4.885734563520227, + "grad_norm": 1.2347965240478516, + "learning_rate": 9.511653655074521e-05, + "loss": 0.04854299426078797, + "step": 34420 + }, + { + "epoch": 4.887154009936125, + "grad_norm": 1.3775534629821777, + "learning_rate": 9.511511710432933e-05, + "loss": 0.03984416425228119, + "step": 34430 + }, + { + "epoch": 4.888573456352023, + "grad_norm": 2.4557316303253174, + "learning_rate": 9.511369765791341e-05, + "loss": 0.0749228298664093, + "step": 34440 + }, + { + "epoch": 4.88999290276792, + "grad_norm": 9.009647369384766, + "learning_rate": 9.511227821149752e-05, + "loss": 0.09757251739501953, + "step": 34450 + }, + { + "epoch": 4.891412349183819, + "grad_norm": 0.7110661864280701, + "learning_rate": 9.511085876508162e-05, + "loss": 0.08272533416748047, + "step": 34460 + }, + { + "epoch": 4.892831795599716, + "grad_norm": 2.4361939430236816, + "learning_rate": 9.510943931866573e-05, + "loss": 0.042311400175094604, + "step": 34470 + }, + { + "epoch": 4.8942512420156135, + "grad_norm": 0.9503983855247498, + "learning_rate": 9.510801987224983e-05, + "loss": 0.026289787888526917, + "step": 34480 + }, + { + "epoch": 4.895670688431512, + "grad_norm": 0.559457004070282, + "learning_rate": 9.510660042583392e-05, + "loss": 0.028341618180274964, + "step": 34490 + }, + { + "epoch": 4.897090134847409, + "grad_norm": 9.252950668334961, + "learning_rate": 9.510518097941803e-05, + "loss": 0.11836056709289551, + "step": 34500 + }, + { + "epoch": 4.897090134847409, + "eval_accuracy": 0.9775545240668914, + "eval_loss": 0.06881918758153915, + "eval_runtime": 34.9972, + "eval_samples_per_second": 449.378, + "eval_steps_per_second": 14.058, + "step": 34500 + }, + { + "epoch": 
4.8985095812633075, + "grad_norm": 1.470131516456604, + "learning_rate": 9.510376153300213e-05, + "loss": 0.06065631508827209, + "step": 34510 + }, + { + "epoch": 4.899929027679205, + "grad_norm": 0.8848506212234497, + "learning_rate": 9.510234208658624e-05, + "loss": 0.10929383039474487, + "step": 34520 + }, + { + "epoch": 4.901348474095103, + "grad_norm": 1.5781108140945435, + "learning_rate": 9.510092264017034e-05, + "loss": 0.055705088376998904, + "step": 34530 + }, + { + "epoch": 4.902767920511001, + "grad_norm": 4.346358299255371, + "learning_rate": 9.509950319375444e-05, + "loss": 0.040107375383377074, + "step": 34540 + }, + { + "epoch": 4.904187366926898, + "grad_norm": 3.9102401733398438, + "learning_rate": 9.509808374733854e-05, + "loss": 0.07414058446884156, + "step": 34550 + }, + { + "epoch": 4.905606813342796, + "grad_norm": 1.8277835845947266, + "learning_rate": 9.509666430092265e-05, + "loss": 0.04326414465904236, + "step": 34560 + }, + { + "epoch": 4.907026259758694, + "grad_norm": 5.239001750946045, + "learning_rate": 9.509524485450674e-05, + "loss": 0.07349801659584046, + "step": 34570 + }, + { + "epoch": 4.908445706174592, + "grad_norm": 0.15750083327293396, + "learning_rate": 9.509382540809085e-05, + "loss": 0.06307926177978515, + "step": 34580 + }, + { + "epoch": 4.9098651525904895, + "grad_norm": 1.198663353919983, + "learning_rate": 9.509240596167495e-05, + "loss": 0.05986272692680359, + "step": 34590 + }, + { + "epoch": 4.911284599006388, + "grad_norm": 0.8945283889770508, + "learning_rate": 9.509098651525905e-05, + "loss": 0.050188446044921876, + "step": 34600 + }, + { + "epoch": 4.912704045422285, + "grad_norm": 5.8996124267578125, + "learning_rate": 9.508956706884316e-05, + "loss": 0.12322105169296264, + "step": 34610 + }, + { + "epoch": 4.914123491838183, + "grad_norm": 4.365360260009766, + "learning_rate": 9.508814762242726e-05, + "loss": 0.06297153234481812, + "step": 34620 + }, + { + "epoch": 4.915542938254081, + "grad_norm": 
11.717504501342773, + "learning_rate": 9.508672817601137e-05, + "loss": 0.0508009672164917, + "step": 34630 + }, + { + "epoch": 4.916962384669978, + "grad_norm": 5.38115930557251, + "learning_rate": 9.508530872959545e-05, + "loss": 0.04837482571601868, + "step": 34640 + }, + { + "epoch": 4.918381831085877, + "grad_norm": 7.246404647827148, + "learning_rate": 9.508388928317956e-05, + "loss": 0.05574921369552612, + "step": 34650 + }, + { + "epoch": 4.919801277501774, + "grad_norm": 1.3808271884918213, + "learning_rate": 9.508246983676366e-05, + "loss": 0.030999797582626342, + "step": 34660 + }, + { + "epoch": 4.921220723917672, + "grad_norm": 12.624173164367676, + "learning_rate": 9.508105039034777e-05, + "loss": 0.0817840576171875, + "step": 34670 + }, + { + "epoch": 4.92264017033357, + "grad_norm": 4.791901111602783, + "learning_rate": 9.507963094393187e-05, + "loss": 0.1045034408569336, + "step": 34680 + }, + { + "epoch": 4.924059616749467, + "grad_norm": 1.011415719985962, + "learning_rate": 9.507821149751598e-05, + "loss": 0.05733692049980164, + "step": 34690 + }, + { + "epoch": 4.9254790631653655, + "grad_norm": 3.5056240558624268, + "learning_rate": 9.507679205110008e-05, + "loss": 0.03916033804416656, + "step": 34700 + }, + { + "epoch": 4.926898509581263, + "grad_norm": 3.432095527648926, + "learning_rate": 9.507537260468417e-05, + "loss": 0.05065256357192993, + "step": 34710 + }, + { + "epoch": 4.928317955997161, + "grad_norm": 5.719847679138184, + "learning_rate": 9.507395315826829e-05, + "loss": 0.03222036361694336, + "step": 34720 + }, + { + "epoch": 4.929737402413059, + "grad_norm": 8.551188468933105, + "learning_rate": 9.507253371185238e-05, + "loss": 0.12375338077545166, + "step": 34730 + }, + { + "epoch": 4.931156848828957, + "grad_norm": 7.09042501449585, + "learning_rate": 9.50711142654365e-05, + "loss": 0.06138598918914795, + "step": 34740 + }, + { + "epoch": 4.932576295244854, + "grad_norm": 13.028380393981934, + "learning_rate": 
9.506969481902058e-05, + "loss": 0.09170422554016114, + "step": 34750 + }, + { + "epoch": 4.933995741660752, + "grad_norm": 0.827090859413147, + "learning_rate": 9.506827537260469e-05, + "loss": 0.07565316557884216, + "step": 34760 + }, + { + "epoch": 4.93541518807665, + "grad_norm": 8.198225021362305, + "learning_rate": 9.506685592618879e-05, + "loss": 0.06974472999572753, + "step": 34770 + }, + { + "epoch": 4.936834634492548, + "grad_norm": 4.147625923156738, + "learning_rate": 9.50654364797729e-05, + "loss": 0.04362180233001709, + "step": 34780 + }, + { + "epoch": 4.938254080908446, + "grad_norm": 0.3720574975013733, + "learning_rate": 9.5064017033357e-05, + "loss": 0.034629127383232115, + "step": 34790 + }, + { + "epoch": 4.939673527324343, + "grad_norm": 0.04120853543281555, + "learning_rate": 9.506259758694109e-05, + "loss": 0.08741816282272338, + "step": 34800 + }, + { + "epoch": 4.941092973740242, + "grad_norm": 2.3259832859039307, + "learning_rate": 9.50611781405252e-05, + "loss": 0.04545274972915649, + "step": 34810 + }, + { + "epoch": 4.942512420156139, + "grad_norm": 6.789836406707764, + "learning_rate": 9.50597586941093e-05, + "loss": 0.05636190176010132, + "step": 34820 + }, + { + "epoch": 4.943931866572036, + "grad_norm": 3.173494815826416, + "learning_rate": 9.505833924769341e-05, + "loss": 0.050359517335891724, + "step": 34830 + }, + { + "epoch": 4.945351312987935, + "grad_norm": 1.0671789646148682, + "learning_rate": 9.505691980127751e-05, + "loss": 0.06796733736991882, + "step": 34840 + }, + { + "epoch": 4.946770759403832, + "grad_norm": 0.4082445800304413, + "learning_rate": 9.50555003548616e-05, + "loss": 0.06941262483596802, + "step": 34850 + }, + { + "epoch": 4.9481902058197305, + "grad_norm": 12.27849292755127, + "learning_rate": 9.50540809084457e-05, + "loss": 0.10136985778808594, + "step": 34860 + }, + { + "epoch": 4.949609652235628, + "grad_norm": 9.443519592285156, + "learning_rate": 9.505266146202981e-05, + "loss": 0.07273834943771362, 
+ "step": 34870 + }, + { + "epoch": 4.951029098651526, + "grad_norm": 1.492857575416565, + "learning_rate": 9.505124201561391e-05, + "loss": 0.07884240746498108, + "step": 34880 + }, + { + "epoch": 4.952448545067424, + "grad_norm": 0.09069759398698807, + "learning_rate": 9.504982256919802e-05, + "loss": 0.03405750691890717, + "step": 34890 + }, + { + "epoch": 4.953867991483321, + "grad_norm": 3.4466402530670166, + "learning_rate": 9.504840312278212e-05, + "loss": 0.060506463050842285, + "step": 34900 + }, + { + "epoch": 4.955287437899219, + "grad_norm": 5.695678234100342, + "learning_rate": 9.504698367636622e-05, + "loss": 0.08132978677749633, + "step": 34910 + }, + { + "epoch": 4.956706884315117, + "grad_norm": 1.3141629695892334, + "learning_rate": 9.504556422995033e-05, + "loss": 0.08595213890075684, + "step": 34920 + }, + { + "epoch": 4.958126330731015, + "grad_norm": 1.775640606880188, + "learning_rate": 9.504414478353443e-05, + "loss": 0.08925971984863282, + "step": 34930 + }, + { + "epoch": 4.9595457771469125, + "grad_norm": 1.661397933959961, + "learning_rate": 9.504272533711854e-05, + "loss": 0.057273763418197635, + "step": 34940 + }, + { + "epoch": 4.960965223562811, + "grad_norm": 6.331634998321533, + "learning_rate": 9.504130589070262e-05, + "loss": 0.06684795022010803, + "step": 34950 + }, + { + "epoch": 4.962384669978708, + "grad_norm": 3.3791844844818115, + "learning_rate": 9.503988644428673e-05, + "loss": 0.02721550464630127, + "step": 34960 + }, + { + "epoch": 4.963804116394606, + "grad_norm": 3.0752956867218018, + "learning_rate": 9.503846699787083e-05, + "loss": 0.07388071417808532, + "step": 34970 + }, + { + "epoch": 4.965223562810504, + "grad_norm": 4.732667922973633, + "learning_rate": 9.503704755145494e-05, + "loss": 0.049740397930145265, + "step": 34980 + }, + { + "epoch": 4.966643009226401, + "grad_norm": 0.340250700712204, + "learning_rate": 9.503562810503904e-05, + "loss": 0.051161551475524904, + "step": 34990 + }, + { + "epoch": 
4.9680624556423, + "grad_norm": 2.337019443511963, + "learning_rate": 9.503420865862313e-05, + "loss": 0.08789908289909362, + "step": 35000 + }, + { + "epoch": 4.9680624556423, + "eval_accuracy": 0.96986074903033, + "eval_loss": 0.09208517521619797, + "eval_runtime": 35.1679, + "eval_samples_per_second": 447.198, + "eval_steps_per_second": 13.99, + "step": 35000 + }, + { + "epoch": 4.969481902058197, + "grad_norm": 1.2752114534378052, + "learning_rate": 9.503278921220724e-05, + "loss": 0.031662821769714355, + "step": 35010 + }, + { + "epoch": 4.970901348474095, + "grad_norm": 4.962484359741211, + "learning_rate": 9.503136976579134e-05, + "loss": 0.03634861707687378, + "step": 35020 + }, + { + "epoch": 4.972320794889993, + "grad_norm": 3.9700536727905273, + "learning_rate": 9.502995031937545e-05, + "loss": 0.09397113919258118, + "step": 35030 + }, + { + "epoch": 4.97374024130589, + "grad_norm": 1.4326708316802979, + "learning_rate": 9.502853087295955e-05, + "loss": 0.055845755338668826, + "step": 35040 + }, + { + "epoch": 4.9751596877217885, + "grad_norm": 0.2545025646686554, + "learning_rate": 9.502711142654366e-05, + "loss": 0.07820132970809937, + "step": 35050 + }, + { + "epoch": 4.976579134137686, + "grad_norm": 0.48985761404037476, + "learning_rate": 9.502569198012775e-05, + "loss": 0.050520259141922, + "step": 35060 + }, + { + "epoch": 4.977998580553584, + "grad_norm": 2.2398428916931152, + "learning_rate": 9.502427253371186e-05, + "loss": 0.061135333776473996, + "step": 35070 + }, + { + "epoch": 4.979418026969482, + "grad_norm": 1.8560504913330078, + "learning_rate": 9.502285308729595e-05, + "loss": 0.04997407197952271, + "step": 35080 + }, + { + "epoch": 4.98083747338538, + "grad_norm": 16.011417388916016, + "learning_rate": 9.502143364088006e-05, + "loss": 0.1147346019744873, + "step": 35090 + }, + { + "epoch": 4.982256919801277, + "grad_norm": 3.327463388442993, + "learning_rate": 9.502001419446418e-05, + "loss": 0.06455175876617432, + "step": 35100 + }, + 
{ + "epoch": 4.983676366217175, + "grad_norm": 2.0864782333374023, + "learning_rate": 9.501859474804826e-05, + "loss": 0.11143285036087036, + "step": 35110 + }, + { + "epoch": 4.985095812633073, + "grad_norm": 2.2053961753845215, + "learning_rate": 9.501717530163237e-05, + "loss": 0.05626400113105774, + "step": 35120 + }, + { + "epoch": 4.9865152590489705, + "grad_norm": 1.0315237045288086, + "learning_rate": 9.501575585521647e-05, + "loss": 0.04532181918621063, + "step": 35130 + }, + { + "epoch": 4.987934705464869, + "grad_norm": 5.396690845489502, + "learning_rate": 9.501433640880058e-05, + "loss": 0.10503163337707519, + "step": 35140 + }, + { + "epoch": 4.989354151880766, + "grad_norm": 10.326635360717773, + "learning_rate": 9.501291696238468e-05, + "loss": 0.09536985158920289, + "step": 35150 + }, + { + "epoch": 4.990773598296665, + "grad_norm": 1.1461143493652344, + "learning_rate": 9.501149751596877e-05, + "loss": 0.07849235534667968, + "step": 35160 + }, + { + "epoch": 4.992193044712562, + "grad_norm": 5.380537033081055, + "learning_rate": 9.501007806955287e-05, + "loss": 0.09231789708137512, + "step": 35170 + }, + { + "epoch": 4.99361249112846, + "grad_norm": 3.530773639678955, + "learning_rate": 9.500865862313698e-05, + "loss": 0.1057739019393921, + "step": 35180 + }, + { + "epoch": 4.995031937544358, + "grad_norm": 7.351833820343018, + "learning_rate": 9.500723917672109e-05, + "loss": 0.09145522713661194, + "step": 35190 + }, + { + "epoch": 4.996451383960255, + "grad_norm": 16.75145721435547, + "learning_rate": 9.500581973030519e-05, + "loss": 0.06758233308792114, + "step": 35200 + }, + { + "epoch": 4.997870830376153, + "grad_norm": 2.8542025089263916, + "learning_rate": 9.500440028388929e-05, + "loss": 0.0541307270526886, + "step": 35210 + }, + { + "epoch": 4.999290276792051, + "grad_norm": 0.6312423348426819, + "learning_rate": 9.500298083747338e-05, + "loss": 0.09540512561798095, + "step": 35220 + }, + { + "epoch": 5.000709723207949, + "grad_norm": 
2.2039694786071777, + "learning_rate": 9.50015613910575e-05, + "loss": 0.07570767402648926, + "step": 35230 + }, + { + "epoch": 5.002129169623847, + "grad_norm": 3.426652431488037, + "learning_rate": 9.500014194464159e-05, + "loss": 0.07772423624992371, + "step": 35240 + }, + { + "epoch": 5.003548616039745, + "grad_norm": 7.052762985229492, + "learning_rate": 9.49987224982257e-05, + "loss": 0.0919622004032135, + "step": 35250 + }, + { + "epoch": 5.004968062455642, + "grad_norm": 1.8353569507598877, + "learning_rate": 9.499730305180979e-05, + "loss": 0.07758974432945251, + "step": 35260 + }, + { + "epoch": 5.00638750887154, + "grad_norm": 0.22881099581718445, + "learning_rate": 9.49958836053939e-05, + "loss": 0.06562204360961914, + "step": 35270 + }, + { + "epoch": 5.007806955287438, + "grad_norm": 6.86403751373291, + "learning_rate": 9.499446415897801e-05, + "loss": 0.056407433748245236, + "step": 35280 + }, + { + "epoch": 5.009226401703335, + "grad_norm": 4.984923362731934, + "learning_rate": 9.49930447125621e-05, + "loss": 0.04461563229560852, + "step": 35290 + }, + { + "epoch": 5.010645848119234, + "grad_norm": 6.596005916595459, + "learning_rate": 9.499162526614622e-05, + "loss": 0.08564714789390564, + "step": 35300 + }, + { + "epoch": 5.012065294535131, + "grad_norm": 1.0412707328796387, + "learning_rate": 9.49902058197303e-05, + "loss": 0.05729523301124573, + "step": 35310 + }, + { + "epoch": 5.0134847409510295, + "grad_norm": 2.90087628364563, + "learning_rate": 9.498878637331441e-05, + "loss": 0.032120704650878906, + "step": 35320 + }, + { + "epoch": 5.014904187366927, + "grad_norm": 4.575965404510498, + "learning_rate": 9.498736692689851e-05, + "loss": 0.07489084005355835, + "step": 35330 + }, + { + "epoch": 5.016323633782824, + "grad_norm": 7.065843105316162, + "learning_rate": 9.498594748048262e-05, + "loss": 0.10670671463012696, + "step": 35340 + }, + { + "epoch": 5.017743080198723, + "grad_norm": 0.6793409585952759, + "learning_rate": 
9.498452803406672e-05, + "loss": 0.09802578687667847, + "step": 35350 + }, + { + "epoch": 5.01916252661462, + "grad_norm": 5.977087020874023, + "learning_rate": 9.498310858765082e-05, + "loss": 0.06022813320159912, + "step": 35360 + }, + { + "epoch": 5.020581973030518, + "grad_norm": 5.02726936340332, + "learning_rate": 9.498168914123493e-05, + "loss": 0.04116607308387756, + "step": 35370 + }, + { + "epoch": 5.022001419446416, + "grad_norm": 2.7055563926696777, + "learning_rate": 9.498026969481902e-05, + "loss": 0.02340538948774338, + "step": 35380 + }, + { + "epoch": 5.023420865862314, + "grad_norm": 8.319058418273926, + "learning_rate": 9.497885024840313e-05, + "loss": 0.03942314982414245, + "step": 35390 + }, + { + "epoch": 5.0248403122782115, + "grad_norm": 1.0293197631835938, + "learning_rate": 9.497743080198723e-05, + "loss": 0.03575259149074554, + "step": 35400 + }, + { + "epoch": 5.026259758694109, + "grad_norm": 4.049915790557861, + "learning_rate": 9.497601135557134e-05, + "loss": 0.06752086877822876, + "step": 35410 + }, + { + "epoch": 5.027679205110007, + "grad_norm": 0.5083687901496887, + "learning_rate": 9.497459190915543e-05, + "loss": 0.04853183031082153, + "step": 35420 + }, + { + "epoch": 5.029098651525905, + "grad_norm": 0.34340715408325195, + "learning_rate": 9.497317246273954e-05, + "loss": 0.04110357165336609, + "step": 35430 + }, + { + "epoch": 5.030518097941803, + "grad_norm": 6.67487096786499, + "learning_rate": 9.497175301632364e-05, + "loss": 0.0675375759601593, + "step": 35440 + }, + { + "epoch": 5.0319375443577, + "grad_norm": 2.9003336429595947, + "learning_rate": 9.497033356990775e-05, + "loss": 0.034189680218696596, + "step": 35450 + }, + { + "epoch": 5.033356990773599, + "grad_norm": 3.9349417686462402, + "learning_rate": 9.496891412349184e-05, + "loss": 0.021588873863220216, + "step": 35460 + }, + { + "epoch": 5.034776437189496, + "grad_norm": 0.9912833571434021, + "learning_rate": 9.496749467707594e-05, + "loss": 
0.05585495829582214, + "step": 35470 + }, + { + "epoch": 5.0361958836053935, + "grad_norm": 1.4849556684494019, + "learning_rate": 9.496607523066005e-05, + "loss": 0.05193337798118591, + "step": 35480 + }, + { + "epoch": 5.037615330021292, + "grad_norm": 1.7137938737869263, + "learning_rate": 9.496465578424415e-05, + "loss": 0.030359289050102232, + "step": 35490 + }, + { + "epoch": 5.039034776437189, + "grad_norm": 5.067256450653076, + "learning_rate": 9.496323633782826e-05, + "loss": 0.061655843257904054, + "step": 35500 + }, + { + "epoch": 5.039034776437189, + "eval_accuracy": 0.9721498060660011, + "eval_loss": 0.08636458963155746, + "eval_runtime": 34.3468, + "eval_samples_per_second": 457.888, + "eval_steps_per_second": 14.324, + "step": 35500 + }, + { + "epoch": 5.0404542228530875, + "grad_norm": 2.4241161346435547, + "learning_rate": 9.496181689141236e-05, + "loss": 0.046561521291732785, + "step": 35510 + }, + { + "epoch": 5.041873669268985, + "grad_norm": 0.9589247703552246, + "learning_rate": 9.496039744499645e-05, + "loss": 0.028034707903861998, + "step": 35520 + }, + { + "epoch": 5.043293115684883, + "grad_norm": 0.2947952449321747, + "learning_rate": 9.495897799858055e-05, + "loss": 0.08396986126899719, + "step": 35530 + }, + { + "epoch": 5.044712562100781, + "grad_norm": 3.244903564453125, + "learning_rate": 9.495755855216466e-05, + "loss": 0.04211449921131134, + "step": 35540 + }, + { + "epoch": 5.046132008516678, + "grad_norm": 8.584826469421387, + "learning_rate": 9.495613910574876e-05, + "loss": 0.06612651944160461, + "step": 35550 + }, + { + "epoch": 5.047551454932576, + "grad_norm": 4.459551811218262, + "learning_rate": 9.495471965933287e-05, + "loss": 0.040356886386871335, + "step": 35560 + }, + { + "epoch": 5.048970901348474, + "grad_norm": 9.547006607055664, + "learning_rate": 9.495330021291697e-05, + "loss": 0.04492848515510559, + "step": 35570 + }, + { + "epoch": 5.050390347764372, + "grad_norm": 4.3004374504089355, + "learning_rate": 
9.495188076650107e-05, + "loss": 0.05426920652389526, + "step": 35580 + }, + { + "epoch": 5.0518097941802695, + "grad_norm": 1.8567450046539307, + "learning_rate": 9.495046132008518e-05, + "loss": 0.028099411725997926, + "step": 35590 + }, + { + "epoch": 5.053229240596168, + "grad_norm": 2.021097183227539, + "learning_rate": 9.494904187366927e-05, + "loss": 0.04831460118293762, + "step": 35600 + }, + { + "epoch": 5.054648687012065, + "grad_norm": 6.291840076446533, + "learning_rate": 9.494762242725339e-05, + "loss": 0.04899407923221588, + "step": 35610 + }, + { + "epoch": 5.056068133427963, + "grad_norm": 2.291241407394409, + "learning_rate": 9.494620298083747e-05, + "loss": 0.052272289991378784, + "step": 35620 + }, + { + "epoch": 5.057487579843861, + "grad_norm": 9.583127975463867, + "learning_rate": 9.494478353442158e-05, + "loss": 0.05495100021362305, + "step": 35630 + }, + { + "epoch": 5.058907026259758, + "grad_norm": 7.2725830078125, + "learning_rate": 9.494336408800568e-05, + "loss": 0.07137655615806579, + "step": 35640 + }, + { + "epoch": 5.060326472675657, + "grad_norm": 9.047038078308105, + "learning_rate": 9.494194464158979e-05, + "loss": 0.0471313625574112, + "step": 35650 + }, + { + "epoch": 5.061745919091554, + "grad_norm": 3.2609193325042725, + "learning_rate": 9.494052519517389e-05, + "loss": 0.0600260317325592, + "step": 35660 + }, + { + "epoch": 5.063165365507452, + "grad_norm": 3.8911261558532715, + "learning_rate": 9.493910574875798e-05, + "loss": 0.05886413455009461, + "step": 35670 + }, + { + "epoch": 5.06458481192335, + "grad_norm": 3.445101737976074, + "learning_rate": 9.49376863023421e-05, + "loss": 0.0769159197807312, + "step": 35680 + }, + { + "epoch": 5.066004258339247, + "grad_norm": 4.021439552307129, + "learning_rate": 9.493626685592619e-05, + "loss": 0.08424595594406128, + "step": 35690 + }, + { + "epoch": 5.067423704755146, + "grad_norm": 1.7676706314086914, + "learning_rate": 9.49348474095103e-05, + "loss": 0.029396337270736695, + 
"step": 35700 + }, + { + "epoch": 5.068843151171043, + "grad_norm": 4.876907825469971, + "learning_rate": 9.49334279630944e-05, + "loss": 0.035354167222976685, + "step": 35710 + }, + { + "epoch": 5.070262597586941, + "grad_norm": 0.8973761796951294, + "learning_rate": 9.49320085166785e-05, + "loss": 0.04921911954879761, + "step": 35720 + }, + { + "epoch": 5.071682044002839, + "grad_norm": 10.738030433654785, + "learning_rate": 9.49305890702626e-05, + "loss": 0.10539579391479492, + "step": 35730 + }, + { + "epoch": 5.073101490418737, + "grad_norm": 0.2019427865743637, + "learning_rate": 9.49291696238467e-05, + "loss": 0.0512103259563446, + "step": 35740 + }, + { + "epoch": 5.0745209368346345, + "grad_norm": 5.051251411437988, + "learning_rate": 9.49277501774308e-05, + "loss": 0.06000564694404602, + "step": 35750 + }, + { + "epoch": 5.075940383250532, + "grad_norm": 2.901967763900757, + "learning_rate": 9.492633073101491e-05, + "loss": 0.07291017174720764, + "step": 35760 + }, + { + "epoch": 5.07735982966643, + "grad_norm": 1.4676152467727661, + "learning_rate": 9.492491128459901e-05, + "loss": 0.06764619946479797, + "step": 35770 + }, + { + "epoch": 5.078779276082328, + "grad_norm": 11.876858711242676, + "learning_rate": 9.492349183818311e-05, + "loss": 0.060235893726348876, + "step": 35780 + }, + { + "epoch": 5.080198722498226, + "grad_norm": 9.063863754272461, + "learning_rate": 9.492207239176722e-05, + "loss": 0.03448966443538666, + "step": 35790 + }, + { + "epoch": 5.081618168914123, + "grad_norm": 1.5545835494995117, + "learning_rate": 9.492065294535132e-05, + "loss": 0.06110445261001587, + "step": 35800 + }, + { + "epoch": 5.083037615330022, + "grad_norm": 3.2116994857788086, + "learning_rate": 9.491923349893543e-05, + "loss": 0.06111306548118591, + "step": 35810 + }, + { + "epoch": 5.084457061745919, + "grad_norm": 0.2644821107387543, + "learning_rate": 9.491781405251953e-05, + "loss": 0.05563850402832031, + "step": 35820 + }, + { + "epoch": 
5.0858765081618165, + "grad_norm": 2.6873834133148193, + "learning_rate": 9.491639460610362e-05, + "loss": 0.08348619937896729, + "step": 35830 + }, + { + "epoch": 5.087295954577715, + "grad_norm": 6.330774784088135, + "learning_rate": 9.491497515968772e-05, + "loss": 0.07018688321113586, + "step": 35840 + }, + { + "epoch": 5.088715400993612, + "grad_norm": 1.0025646686553955, + "learning_rate": 9.491355571327183e-05, + "loss": 0.050104659795761106, + "step": 35850 + }, + { + "epoch": 5.0901348474095105, + "grad_norm": 2.3368682861328125, + "learning_rate": 9.491213626685593e-05, + "loss": 0.04211297333240509, + "step": 35860 + }, + { + "epoch": 5.091554293825408, + "grad_norm": 0.6230148077011108, + "learning_rate": 9.491071682044004e-05, + "loss": 0.049572864174842836, + "step": 35870 + }, + { + "epoch": 5.092973740241306, + "grad_norm": 0.10699428617954254, + "learning_rate": 9.490929737402414e-05, + "loss": 0.00935342162847519, + "step": 35880 + }, + { + "epoch": 5.094393186657204, + "grad_norm": 6.48928689956665, + "learning_rate": 9.490787792760823e-05, + "loss": 0.041819396615028384, + "step": 35890 + }, + { + "epoch": 5.095812633073101, + "grad_norm": 4.529843807220459, + "learning_rate": 9.490645848119234e-05, + "loss": 0.03696204125881195, + "step": 35900 + }, + { + "epoch": 5.097232079488999, + "grad_norm": 6.805991172790527, + "learning_rate": 9.490503903477644e-05, + "loss": 0.055303680896759036, + "step": 35910 + }, + { + "epoch": 5.098651525904897, + "grad_norm": 1.9501713514328003, + "learning_rate": 9.490361958836055e-05, + "loss": 0.11662576198577881, + "step": 35920 + }, + { + "epoch": 5.100070972320795, + "grad_norm": 0.07518387585878372, + "learning_rate": 9.490220014194464e-05, + "loss": 0.029073578119277955, + "step": 35930 + }, + { + "epoch": 5.1014904187366925, + "grad_norm": 0.3094475567340851, + "learning_rate": 9.490078069552875e-05, + "loss": 0.01400664895772934, + "step": 35940 + }, + { + "epoch": 5.102909865152591, + "grad_norm": 
16.787120819091797, + "learning_rate": 9.489936124911285e-05, + "loss": 0.06732473373413086, + "step": 35950 + }, + { + "epoch": 5.104329311568488, + "grad_norm": 0.21593448519706726, + "learning_rate": 9.489794180269696e-05, + "loss": 0.042519426345825194, + "step": 35960 + }, + { + "epoch": 5.105748757984386, + "grad_norm": 5.305930137634277, + "learning_rate": 9.489652235628105e-05, + "loss": 0.05054143667221069, + "step": 35970 + }, + { + "epoch": 5.107168204400284, + "grad_norm": 8.940295219421387, + "learning_rate": 9.489510290986515e-05, + "loss": 0.06974593400955201, + "step": 35980 + }, + { + "epoch": 5.108587650816181, + "grad_norm": 0.44597089290618896, + "learning_rate": 9.489368346344926e-05, + "loss": 0.11333968639373779, + "step": 35990 + }, + { + "epoch": 5.11000709723208, + "grad_norm": 1.9616674184799194, + "learning_rate": 9.489226401703336e-05, + "loss": 0.03930683135986328, + "step": 36000 + }, + { + "epoch": 5.11000709723208, + "eval_accuracy": 0.9638201818528646, + "eval_loss": 0.12829390168190002, + "eval_runtime": 35.5371, + "eval_samples_per_second": 442.551, + "eval_steps_per_second": 13.845, + "step": 36000 + }, + { + "epoch": 5.111426543647977, + "grad_norm": 0.7007933855056763, + "learning_rate": 9.489084457061747e-05, + "loss": 0.07342724800109864, + "step": 36010 + }, + { + "epoch": 5.112845990063875, + "grad_norm": 8.156115531921387, + "learning_rate": 9.488942512420157e-05, + "loss": 0.05496933460235596, + "step": 36020 + }, + { + "epoch": 5.114265436479773, + "grad_norm": 8.341107368469238, + "learning_rate": 9.488800567778566e-05, + "loss": 0.07536699771881103, + "step": 36030 + }, + { + "epoch": 5.115684882895671, + "grad_norm": 2.163313627243042, + "learning_rate": 9.488658623136976e-05, + "loss": 0.06941324472427368, + "step": 36040 + }, + { + "epoch": 5.1171043293115686, + "grad_norm": 7.8382887840271, + "learning_rate": 9.488516678495387e-05, + "loss": 0.0580519437789917, + "step": 36050 + }, + { + "epoch": 
5.118523775727466, + "grad_norm": 0.9207919239997864, + "learning_rate": 9.488374733853797e-05, + "loss": 0.08848693370819091, + "step": 36060 + }, + { + "epoch": 5.119943222143364, + "grad_norm": 4.699718475341797, + "learning_rate": 9.488232789212208e-05, + "loss": 0.05136229991912842, + "step": 36070 + }, + { + "epoch": 5.121362668559262, + "grad_norm": 4.970333099365234, + "learning_rate": 9.488090844570618e-05, + "loss": 0.04451129138469696, + "step": 36080 + }, + { + "epoch": 5.12278211497516, + "grad_norm": 0.04377421736717224, + "learning_rate": 9.487948899929028e-05, + "loss": 0.04046821594238281, + "step": 36090 + }, + { + "epoch": 5.124201561391057, + "grad_norm": 0.19261276721954346, + "learning_rate": 9.487806955287439e-05, + "loss": 0.04373805820941925, + "step": 36100 + }, + { + "epoch": 5.125621007806956, + "grad_norm": 4.2963056564331055, + "learning_rate": 9.487665010645848e-05, + "loss": 0.021022433042526247, + "step": 36110 + }, + { + "epoch": 5.127040454222853, + "grad_norm": 7.420901775360107, + "learning_rate": 9.48752306600426e-05, + "loss": 0.0632928729057312, + "step": 36120 + }, + { + "epoch": 5.128459900638751, + "grad_norm": 5.440396785736084, + "learning_rate": 9.487381121362669e-05, + "loss": 0.09147984385490418, + "step": 36130 + }, + { + "epoch": 5.129879347054649, + "grad_norm": 7.403855800628662, + "learning_rate": 9.487239176721079e-05, + "loss": 0.05113822817802429, + "step": 36140 + }, + { + "epoch": 5.131298793470546, + "grad_norm": 1.6538256406784058, + "learning_rate": 9.487097232079489e-05, + "loss": 0.06598106026649475, + "step": 36150 + }, + { + "epoch": 5.132718239886445, + "grad_norm": 0.8752590417861938, + "learning_rate": 9.4869552874379e-05, + "loss": 0.03561938405036926, + "step": 36160 + }, + { + "epoch": 5.134137686302342, + "grad_norm": 2.622938632965088, + "learning_rate": 9.48681334279631e-05, + "loss": 0.08233972787857055, + "step": 36170 + }, + { + "epoch": 5.13555713271824, + "grad_norm": 
0.23157648742198944, + "learning_rate": 9.486671398154721e-05, + "loss": 0.06924783587455749, + "step": 36180 + }, + { + "epoch": 5.136976579134138, + "grad_norm": 2.9473605155944824, + "learning_rate": 9.48652945351313e-05, + "loss": 0.07875468730926513, + "step": 36190 + }, + { + "epoch": 5.138396025550035, + "grad_norm": 10.965639114379883, + "learning_rate": 9.48638750887154e-05, + "loss": 0.058244621753692626, + "step": 36200 + }, + { + "epoch": 5.1398154719659335, + "grad_norm": 0.24280737340450287, + "learning_rate": 9.486245564229951e-05, + "loss": 0.07738088965415954, + "step": 36210 + }, + { + "epoch": 5.141234918381831, + "grad_norm": 7.49735164642334, + "learning_rate": 9.486103619588361e-05, + "loss": 0.03476256728172302, + "step": 36220 + }, + { + "epoch": 5.142654364797729, + "grad_norm": 1.5520763397216797, + "learning_rate": 9.485961674946772e-05, + "loss": 0.04197014570236206, + "step": 36230 + }, + { + "epoch": 5.144073811213627, + "grad_norm": 4.975586414337158, + "learning_rate": 9.48581973030518e-05, + "loss": 0.06271924376487732, + "step": 36240 + }, + { + "epoch": 5.145493257629525, + "grad_norm": 7.479091644287109, + "learning_rate": 9.485677785663592e-05, + "loss": 0.08511911630630493, + "step": 36250 + }, + { + "epoch": 5.146912704045422, + "grad_norm": 0.14201731979846954, + "learning_rate": 9.485535841022001e-05, + "loss": 0.03889679312705994, + "step": 36260 + }, + { + "epoch": 5.14833215046132, + "grad_norm": 1.5078015327453613, + "learning_rate": 9.485393896380412e-05, + "loss": 0.07731766104698182, + "step": 36270 + }, + { + "epoch": 5.149751596877218, + "grad_norm": 0.5280294418334961, + "learning_rate": 9.485251951738822e-05, + "loss": 0.03280209302902222, + "step": 36280 + }, + { + "epoch": 5.1511710432931155, + "grad_norm": 1.5268537998199463, + "learning_rate": 9.485110007097232e-05, + "loss": 0.03509989082813263, + "step": 36290 + }, + { + "epoch": 5.152590489709014, + "grad_norm": 0.8695167303085327, + "learning_rate": 
9.484968062455643e-05, + "loss": 0.016241730749607088, + "step": 36300 + }, + { + "epoch": 5.154009936124911, + "grad_norm": 7.062769889831543, + "learning_rate": 9.484826117814053e-05, + "loss": 0.06393226981163025, + "step": 36310 + }, + { + "epoch": 5.1554293825408095, + "grad_norm": 8.141936302185059, + "learning_rate": 9.484684173172464e-05, + "loss": 0.037215083837509155, + "step": 36320 + }, + { + "epoch": 5.156848828956707, + "grad_norm": 1.9921444654464722, + "learning_rate": 9.484542228530874e-05, + "loss": 0.04678789079189301, + "step": 36330 + }, + { + "epoch": 5.158268275372604, + "grad_norm": 0.44700711965560913, + "learning_rate": 9.484400283889283e-05, + "loss": 0.0651922881603241, + "step": 36340 + }, + { + "epoch": 5.159687721788503, + "grad_norm": 1.856458067893982, + "learning_rate": 9.484258339247693e-05, + "loss": 0.04245249330997467, + "step": 36350 + }, + { + "epoch": 5.1611071682044, + "grad_norm": 10.253634452819824, + "learning_rate": 9.484116394606104e-05, + "loss": 0.07513232231140136, + "step": 36360 + }, + { + "epoch": 5.162526614620298, + "grad_norm": 0.31568190455436707, + "learning_rate": 9.483974449964514e-05, + "loss": 0.08399287462234498, + "step": 36370 + }, + { + "epoch": 5.163946061036196, + "grad_norm": 1.2396879196166992, + "learning_rate": 9.483832505322925e-05, + "loss": 0.11422721147537232, + "step": 36380 + }, + { + "epoch": 5.165365507452094, + "grad_norm": 4.058791160583496, + "learning_rate": 9.483690560681335e-05, + "loss": 0.07308083772659302, + "step": 36390 + }, + { + "epoch": 5.1667849538679915, + "grad_norm": 5.865930557250977, + "learning_rate": 9.483548616039744e-05, + "loss": 0.057479435205459596, + "step": 36400 + }, + { + "epoch": 5.168204400283889, + "grad_norm": 13.093545913696289, + "learning_rate": 9.483406671398155e-05, + "loss": 0.050871860980987546, + "step": 36410 + }, + { + "epoch": 5.169623846699787, + "grad_norm": 2.405416250228882, + "learning_rate": 9.483264726756565e-05, + "loss": 
0.02025536894798279, + "step": 36420 + }, + { + "epoch": 5.171043293115685, + "grad_norm": 0.4331030249595642, + "learning_rate": 9.483122782114976e-05, + "loss": 0.01977370083332062, + "step": 36430 + }, + { + "epoch": 5.172462739531583, + "grad_norm": 3.3781073093414307, + "learning_rate": 9.482980837473385e-05, + "loss": 0.041225132346153257, + "step": 36440 + }, + { + "epoch": 5.17388218594748, + "grad_norm": 7.368563175201416, + "learning_rate": 9.482838892831796e-05, + "loss": 0.0713624656200409, + "step": 36450 + }, + { + "epoch": 5.175301632363379, + "grad_norm": 4.2806878089904785, + "learning_rate": 9.482696948190206e-05, + "loss": 0.044384431838989255, + "step": 36460 + }, + { + "epoch": 5.176721078779276, + "grad_norm": 1.133421778678894, + "learning_rate": 9.482555003548617e-05, + "loss": 0.0395077645778656, + "step": 36470 + }, + { + "epoch": 5.1781405251951735, + "grad_norm": 5.6397929191589355, + "learning_rate": 9.482413058907026e-05, + "loss": 0.07517208456993103, + "step": 36480 + }, + { + "epoch": 5.179559971611072, + "grad_norm": 4.359251499176025, + "learning_rate": 9.482271114265437e-05, + "loss": 0.05270699858665466, + "step": 36490 + }, + { + "epoch": 5.180979418026969, + "grad_norm": 0.030488723888993263, + "learning_rate": 9.482129169623847e-05, + "loss": 0.0488809198141098, + "step": 36500 + }, + { + "epoch": 5.180979418026969, + "eval_accuracy": 0.9735486742544669, + "eval_loss": 0.07985691726207733, + "eval_runtime": 34.4709, + "eval_samples_per_second": 456.24, + "eval_steps_per_second": 14.273, + "step": 36500 + }, + { + "epoch": 5.182398864442868, + "grad_norm": 4.528557777404785, + "learning_rate": 9.481987224982257e-05, + "loss": 0.050593554973602295, + "step": 36510 + }, + { + "epoch": 5.183818310858765, + "grad_norm": 11.558876991271973, + "learning_rate": 9.481845280340668e-05, + "loss": 0.07353735566139222, + "step": 36520 + }, + { + "epoch": 5.185237757274663, + "grad_norm": 5.571136474609375, + "learning_rate": 
9.481703335699078e-05, + "loss": 0.046426203846931455, + "step": 36530 + }, + { + "epoch": 5.186657203690561, + "grad_norm": 5.435025215148926, + "learning_rate": 9.481561391057489e-05, + "loss": 0.0408222883939743, + "step": 36540 + }, + { + "epoch": 5.188076650106458, + "grad_norm": 5.731179714202881, + "learning_rate": 9.481419446415897e-05, + "loss": 0.07564018964767456, + "step": 36550 + }, + { + "epoch": 5.189496096522356, + "grad_norm": 2.5727474689483643, + "learning_rate": 9.481277501774308e-05, + "loss": 0.05871484279632568, + "step": 36560 + }, + { + "epoch": 5.190915542938254, + "grad_norm": 12.591144561767578, + "learning_rate": 9.481135557132718e-05, + "loss": 0.08494226336479187, + "step": 36570 + }, + { + "epoch": 5.192334989354152, + "grad_norm": 4.414670944213867, + "learning_rate": 9.480993612491129e-05, + "loss": 0.05334811806678772, + "step": 36580 + }, + { + "epoch": 5.19375443577005, + "grad_norm": 0.2967151403427124, + "learning_rate": 9.48085166784954e-05, + "loss": 0.020326825976371764, + "step": 36590 + }, + { + "epoch": 5.195173882185948, + "grad_norm": 2.503615617752075, + "learning_rate": 9.480709723207949e-05, + "loss": 0.025067511200904845, + "step": 36600 + }, + { + "epoch": 5.196593328601845, + "grad_norm": 0.2529163062572479, + "learning_rate": 9.48056777856636e-05, + "loss": 0.018686428666114807, + "step": 36610 + }, + { + "epoch": 5.198012775017743, + "grad_norm": 2.891233444213867, + "learning_rate": 9.48042583392477e-05, + "loss": 0.06542560458183289, + "step": 36620 + }, + { + "epoch": 5.199432221433641, + "grad_norm": 6.781946182250977, + "learning_rate": 9.48028388928318e-05, + "loss": 0.03696680366992951, + "step": 36630 + }, + { + "epoch": 5.2008516678495385, + "grad_norm": 0.25263258814811707, + "learning_rate": 9.48014194464159e-05, + "loss": 0.05601266026496887, + "step": 36640 + }, + { + "epoch": 5.202271114265437, + "grad_norm": 3.6889445781707764, + "learning_rate": 9.48e-05, + "loss": 0.04601848125457764, + "step": 
36650 + }, + { + "epoch": 5.203690560681334, + "grad_norm": 6.032188415527344, + "learning_rate": 9.47985805535841e-05, + "loss": 0.03191192746162415, + "step": 36660 + }, + { + "epoch": 5.2051100070972325, + "grad_norm": 2.5597052574157715, + "learning_rate": 9.479716110716821e-05, + "loss": 0.08680691719055175, + "step": 36670 + }, + { + "epoch": 5.20652945351313, + "grad_norm": 1.3042011260986328, + "learning_rate": 9.479574166075232e-05, + "loss": 0.07259045839309693, + "step": 36680 + }, + { + "epoch": 5.207948899929027, + "grad_norm": 6.466866970062256, + "learning_rate": 9.479432221433642e-05, + "loss": 0.1063350796699524, + "step": 36690 + }, + { + "epoch": 5.209368346344926, + "grad_norm": 3.7391083240509033, + "learning_rate": 9.479290276792051e-05, + "loss": 0.06057397127151489, + "step": 36700 + }, + { + "epoch": 5.210787792760823, + "grad_norm": 5.033336162567139, + "learning_rate": 9.479148332150461e-05, + "loss": 0.03979503214359283, + "step": 36710 + }, + { + "epoch": 5.212207239176721, + "grad_norm": 4.238401889801025, + "learning_rate": 9.479006387508872e-05, + "loss": 0.04253645837306976, + "step": 36720 + }, + { + "epoch": 5.213626685592619, + "grad_norm": 6.895720958709717, + "learning_rate": 9.478864442867282e-05, + "loss": 0.0989041805267334, + "step": 36730 + }, + { + "epoch": 5.215046132008517, + "grad_norm": 0.8805387616157532, + "learning_rate": 9.478722498225693e-05, + "loss": 0.06594863533973694, + "step": 36740 + }, + { + "epoch": 5.2164655784244145, + "grad_norm": 0.778193473815918, + "learning_rate": 9.478580553584101e-05, + "loss": 0.04552145004272461, + "step": 36750 + }, + { + "epoch": 5.217885024840312, + "grad_norm": 1.0025285482406616, + "learning_rate": 9.478438608942513e-05, + "loss": 0.09854941368103028, + "step": 36760 + }, + { + "epoch": 5.21930447125621, + "grad_norm": 2.777564764022827, + "learning_rate": 9.478296664300924e-05, + "loss": 0.07063305974006653, + "step": 36770 + }, + { + "epoch": 5.220723917672108, + 
"grad_norm": 6.000669479370117, + "learning_rate": 9.478154719659333e-05, + "loss": 0.04376820921897888, + "step": 36780 + }, + { + "epoch": 5.222143364088006, + "grad_norm": 9.674980163574219, + "learning_rate": 9.478012775017744e-05, + "loss": 0.039281606674194336, + "step": 36790 + }, + { + "epoch": 5.223562810503903, + "grad_norm": 1.4637360572814941, + "learning_rate": 9.477870830376154e-05, + "loss": 0.04983239769935608, + "step": 36800 + }, + { + "epoch": 5.224982256919802, + "grad_norm": 0.23426099121570587, + "learning_rate": 9.477728885734564e-05, + "loss": 0.07621067762374878, + "step": 36810 + }, + { + "epoch": 5.226401703335699, + "grad_norm": 0.1280343383550644, + "learning_rate": 9.477586941092974e-05, + "loss": 0.07941646575927734, + "step": 36820 + }, + { + "epoch": 5.2278211497515965, + "grad_norm": 6.921216011047363, + "learning_rate": 9.477444996451385e-05, + "loss": 0.08770122528076171, + "step": 36830 + }, + { + "epoch": 5.229240596167495, + "grad_norm": 0.5032868981361389, + "learning_rate": 9.477303051809795e-05, + "loss": 0.04545081257820129, + "step": 36840 + }, + { + "epoch": 5.230660042583392, + "grad_norm": 0.8603189587593079, + "learning_rate": 9.477161107168206e-05, + "loss": 0.09249699711799622, + "step": 36850 + }, + { + "epoch": 5.2320794889992905, + "grad_norm": 3.6071572303771973, + "learning_rate": 9.477019162526615e-05, + "loss": 0.08706681132316589, + "step": 36860 + }, + { + "epoch": 5.233498935415188, + "grad_norm": 0.3033102750778198, + "learning_rate": 9.476877217885025e-05, + "loss": 0.05687382221221924, + "step": 36870 + }, + { + "epoch": 5.234918381831086, + "grad_norm": 1.9104326963424683, + "learning_rate": 9.476735273243436e-05, + "loss": 0.05650158524513245, + "step": 36880 + }, + { + "epoch": 5.236337828246984, + "grad_norm": 0.33066046237945557, + "learning_rate": 9.476593328601846e-05, + "loss": 0.03637203574180603, + "step": 36890 + }, + { + "epoch": 5.237757274662881, + "grad_norm": 9.79216194152832, + 
"learning_rate": 9.476451383960257e-05, + "loss": 0.10526165962219239, + "step": 36900 + }, + { + "epoch": 5.239176721078779, + "grad_norm": 0.6268110871315002, + "learning_rate": 9.476309439318665e-05, + "loss": 0.05048244595527649, + "step": 36910 + }, + { + "epoch": 5.240596167494677, + "grad_norm": 3.2916431427001953, + "learning_rate": 9.476167494677077e-05, + "loss": 0.10257794857025146, + "step": 36920 + }, + { + "epoch": 5.242015613910575, + "grad_norm": 0.32357263565063477, + "learning_rate": 9.476025550035486e-05, + "loss": 0.034938329458236696, + "step": 36930 + }, + { + "epoch": 5.2434350603264726, + "grad_norm": 9.606283187866211, + "learning_rate": 9.475883605393897e-05, + "loss": 0.08114267587661743, + "step": 36940 + }, + { + "epoch": 5.244854506742371, + "grad_norm": 5.870957851409912, + "learning_rate": 9.475741660752307e-05, + "loss": 0.06058700084686279, + "step": 36950 + }, + { + "epoch": 5.246273953158268, + "grad_norm": 4.593484401702881, + "learning_rate": 9.475599716110717e-05, + "loss": 0.05402443408966064, + "step": 36960 + }, + { + "epoch": 5.247693399574166, + "grad_norm": 1.3845820426940918, + "learning_rate": 9.475457771469128e-05, + "loss": 0.047877585887908934, + "step": 36970 + }, + { + "epoch": 5.249112845990064, + "grad_norm": 2.0945143699645996, + "learning_rate": 9.475315826827538e-05, + "loss": 0.05654643177986145, + "step": 36980 + }, + { + "epoch": 5.250532292405961, + "grad_norm": 5.1305131912231445, + "learning_rate": 9.475173882185949e-05, + "loss": 0.09039227962493897, + "step": 36990 + }, + { + "epoch": 5.25195173882186, + "grad_norm": 0.9498898983001709, + "learning_rate": 9.475031937544358e-05, + "loss": 0.05969501733779907, + "step": 37000 + }, + { + "epoch": 5.25195173882186, + "eval_accuracy": 0.9731671647485216, + "eval_loss": 0.08746004849672318, + "eval_runtime": 33.9754, + "eval_samples_per_second": 462.894, + "eval_steps_per_second": 14.481, + "step": 37000 + }, + { + "epoch": 5.253371185237757, + "grad_norm": 
0.6349175572395325, + "learning_rate": 9.474889992902768e-05, + "loss": 0.05443019866943359, + "step": 37010 + }, + { + "epoch": 5.2547906316536555, + "grad_norm": 5.677085876464844, + "learning_rate": 9.474748048261178e-05, + "loss": 0.0985885202884674, + "step": 37020 + }, + { + "epoch": 5.256210078069553, + "grad_norm": 8.88677978515625, + "learning_rate": 9.474606103619589e-05, + "loss": 0.13014729022979737, + "step": 37030 + }, + { + "epoch": 5.25762952448545, + "grad_norm": 5.704558372497559, + "learning_rate": 9.474464158977999e-05, + "loss": 0.043677717447280884, + "step": 37040 + }, + { + "epoch": 5.259048970901349, + "grad_norm": 4.478132724761963, + "learning_rate": 9.47432221433641e-05, + "loss": 0.0692388117313385, + "step": 37050 + }, + { + "epoch": 5.260468417317246, + "grad_norm": 0.5155969262123108, + "learning_rate": 9.47418026969482e-05, + "loss": 0.028971996903419495, + "step": 37060 + }, + { + "epoch": 5.261887863733144, + "grad_norm": 0.47446149587631226, + "learning_rate": 9.47403832505323e-05, + "loss": 0.0572409987449646, + "step": 37070 + }, + { + "epoch": 5.263307310149042, + "grad_norm": 1.3588443994522095, + "learning_rate": 9.47389638041164e-05, + "loss": 0.08811056017875671, + "step": 37080 + }, + { + "epoch": 5.26472675656494, + "grad_norm": 4.575846195220947, + "learning_rate": 9.47375443577005e-05, + "loss": 0.0533088743686676, + "step": 37090 + }, + { + "epoch": 5.2661462029808375, + "grad_norm": 3.1430861949920654, + "learning_rate": 9.473612491128461e-05, + "loss": 0.08389832377433777, + "step": 37100 + }, + { + "epoch": 5.267565649396735, + "grad_norm": 0.3461446166038513, + "learning_rate": 9.47347054648687e-05, + "loss": 0.07585886120796204, + "step": 37110 + }, + { + "epoch": 5.268985095812633, + "grad_norm": 7.318383693695068, + "learning_rate": 9.473328601845281e-05, + "loss": 0.05129183530807495, + "step": 37120 + }, + { + "epoch": 5.270404542228531, + "grad_norm": 3.846140146255493, + "learning_rate": 
9.47318665720369e-05, + "loss": 0.041881787776947024, + "step": 37130 + }, + { + "epoch": 5.271823988644429, + "grad_norm": 6.777071952819824, + "learning_rate": 9.473044712562102e-05, + "loss": 0.0678529143333435, + "step": 37140 + }, + { + "epoch": 5.273243435060326, + "grad_norm": 4.96095085144043, + "learning_rate": 9.472902767920511e-05, + "loss": 0.04452368021011353, + "step": 37150 + }, + { + "epoch": 5.274662881476225, + "grad_norm": 3.6459364891052246, + "learning_rate": 9.472760823278922e-05, + "loss": 0.061465442180633545, + "step": 37160 + }, + { + "epoch": 5.276082327892122, + "grad_norm": 2.436566114425659, + "learning_rate": 9.472618878637332e-05, + "loss": 0.05536556839942932, + "step": 37170 + }, + { + "epoch": 5.2775017743080195, + "grad_norm": 7.050469398498535, + "learning_rate": 9.472476933995742e-05, + "loss": 0.04715914726257324, + "step": 37180 + }, + { + "epoch": 5.278921220723918, + "grad_norm": 1.642188549041748, + "learning_rate": 9.472334989354153e-05, + "loss": 0.03356336355209351, + "step": 37190 + }, + { + "epoch": 5.280340667139815, + "grad_norm": 0.2856753468513489, + "learning_rate": 9.472193044712563e-05, + "loss": 0.03416549563407898, + "step": 37200 + }, + { + "epoch": 5.2817601135557135, + "grad_norm": 3.587663173675537, + "learning_rate": 9.472051100070974e-05, + "loss": 0.05653611421585083, + "step": 37210 + }, + { + "epoch": 5.283179559971611, + "grad_norm": 6.947723865509033, + "learning_rate": 9.471909155429382e-05, + "loss": 0.091396963596344, + "step": 37220 + }, + { + "epoch": 5.284599006387509, + "grad_norm": 7.335968971252441, + "learning_rate": 9.471767210787793e-05, + "loss": 0.03362211585044861, + "step": 37230 + }, + { + "epoch": 5.286018452803407, + "grad_norm": 0.9858488440513611, + "learning_rate": 9.471625266146203e-05, + "loss": 0.04085931479930878, + "step": 37240 + }, + { + "epoch": 5.287437899219304, + "grad_norm": 5.122075080871582, + "learning_rate": 9.471483321504614e-05, + "loss": 0.05024126172065735, 
+ "step": 37250 + }, + { + "epoch": 5.288857345635202, + "grad_norm": null, + "learning_rate": 9.471341376863024e-05, + "loss": 0.06409624218940735, + "step": 37260 + }, + { + "epoch": 5.2902767920511, + "grad_norm": 7.42424201965332, + "learning_rate": 9.471213626685594e-05, + "loss": 0.06915678977966308, + "step": 37270 + }, + { + "epoch": 5.291696238466998, + "grad_norm": 0.18197417259216309, + "learning_rate": 9.471071682044003e-05, + "loss": 0.12947027683258056, + "step": 37280 + }, + { + "epoch": 5.2931156848828955, + "grad_norm": 1.1386280059814453, + "learning_rate": 9.470929737402413e-05, + "loss": 0.0524638831615448, + "step": 37290 + }, + { + "epoch": 5.294535131298794, + "grad_norm": 2.983981132507324, + "learning_rate": 9.470787792760823e-05, + "loss": 0.051947909593582156, + "step": 37300 + }, + { + "epoch": 5.295954577714691, + "grad_norm": 0.2438533753156662, + "learning_rate": 9.470645848119234e-05, + "loss": 0.03344468176364899, + "step": 37310 + }, + { + "epoch": 5.297374024130589, + "grad_norm": 1.9252828359603882, + "learning_rate": 9.470503903477644e-05, + "loss": 0.04618232250213623, + "step": 37320 + }, + { + "epoch": 5.298793470546487, + "grad_norm": 0.39696675539016724, + "learning_rate": 9.470361958836055e-05, + "loss": 0.08025044798851014, + "step": 37330 + }, + { + "epoch": 5.300212916962384, + "grad_norm": 2.89485239982605, + "learning_rate": 9.470220014194465e-05, + "loss": 0.08881823420524597, + "step": 37340 + }, + { + "epoch": 5.301632363378283, + "grad_norm": 8.957134246826172, + "learning_rate": 9.470078069552874e-05, + "loss": 0.15155066251754762, + "step": 37350 + }, + { + "epoch": 5.30305180979418, + "grad_norm": 1.5332341194152832, + "learning_rate": 9.469936124911285e-05, + "loss": 0.0627815842628479, + "step": 37360 + }, + { + "epoch": 5.304471256210078, + "grad_norm": 8.660079956054688, + "learning_rate": 9.469794180269695e-05, + "loss": 0.08449615240097046, + "step": 37370 + }, + { + "epoch": 5.305890702625976, +
"grad_norm": 7.650805473327637, + "learning_rate": 9.469652235628106e-05, + "loss": 0.058890581130981445, + "step": 37380 + }, + { + "epoch": 5.307310149041873, + "grad_norm": 0.46144038438796997, + "learning_rate": 9.469510290986515e-05, + "loss": 0.035400664806365965, + "step": 37390 + }, + { + "epoch": 5.308729595457772, + "grad_norm": 6.973151206970215, + "learning_rate": 9.469368346344926e-05, + "loss": 0.05486550331115723, + "step": 37400 + }, + { + "epoch": 5.310149041873669, + "grad_norm": 5.088735580444336, + "learning_rate": 9.469226401703335e-05, + "loss": 0.0546966552734375, + "step": 37410 + }, + { + "epoch": 5.311568488289567, + "grad_norm": 2.2214155197143555, + "learning_rate": 9.469084457061747e-05, + "loss": 0.08320272564888001, + "step": 37420 + }, + { + "epoch": 5.312987934705465, + "grad_norm": 8.335865020751953, + "learning_rate": 9.468942512420158e-05, + "loss": 0.09053044319152832, + "step": 37430 + }, + { + "epoch": 5.314407381121363, + "grad_norm": 2.290205717086792, + "learning_rate": 9.468800567778566e-05, + "loss": 0.016748277842998503, + "step": 37440 + }, + { + "epoch": 5.31582682753726, + "grad_norm": 6.0070109367370605, + "learning_rate": 9.468658623136977e-05, + "loss": 0.037306949496269226, + "step": 37450 + }, + { + "epoch": 5.317246273953158, + "grad_norm": 0.7360553741455078, + "learning_rate": 9.468516678495387e-05, + "loss": 0.04275312125682831, + "step": 37460 + }, + { + "epoch": 5.318665720369056, + "grad_norm": 1.0693514347076416, + "learning_rate": 9.468374733853798e-05, + "loss": 0.053118348121643066, + "step": 37470 + }, + { + "epoch": 5.320085166784954, + "grad_norm": 0.3035983443260193, + "learning_rate": 9.468232789212208e-05, + "loss": 0.06972123384475708, + "step": 37480 + }, + { + "epoch": 5.321504613200852, + "grad_norm": 5.642818450927734, + "learning_rate": 9.468090844570619e-05, + "loss": 0.05097317695617676, + "step": 37490 + }, + { + "epoch": 5.322924059616749, + "grad_norm": 2.78389835357666, + 
"learning_rate": 9.467948899929027e-05, + "loss": 0.02805333733558655, + "step": 37500 + }, + { + "epoch": 5.322924059616749, + "eval_accuracy": 0.9726584854072614, + "eval_loss": 0.08883775025606155, + "eval_runtime": 33.5333, + "eval_samples_per_second": 468.997, + "eval_steps_per_second": 14.672, + "step": 37500 + }, + { + "epoch": 5.324343506032648, + "grad_norm": 5.956612586975098, + "learning_rate": 9.467806955287438e-05, + "loss": 0.03515351712703705, + "step": 37510 + }, + { + "epoch": 5.325762952448545, + "grad_norm": 11.028221130371094, + "learning_rate": 9.46766501064585e-05, + "loss": 0.07912625670433045, + "step": 37520 + }, + { + "epoch": 5.3271823988644424, + "grad_norm": 10.001873016357422, + "learning_rate": 9.467523066004259e-05, + "loss": 0.06761111617088318, + "step": 37530 + }, + { + "epoch": 5.328601845280341, + "grad_norm": 1.0504896640777588, + "learning_rate": 9.46738112136267e-05, + "loss": 0.06048610210418701, + "step": 37540 + }, + { + "epoch": 5.330021291696238, + "grad_norm": 3.470850944519043, + "learning_rate": 9.467239176721079e-05, + "loss": 0.05224236249923706, + "step": 37550 + }, + { + "epoch": 5.3314407381121365, + "grad_norm": 0.3818908929824829, + "learning_rate": 9.46709723207949e-05, + "loss": 0.03469651639461517, + "step": 37560 + }, + { + "epoch": 5.332860184528034, + "grad_norm": 7.192812919616699, + "learning_rate": 9.4669552874379e-05, + "loss": 0.10160307884216309, + "step": 37570 + }, + { + "epoch": 5.334279630943932, + "grad_norm": 6.561454772949219, + "learning_rate": 9.46681334279631e-05, + "loss": 0.0493013322353363, + "step": 37580 + }, + { + "epoch": 5.33569907735983, + "grad_norm": 4.2305755615234375, + "learning_rate": 9.46667139815472e-05, + "loss": 0.053803282976150515, + "step": 37590 + }, + { + "epoch": 5.337118523775727, + "grad_norm": 3.255946636199951, + "learning_rate": 9.46652945351313e-05, + "loss": 0.05600963830947876, + "step": 37600 + }, + { + "epoch": 5.338537970191625, + "grad_norm": 
7.0095109939575195, + "learning_rate": 9.466387508871541e-05, + "loss": 0.032292142510414124, + "step": 37610 + }, + { + "epoch": 5.339957416607523, + "grad_norm": 3.136387825012207, + "learning_rate": 9.466245564229951e-05, + "loss": 0.0346368670463562, + "step": 37620 + }, + { + "epoch": 5.341376863023421, + "grad_norm": 6.414516448974609, + "learning_rate": 9.466103619588362e-05, + "loss": 0.04215942919254303, + "step": 37630 + }, + { + "epoch": 5.3427963094393185, + "grad_norm": 6.037017345428467, + "learning_rate": 9.465961674946772e-05, + "loss": 0.06676658987998962, + "step": 37640 + }, + { + "epoch": 5.344215755855217, + "grad_norm": 0.9441502690315247, + "learning_rate": 9.465819730305181e-05, + "loss": 0.03622086644172669, + "step": 37650 + }, + { + "epoch": 5.345635202271114, + "grad_norm": 4.125903606414795, + "learning_rate": 9.465677785663591e-05, + "loss": 0.06060633063316345, + "step": 37660 + }, + { + "epoch": 5.347054648687012, + "grad_norm": 4.286660671234131, + "learning_rate": 9.465535841022002e-05, + "loss": 0.044310915470123294, + "step": 37670 + }, + { + "epoch": 5.34847409510291, + "grad_norm": 0.5203285813331604, + "learning_rate": 9.465393896380412e-05, + "loss": 0.07237310409545898, + "step": 37680 + }, + { + "epoch": 5.349893541518807, + "grad_norm": 11.040092468261719, + "learning_rate": 9.465251951738823e-05, + "loss": 0.07519057989120484, + "step": 37690 + }, + { + "epoch": 5.351312987934706, + "grad_norm": 3.501375913619995, + "learning_rate": 9.465110007097231e-05, + "loss": 0.057271170616149905, + "step": 37700 + }, + { + "epoch": 5.352732434350603, + "grad_norm": 2.5016627311706543, + "learning_rate": 9.464968062455642e-05, + "loss": 0.042032480239868164, + "step": 37710 + }, + { + "epoch": 5.354151880766501, + "grad_norm": 5.547698497772217, + "learning_rate": 9.464826117814054e-05, + "loss": 0.06820942163467407, + "step": 37720 + }, + { + "epoch": 5.355571327182399, + "grad_norm": 3.99342679977417, + "learning_rate": 
9.464684173172463e-05, + "loss": 0.06758497953414917, + "step": 37730 + }, + { + "epoch": 5.356990773598296, + "grad_norm": 1.67915678024292, + "learning_rate": 9.464542228530874e-05, + "loss": 0.05731663703918457, + "step": 37740 + }, + { + "epoch": 5.3584102200141945, + "grad_norm": 6.3496174812316895, + "learning_rate": 9.464400283889283e-05, + "loss": 0.06755791902542115, + "step": 37750 + }, + { + "epoch": 5.359829666430092, + "grad_norm": 7.569107532501221, + "learning_rate": 9.464272533711853e-05, + "loss": 0.13933613300323486, + "step": 37760 + }, + { + "epoch": 5.36124911284599, + "grad_norm": 7.283881664276123, + "learning_rate": 9.464130589070262e-05, + "loss": 0.07494657635688781, + "step": 37770 + }, + { + "epoch": 5.362668559261888, + "grad_norm": 6.227843284606934, + "learning_rate": 9.463988644428673e-05, + "loss": 0.04237803816795349, + "step": 37780 + }, + { + "epoch": 5.364088005677786, + "grad_norm": 7.2157392501831055, + "learning_rate": 9.463846699787083e-05, + "loss": 0.06301524639129638, + "step": 37790 + }, + { + "epoch": 5.365507452093683, + "grad_norm": 5.091018199920654, + "learning_rate": 9.463704755145494e-05, + "loss": 0.07857232093811035, + "step": 37800 + }, + { + "epoch": 5.366926898509581, + "grad_norm": 0.757509171962738, + "learning_rate": 9.463562810503904e-05, + "loss": 0.0769877016544342, + "step": 37810 + }, + { + "epoch": 5.368346344925479, + "grad_norm": 3.653813362121582, + "learning_rate": 9.463420865862315e-05, + "loss": 0.08140221238136292, + "step": 37820 + }, + { + "epoch": 5.3697657913413765, + "grad_norm": 6.798269271850586, + "learning_rate": 9.463278921220724e-05, + "loss": 0.04424488544464111, + "step": 37830 + }, + { + "epoch": 5.371185237757275, + "grad_norm": 9.487317085266113, + "learning_rate": 9.463136976579135e-05, + "loss": 0.05726785659790039, + "step": 37840 + }, + { + "epoch": 5.372604684173172, + "grad_norm": 0.3824310898780823, + "learning_rate": 9.462995031937544e-05, + "loss": 0.03568733036518097, 
+ "step": 37850 + }, + { + "epoch": 5.374024130589071, + "grad_norm": 0.5051906108856201, + "learning_rate": 9.462853087295955e-05, + "loss": 0.05948272943496704, + "step": 37860 + }, + { + "epoch": 5.375443577004968, + "grad_norm": 1.7530025243759155, + "learning_rate": 9.462711142654365e-05, + "loss": 0.03755594789981842, + "step": 37870 + }, + { + "epoch": 5.376863023420865, + "grad_norm": 1.4626020193099976, + "learning_rate": 9.462569198012775e-05, + "loss": 0.04836176633834839, + "step": 37880 + }, + { + "epoch": 5.378282469836764, + "grad_norm": 0.16351209580898285, + "learning_rate": 9.462427253371186e-05, + "loss": 0.023823167383670806, + "step": 37890 + }, + { + "epoch": 5.379701916252661, + "grad_norm": 5.490500450134277, + "learning_rate": 9.462285308729596e-05, + "loss": 0.04158731102943421, + "step": 37900 + }, + { + "epoch": 5.3811213626685594, + "grad_norm": 1.351608157157898, + "learning_rate": 9.462143364088007e-05, + "loss": 0.0956190824508667, + "step": 37910 + }, + { + "epoch": 5.382540809084457, + "grad_norm": 0.13233773410320282, + "learning_rate": 9.462001419446417e-05, + "loss": 0.07234857082366944, + "step": 37920 + }, + { + "epoch": 5.383960255500355, + "grad_norm": 3.7168662548065186, + "learning_rate": 9.461859474804826e-05, + "loss": 0.040309539437294005, + "step": 37930 + }, + { + "epoch": 5.385379701916253, + "grad_norm": 8.766815185546875, + "learning_rate": 9.461717530163236e-05, + "loss": 0.08582027554512024, + "step": 37940 + }, + { + "epoch": 5.38679914833215, + "grad_norm": 1.3776038885116577, + "learning_rate": 9.461575585521647e-05, + "loss": 0.11603788137435914, + "step": 37950 + }, + { + "epoch": 5.388218594748048, + "grad_norm": 2.4015753269195557, + "learning_rate": 9.461433640880057e-05, + "loss": 0.060989999771118165, + "step": 37960 + }, + { + "epoch": 5.389638041163946, + "grad_norm": 0.7234444618225098, + "learning_rate": 9.461291696238468e-05, + "loss": 0.0282410591840744, + "step": 37970 + }, + { + "epoch": 
5.391057487579844, + "grad_norm": 1.9186277389526367, + "learning_rate": 9.461149751596878e-05, + "loss": 0.05424323081970215, + "step": 37980 + }, + { + "epoch": 5.3924769339957415, + "grad_norm": 2.9660613536834717, + "learning_rate": 9.461007806955287e-05, + "loss": 0.04484150111675263, + "step": 37990 + }, + { + "epoch": 5.39389638041164, + "grad_norm": 4.6889967918396, + "learning_rate": 9.460865862313699e-05, + "loss": 0.05380064845085144, + "step": 38000 + }, + { + "epoch": 5.39389638041164, + "eval_accuracy": 0.970814522795193, + "eval_loss": 0.09178540110588074, + "eval_runtime": 33.7285, + "eval_samples_per_second": 466.282, + "eval_steps_per_second": 14.587, + "step": 38000 + }, + { + "epoch": 5.395315826827537, + "grad_norm": 9.106490135192871, + "learning_rate": 9.460723917672108e-05, + "loss": 0.05757981538772583, + "step": 38010 + }, + { + "epoch": 5.396735273243435, + "grad_norm": 3.3271431922912598, + "learning_rate": 9.46058197303052e-05, + "loss": 0.049441322684288025, + "step": 38020 + }, + { + "epoch": 5.398154719659333, + "grad_norm": 1.027500033378601, + "learning_rate": 9.460440028388928e-05, + "loss": 0.07134292721748352, + "step": 38030 + }, + { + "epoch": 5.39957416607523, + "grad_norm": 8.88465404510498, + "learning_rate": 9.460298083747339e-05, + "loss": 0.02503353953361511, + "step": 38040 + }, + { + "epoch": 5.400993612491129, + "grad_norm": 6.1659626960754395, + "learning_rate": 9.460156139105749e-05, + "loss": 0.07239366173744202, + "step": 38050 + }, + { + "epoch": 5.402413058907026, + "grad_norm": 8.75742244720459, + "learning_rate": 9.46001419446416e-05, + "loss": 0.052278178930282596, + "step": 38060 + }, + { + "epoch": 5.403832505322924, + "grad_norm": 0.5453545451164246, + "learning_rate": 9.45987224982257e-05, + "loss": 0.03611307740211487, + "step": 38070 + }, + { + "epoch": 5.405251951738822, + "grad_norm": 0.5815269351005554, + "learning_rate": 9.459730305180979e-05, + "loss": 0.02320513129234314, + "step": 38080 + }, + { 
+ "epoch": 5.406671398154719, + "grad_norm": 7.123127460479736, + "learning_rate": 9.45958836053939e-05, + "loss": 0.08660604953765869, + "step": 38090 + }, + { + "epoch": 5.4080908445706175, + "grad_norm": 3.231856346130371, + "learning_rate": 9.4594464158978e-05, + "loss": 0.09372188448905945, + "step": 38100 + }, + { + "epoch": 5.409510290986515, + "grad_norm": 0.1635567545890808, + "learning_rate": 9.459304471256211e-05, + "loss": 0.039569467306137085, + "step": 38110 + }, + { + "epoch": 5.410929737402413, + "grad_norm": 5.7268829345703125, + "learning_rate": 9.459162526614621e-05, + "loss": 0.048163232207298276, + "step": 38120 + }, + { + "epoch": 5.412349183818311, + "grad_norm": 4.400755405426025, + "learning_rate": 9.45902058197303e-05, + "loss": 0.055844247341156006, + "step": 38130 + }, + { + "epoch": 5.413768630234209, + "grad_norm": 0.08212675154209137, + "learning_rate": 9.45887863733144e-05, + "loss": 0.015825629234313965, + "step": 38140 + }, + { + "epoch": 5.415188076650106, + "grad_norm": 2.162956953048706, + "learning_rate": 9.458736692689851e-05, + "loss": 0.03781618475914002, + "step": 38150 + }, + { + "epoch": 5.416607523066004, + "grad_norm": 4.835059642791748, + "learning_rate": 9.458594748048261e-05, + "loss": 0.02739281952381134, + "step": 38160 + }, + { + "epoch": 5.418026969481902, + "grad_norm": 0.8784237504005432, + "learning_rate": 9.458452803406672e-05, + "loss": 0.05759773254394531, + "step": 38170 + }, + { + "epoch": 5.4194464158977995, + "grad_norm": 4.168929576873779, + "learning_rate": 9.458310858765082e-05, + "loss": 0.040823107957839964, + "step": 38180 + }, + { + "epoch": 5.420865862313698, + "grad_norm": 2.2118477821350098, + "learning_rate": 9.458168914123492e-05, + "loss": 0.051774638891220096, + "step": 38190 + }, + { + "epoch": 5.422285308729595, + "grad_norm": 1.5984776020050049, + "learning_rate": 9.458026969481903e-05, + "loss": 0.03739486932754517, + "step": 38200 + }, + { + "epoch": 5.4237047551454936, + "grad_norm": 
1.8358681201934814, + "learning_rate": 9.457885024840313e-05, + "loss": 0.05115988254547119, + "step": 38210 + }, + { + "epoch": 5.425124201561391, + "grad_norm": 2.7211074829101562, + "learning_rate": 9.457743080198724e-05, + "loss": 0.02570842206478119, + "step": 38220 + }, + { + "epoch": 5.426543647977288, + "grad_norm": 0.14551569521427155, + "learning_rate": 9.457601135557133e-05, + "loss": 0.07109014987945557, + "step": 38230 + }, + { + "epoch": 5.427963094393187, + "grad_norm": 2.1508007049560547, + "learning_rate": 9.457459190915543e-05, + "loss": 0.06561747789382935, + "step": 38240 + }, + { + "epoch": 5.429382540809084, + "grad_norm": 1.7346519231796265, + "learning_rate": 9.457317246273953e-05, + "loss": 0.03357102572917938, + "step": 38250 + }, + { + "epoch": 5.430801987224982, + "grad_norm": 2.5490376949310303, + "learning_rate": 9.457175301632364e-05, + "loss": 0.04644973874092102, + "step": 38260 + }, + { + "epoch": 5.43222143364088, + "grad_norm": 6.230116844177246, + "learning_rate": 9.457033356990774e-05, + "loss": 0.08044976592063904, + "step": 38270 + }, + { + "epoch": 5.433640880056778, + "grad_norm": 8.601234436035156, + "learning_rate": 9.456891412349185e-05, + "loss": 0.07310090065002442, + "step": 38280 + }, + { + "epoch": 5.435060326472676, + "grad_norm": 10.968649864196777, + "learning_rate": 9.456749467707594e-05, + "loss": 0.06937228441238404, + "step": 38290 + }, + { + "epoch": 5.436479772888574, + "grad_norm": 0.40604132413864136, + "learning_rate": 9.456607523066004e-05, + "loss": 0.0455585777759552, + "step": 38300 + }, + { + "epoch": 5.437899219304471, + "grad_norm": 6.999582767486572, + "learning_rate": 9.456465578424415e-05, + "loss": 0.057856935262680056, + "step": 38310 + }, + { + "epoch": 5.439318665720369, + "grad_norm": 9.098925590515137, + "learning_rate": 9.456323633782825e-05, + "loss": 0.10670442581176758, + "step": 38320 + }, + { + "epoch": 5.440738112136267, + "grad_norm": 5.516963481903076, + "learning_rate": 
9.456181689141236e-05, + "loss": 0.07467976808547974, + "step": 38330 + }, + { + "epoch": 5.442157558552164, + "grad_norm": 0.20835815370082855, + "learning_rate": 9.456039744499645e-05, + "loss": 0.08520526289939881, + "step": 38340 + }, + { + "epoch": 5.443577004968063, + "grad_norm": 6.751248359680176, + "learning_rate": 9.455897799858056e-05, + "loss": 0.060392063856124875, + "step": 38350 + }, + { + "epoch": 5.44499645138396, + "grad_norm": 3.8347504138946533, + "learning_rate": 9.455755855216465e-05, + "loss": 0.059272587299346924, + "step": 38360 + }, + { + "epoch": 5.4464158977998585, + "grad_norm": 5.555981636047363, + "learning_rate": 9.455613910574876e-05, + "loss": 0.05174638628959656, + "step": 38370 + }, + { + "epoch": 5.447835344215756, + "grad_norm": 15.028047561645508, + "learning_rate": 9.455471965933288e-05, + "loss": 0.06913697719573975, + "step": 38380 + }, + { + "epoch": 5.449254790631653, + "grad_norm": 0.3131970167160034, + "learning_rate": 9.455330021291696e-05, + "loss": 0.06802209615707397, + "step": 38390 + }, + { + "epoch": 5.450674237047552, + "grad_norm": 5.729186534881592, + "learning_rate": 9.455188076650107e-05, + "loss": 0.04004753828048706, + "step": 38400 + }, + { + "epoch": 5.452093683463449, + "grad_norm": 0.8130164742469788, + "learning_rate": 9.455046132008517e-05, + "loss": 0.038001006841659545, + "step": 38410 + }, + { + "epoch": 5.453513129879347, + "grad_norm": 4.351025581359863, + "learning_rate": 9.454904187366928e-05, + "loss": 0.03257591426372528, + "step": 38420 + }, + { + "epoch": 5.454932576295245, + "grad_norm": 0.4877346158027649, + "learning_rate": 9.454762242725338e-05, + "loss": 0.08598875999450684, + "step": 38430 + }, + { + "epoch": 5.456352022711143, + "grad_norm": 2.889662981033325, + "learning_rate": 9.454620298083747e-05, + "loss": 0.05721787214279175, + "step": 38440 + }, + { + "epoch": 5.4577714691270405, + "grad_norm": 1.7848842144012451, + "learning_rate": 9.454478353442157e-05, + "loss": 
0.07306339144706726, + "step": 38450 + }, + { + "epoch": 5.459190915542938, + "grad_norm": 5.2257914543151855, + "learning_rate": 9.454336408800568e-05, + "loss": 0.020994843542575838, + "step": 38460 + }, + { + "epoch": 5.460610361958836, + "grad_norm": 0.30310195684432983, + "learning_rate": 9.454194464158979e-05, + "loss": 0.044001060724258426, + "step": 38470 + }, + { + "epoch": 5.462029808374734, + "grad_norm": 10.677133560180664, + "learning_rate": 9.454052519517389e-05, + "loss": 0.10154651403427124, + "step": 38480 + }, + { + "epoch": 5.463449254790632, + "grad_norm": 2.1424472332000732, + "learning_rate": 9.453910574875799e-05, + "loss": 0.014285300672054291, + "step": 38490 + }, + { + "epoch": 5.464868701206529, + "grad_norm": 4.392820835113525, + "learning_rate": 9.453768630234208e-05, + "loss": 0.07753425240516662, + "step": 38500 + }, + { + "epoch": 5.464868701206529, + "eval_accuracy": 0.9799707509378776, + "eval_loss": 0.05647183209657669, + "eval_runtime": 35.3162, + "eval_samples_per_second": 445.32, + "eval_steps_per_second": 13.931, + "step": 38500 + }, + { + "epoch": 5.466288147622428, + "grad_norm": 2.9922590255737305, + "learning_rate": 9.45362668559262e-05, + "loss": 0.07568216323852539, + "step": 38510 + }, + { + "epoch": 5.467707594038325, + "grad_norm": 0.7704054713249207, + "learning_rate": 9.453484740951029e-05, + "loss": 0.04570820927619934, + "step": 38520 + }, + { + "epoch": 5.4691270404542225, + "grad_norm": 0.6614516377449036, + "learning_rate": 9.45334279630944e-05, + "loss": 0.048297053575515746, + "step": 38530 + }, + { + "epoch": 5.470546486870121, + "grad_norm": 0.7620261907577515, + "learning_rate": 9.45320085166785e-05, + "loss": 0.04648005664348602, + "step": 38540 + }, + { + "epoch": 5.471965933286018, + "grad_norm": 0.19767679274082184, + "learning_rate": 9.45305890702626e-05, + "loss": 0.03164273500442505, + "step": 38550 + }, + { + "epoch": 5.4733853797019165, + "grad_norm": 16.225950241088867, + "learning_rate": 
9.452916962384671e-05, + "loss": 0.06350845098495483, + "step": 38560 + }, + { + "epoch": 5.474804826117814, + "grad_norm": 0.54612135887146, + "learning_rate": 9.45277501774308e-05, + "loss": 0.05994898080825806, + "step": 38570 + }, + { + "epoch": 5.476224272533712, + "grad_norm": 10.0214204788208, + "learning_rate": 9.452633073101492e-05, + "loss": 0.07067713737487794, + "step": 38580 + }, + { + "epoch": 5.47764371894961, + "grad_norm": 1.5006757974624634, + "learning_rate": 9.452491128459902e-05, + "loss": 0.04153343141078949, + "step": 38590 + }, + { + "epoch": 5.479063165365507, + "grad_norm": 9.297212600708008, + "learning_rate": 9.452349183818311e-05, + "loss": 0.049632930755615236, + "step": 38600 + }, + { + "epoch": 5.480482611781405, + "grad_norm": 0.7019015550613403, + "learning_rate": 9.452207239176721e-05, + "loss": 0.05248759984970093, + "step": 38610 + }, + { + "epoch": 5.481902058197303, + "grad_norm": 0.5330097675323486, + "learning_rate": 9.452065294535132e-05, + "loss": 0.06436276435852051, + "step": 38620 + }, + { + "epoch": 5.483321504613201, + "grad_norm": 10.498361587524414, + "learning_rate": 9.451923349893542e-05, + "loss": 0.06726992130279541, + "step": 38630 + }, + { + "epoch": 5.4847409510290985, + "grad_norm": 1.389711618423462, + "learning_rate": 9.451781405251953e-05, + "loss": 0.04922493100166321, + "step": 38640 + }, + { + "epoch": 5.486160397444997, + "grad_norm": 6.544168472290039, + "learning_rate": 9.451639460610363e-05, + "loss": 0.043633729219436646, + "step": 38650 + }, + { + "epoch": 5.487579843860894, + "grad_norm": 2.614717960357666, + "learning_rate": 9.451497515968772e-05, + "loss": 0.05251113176345825, + "step": 38660 + }, + { + "epoch": 5.488999290276792, + "grad_norm": 3.5543386936187744, + "learning_rate": 9.451355571327183e-05, + "loss": 0.12290339469909668, + "step": 38670 + }, + { + "epoch": 5.49041873669269, + "grad_norm": 3.551682472229004, + "learning_rate": 9.451213626685593e-05, + "loss": 
0.020476463437080383, + "step": 38680 + }, + { + "epoch": 5.491838183108587, + "grad_norm": 2.683623790740967, + "learning_rate": 9.451071682044004e-05, + "loss": 0.09093580842018127, + "step": 38690 + }, + { + "epoch": 5.493257629524486, + "grad_norm": 0.2011883705854416, + "learning_rate": 9.450929737402413e-05, + "loss": 0.05041297674179077, + "step": 38700 + }, + { + "epoch": 5.494677075940383, + "grad_norm": 6.980527877807617, + "learning_rate": 9.450787792760824e-05, + "loss": 0.06176745891571045, + "step": 38710 + }, + { + "epoch": 5.496096522356281, + "grad_norm": 1.4676660299301147, + "learning_rate": 9.450645848119234e-05, + "loss": 0.07806309461593627, + "step": 38720 + }, + { + "epoch": 5.497515968772179, + "grad_norm": 0.9213213324546814, + "learning_rate": 9.450503903477645e-05, + "loss": 0.08935214877128601, + "step": 38730 + }, + { + "epoch": 5.498935415188076, + "grad_norm": 0.4523489773273468, + "learning_rate": 9.450361958836054e-05, + "loss": 0.03591077327728272, + "step": 38740 + }, + { + "epoch": 5.500354861603975, + "grad_norm": 10.078042984008789, + "learning_rate": 9.450220014194464e-05, + "loss": 0.06557263135910034, + "step": 38750 + }, + { + "epoch": 5.501774308019872, + "grad_norm": 1.226947546005249, + "learning_rate": 9.450078069552875e-05, + "loss": 0.04932633936405182, + "step": 38760 + }, + { + "epoch": 5.50319375443577, + "grad_norm": 11.178776741027832, + "learning_rate": 9.449936124911285e-05, + "loss": 0.05480325222015381, + "step": 38770 + }, + { + "epoch": 5.504613200851668, + "grad_norm": 0.29021137952804565, + "learning_rate": 9.449794180269696e-05, + "loss": 0.02817882001399994, + "step": 38780 + }, + { + "epoch": 5.506032647267566, + "grad_norm": 0.24028928577899933, + "learning_rate": 9.449652235628106e-05, + "loss": 0.053332853317260745, + "step": 38790 + }, + { + "epoch": 5.5074520936834634, + "grad_norm": 3.2386868000030518, + "learning_rate": 9.449510290986515e-05, + "loss": 0.054778027534484866, + "step": 38800 + }, 
+ { + "epoch": 5.508871540099361, + "grad_norm": 8.147454261779785, + "learning_rate": 9.449368346344925e-05, + "loss": 0.07028309106826783, + "step": 38810 + }, + { + "epoch": 5.510290986515259, + "grad_norm": 0.2091905176639557, + "learning_rate": 9.449226401703336e-05, + "loss": 0.0604537308216095, + "step": 38820 + }, + { + "epoch": 5.511710432931157, + "grad_norm": 2.969684600830078, + "learning_rate": 9.449084457061746e-05, + "loss": 0.04221682250499725, + "step": 38830 + }, + { + "epoch": 5.513129879347055, + "grad_norm": 0.9687553644180298, + "learning_rate": 9.448942512420157e-05, + "loss": 0.024466480314731597, + "step": 38840 + }, + { + "epoch": 5.514549325762952, + "grad_norm": 2.4636096954345703, + "learning_rate": 9.448800567778567e-05, + "loss": 0.10217208862304687, + "step": 38850 + }, + { + "epoch": 5.515968772178851, + "grad_norm": 0.06745173037052155, + "learning_rate": 9.448658623136977e-05, + "loss": 0.05621076226234436, + "step": 38860 + }, + { + "epoch": 5.517388218594748, + "grad_norm": 5.0668535232543945, + "learning_rate": 9.448516678495388e-05, + "loss": 0.07827832102775574, + "step": 38870 + }, + { + "epoch": 5.518807665010646, + "grad_norm": 0.08209887892007828, + "learning_rate": 9.448374733853797e-05, + "loss": 0.055400484800338747, + "step": 38880 + }, + { + "epoch": 5.520227111426544, + "grad_norm": 13.148024559020996, + "learning_rate": 9.448232789212209e-05, + "loss": 0.05364589095115661, + "step": 38890 + }, + { + "epoch": 5.521646557842441, + "grad_norm": 2.172600507736206, + "learning_rate": 9.448090844570618e-05, + "loss": 0.0865916907787323, + "step": 38900 + }, + { + "epoch": 5.5230660042583395, + "grad_norm": 3.1451680660247803, + "learning_rate": 9.447948899929028e-05, + "loss": 0.05537939667701721, + "step": 38910 + }, + { + "epoch": 5.524485450674237, + "grad_norm": 6.0753960609436035, + "learning_rate": 9.447806955287438e-05, + "loss": 0.06018974781036377, + "step": 38920 + }, + { + "epoch": 5.525904897090135, + 
"grad_norm": 10.356642723083496, + "learning_rate": 9.447665010645849e-05, + "loss": 0.07976688146591186, + "step": 38930 + }, + { + "epoch": 5.527324343506033, + "grad_norm": 1.3878400325775146, + "learning_rate": 9.447523066004259e-05, + "loss": 0.03053358793258667, + "step": 38940 + }, + { + "epoch": 5.528743789921931, + "grad_norm": 1.5194265842437744, + "learning_rate": 9.44738112136267e-05, + "loss": 0.054735350608825686, + "step": 38950 + }, + { + "epoch": 5.530163236337828, + "grad_norm": 5.2882304191589355, + "learning_rate": 9.44723917672108e-05, + "loss": 0.04336060583591461, + "step": 38960 + }, + { + "epoch": 5.531582682753726, + "grad_norm": 1.1984394788742065, + "learning_rate": 9.447097232079489e-05, + "loss": 0.06638429760932922, + "step": 38970 + }, + { + "epoch": 5.533002129169624, + "grad_norm": 5.800374984741211, + "learning_rate": 9.4469552874379e-05, + "loss": 0.0930544376373291, + "step": 38980 + }, + { + "epoch": 5.5344215755855215, + "grad_norm": 2.947801113128662, + "learning_rate": 9.44681334279631e-05, + "loss": 0.047572529315948485, + "step": 38990 + }, + { + "epoch": 5.53584102200142, + "grad_norm": 0.7058837413787842, + "learning_rate": 9.446671398154721e-05, + "loss": 0.0859229564666748, + "step": 39000 + }, + { + "epoch": 5.53584102200142, + "eval_accuracy": 0.9780632034081516, + "eval_loss": 0.06798505038022995, + "eval_runtime": 34.2002, + "eval_samples_per_second": 459.852, + "eval_steps_per_second": 14.386, + "step": 39000 + }, + { + "epoch": 5.537260468417317, + "grad_norm": 0.11149133741855621, + "learning_rate": 9.44652945351313e-05, + "loss": 0.027533328533172606, + "step": 39010 + }, + { + "epoch": 5.5386799148332155, + "grad_norm": 4.605347156524658, + "learning_rate": 9.44638750887154e-05, + "loss": 0.029569646716117857, + "step": 39020 + }, + { + "epoch": 5.540099361249113, + "grad_norm": 1.489418864250183, + "learning_rate": 9.44624556422995e-05, + "loss": 0.05133401155471802, + "step": 39030 + }, + { + "epoch": 
5.54151880766501, + "grad_norm": 2.5126025676727295, + "learning_rate": 9.446103619588361e-05, + "loss": 0.03453618586063385, + "step": 39040 + }, + { + "epoch": 5.542938254080909, + "grad_norm": 2.8823859691619873, + "learning_rate": 9.445961674946771e-05, + "loss": 0.03072114586830139, + "step": 39050 + }, + { + "epoch": 5.544357700496806, + "grad_norm": 2.666760206222534, + "learning_rate": 9.445819730305181e-05, + "loss": 0.11776949167251587, + "step": 39060 + }, + { + "epoch": 5.545777146912704, + "grad_norm": 0.38773733377456665, + "learning_rate": 9.445677785663592e-05, + "loss": 0.020077161490917206, + "step": 39070 + }, + { + "epoch": 5.547196593328602, + "grad_norm": 4.955237865447998, + "learning_rate": 9.445535841022002e-05, + "loss": 0.05261182188987732, + "step": 39080 + }, + { + "epoch": 5.5486160397445, + "grad_norm": 2.406719207763672, + "learning_rate": 9.445393896380413e-05, + "loss": 0.04864847362041473, + "step": 39090 + }, + { + "epoch": 5.5500354861603975, + "grad_norm": 7.484744548797607, + "learning_rate": 9.445251951738823e-05, + "loss": 0.09151811003684998, + "step": 39100 + }, + { + "epoch": 5.551454932576295, + "grad_norm": 5.400770664215088, + "learning_rate": 9.445110007097232e-05, + "loss": 0.06537050008773804, + "step": 39110 + }, + { + "epoch": 5.552874378992193, + "grad_norm": 4.668606281280518, + "learning_rate": 9.444968062455642e-05, + "loss": 0.07697397470474243, + "step": 39120 + }, + { + "epoch": 5.554293825408091, + "grad_norm": 4.4636006355285645, + "learning_rate": 9.444826117814053e-05, + "loss": 0.05506072640419006, + "step": 39130 + }, + { + "epoch": 5.555713271823989, + "grad_norm": 13.243671417236328, + "learning_rate": 9.444684173172463e-05, + "loss": 0.061540770530700686, + "step": 39140 + }, + { + "epoch": 5.557132718239886, + "grad_norm": 3.05822491645813, + "learning_rate": 9.444542228530874e-05, + "loss": 0.09180397987365722, + "step": 39150 + }, + { + "epoch": 5.558552164655785, + "grad_norm": 
2.3664355278015137, + "learning_rate": 9.444400283889284e-05, + "loss": 0.026255601644515993, + "step": 39160 + }, + { + "epoch": 5.559971611071682, + "grad_norm": 2.924701452255249, + "learning_rate": 9.444258339247693e-05, + "loss": 0.0476565808057785, + "step": 39170 + }, + { + "epoch": 5.56139105748758, + "grad_norm": 4.52599573135376, + "learning_rate": 9.444116394606104e-05, + "loss": 0.05514841079711914, + "step": 39180 + }, + { + "epoch": 5.562810503903478, + "grad_norm": 9.638506889343262, + "learning_rate": 9.443974449964514e-05, + "loss": 0.04046670794486999, + "step": 39190 + }, + { + "epoch": 5.564229950319375, + "grad_norm": 4.365987777709961, + "learning_rate": 9.443832505322925e-05, + "loss": 0.07949233055114746, + "step": 39200 + }, + { + "epoch": 5.565649396735274, + "grad_norm": 4.490222454071045, + "learning_rate": 9.443690560681335e-05, + "loss": 0.1337314486503601, + "step": 39210 + }, + { + "epoch": 5.567068843151171, + "grad_norm": 4.05878210067749, + "learning_rate": 9.443548616039745e-05, + "loss": 0.04588429927825928, + "step": 39220 + }, + { + "epoch": 5.568488289567069, + "grad_norm": 0.48254233598709106, + "learning_rate": 9.443406671398155e-05, + "loss": 0.028879329562187195, + "step": 39230 + }, + { + "epoch": 5.569907735982967, + "grad_norm": 2.425044298171997, + "learning_rate": 9.443264726756566e-05, + "loss": 0.07074435949325561, + "step": 39240 + }, + { + "epoch": 5.571327182398864, + "grad_norm": 4.822774410247803, + "learning_rate": 9.443122782114975e-05, + "loss": 0.049528279900550844, + "step": 39250 + }, + { + "epoch": 5.5727466288147625, + "grad_norm": 6.030632495880127, + "learning_rate": 9.442980837473386e-05, + "loss": 0.07618424892425538, + "step": 39260 + }, + { + "epoch": 5.57416607523066, + "grad_norm": 2.610471725463867, + "learning_rate": 9.442838892831796e-05, + "loss": 0.037158846855163574, + "step": 39270 + }, + { + "epoch": 5.575585521646558, + "grad_norm": 1.8794517517089844, + "learning_rate": 
9.442696948190206e-05, + "loss": 0.04584873616695404, + "step": 39280 + }, + { + "epoch": 5.577004968062456, + "grad_norm": 6.433529376983643, + "learning_rate": 9.442555003548617e-05, + "loss": 0.0613658607006073, + "step": 39290 + }, + { + "epoch": 5.578424414478354, + "grad_norm": 3.764742374420166, + "learning_rate": 9.442413058907027e-05, + "loss": 0.04258022904396057, + "step": 39300 + }, + { + "epoch": 5.579843860894251, + "grad_norm": 7.699685573577881, + "learning_rate": 9.442271114265438e-05, + "loss": 0.05763416886329651, + "step": 39310 + }, + { + "epoch": 5.581263307310149, + "grad_norm": 1.2105742692947388, + "learning_rate": 9.442129169623846e-05, + "loss": 0.029989880323410035, + "step": 39320 + }, + { + "epoch": 5.582682753726047, + "grad_norm": 3.355437994003296, + "learning_rate": 9.441987224982257e-05, + "loss": 0.08556990623474121, + "step": 39330 + }, + { + "epoch": 5.5841022001419445, + "grad_norm": 0.5614784955978394, + "learning_rate": 9.441845280340667e-05, + "loss": 0.04003655612468719, + "step": 39340 + }, + { + "epoch": 5.585521646557843, + "grad_norm": 1.7917828559875488, + "learning_rate": 9.441703335699078e-05, + "loss": 0.03968890905380249, + "step": 39350 + }, + { + "epoch": 5.58694109297374, + "grad_norm": 0.16788877546787262, + "learning_rate": 9.441561391057488e-05, + "loss": 0.03502267599105835, + "step": 39360 + }, + { + "epoch": 5.5883605393896385, + "grad_norm": 0.6021848917007446, + "learning_rate": 9.441419446415898e-05, + "loss": 0.06506335139274597, + "step": 39370 + }, + { + "epoch": 5.589779985805536, + "grad_norm": 0.4147074222564697, + "learning_rate": 9.441277501774309e-05, + "loss": 0.04189459681510925, + "step": 39380 + }, + { + "epoch": 5.591199432221433, + "grad_norm": 8.627924919128418, + "learning_rate": 9.441135557132718e-05, + "loss": 0.044555434584617616, + "step": 39390 + }, + { + "epoch": 5.592618878637332, + "grad_norm": 2.4682135581970215, + "learning_rate": 9.44099361249113e-05, + "loss": 
0.13713971376419068, + "step": 39400 + }, + { + "epoch": 5.594038325053229, + "grad_norm": 2.6810388565063477, + "learning_rate": 9.440851667849539e-05, + "loss": 0.020219671726226806, + "step": 39410 + }, + { + "epoch": 5.595457771469127, + "grad_norm": 9.803950309753418, + "learning_rate": 9.440709723207949e-05, + "loss": 0.07011809349060058, + "step": 39420 + }, + { + "epoch": 5.596877217885025, + "grad_norm": 0.29333794116973877, + "learning_rate": 9.440567778566359e-05, + "loss": 0.020935848355293274, + "step": 39430 + }, + { + "epoch": 5.598296664300923, + "grad_norm": 1.5691699981689453, + "learning_rate": 9.44042583392477e-05, + "loss": 0.04376091659069061, + "step": 39440 + }, + { + "epoch": 5.5997161107168205, + "grad_norm": 0.5065397024154663, + "learning_rate": 9.44028388928318e-05, + "loss": 0.04781602025032043, + "step": 39450 + }, + { + "epoch": 5.601135557132718, + "grad_norm": 4.79033899307251, + "learning_rate": 9.440141944641591e-05, + "loss": 0.10913141965866088, + "step": 39460 + }, + { + "epoch": 5.602555003548616, + "grad_norm": 3.315653085708618, + "learning_rate": 9.44e-05, + "loss": 0.0696462333202362, + "step": 39470 + }, + { + "epoch": 5.603974449964514, + "grad_norm": 0.12675778567790985, + "learning_rate": 9.43985805535841e-05, + "loss": 0.017715385556221007, + "step": 39480 + }, + { + "epoch": 5.605393896380412, + "grad_norm": 7.688170433044434, + "learning_rate": 9.439716110716821e-05, + "loss": 0.061501210927963255, + "step": 39490 + }, + { + "epoch": 5.606813342796309, + "grad_norm": 7.34026575088501, + "learning_rate": 9.439574166075231e-05, + "loss": 0.057365798950195314, + "step": 39500 + }, + { + "epoch": 5.606813342796309, + "eval_accuracy": 0.9736758440897819, + "eval_loss": 0.07783501595258713, + "eval_runtime": 34.6522, + "eval_samples_per_second": 453.853, + "eval_steps_per_second": 14.198, + "step": 39500 + }, + { + "epoch": 5.608232789212208, + "grad_norm": 6.776620864868164, + "learning_rate": 9.439432221433642e-05, + 
"loss": 0.08461334109306336, + "step": 39510 + }, + { + "epoch": 5.609652235628105, + "grad_norm": 7.816592693328857, + "learning_rate": 9.43929027679205e-05, + "loss": 0.05237630009651184, + "step": 39520 + }, + { + "epoch": 5.6110716820440025, + "grad_norm": 2.657180070877075, + "learning_rate": 9.439148332150462e-05, + "loss": 0.04638761878013611, + "step": 39530 + }, + { + "epoch": 5.612491128459901, + "grad_norm": 3.7408230304718018, + "learning_rate": 9.439006387508871e-05, + "loss": 0.05367375612258911, + "step": 39540 + }, + { + "epoch": 5.613910574875798, + "grad_norm": 2.930955648422241, + "learning_rate": 9.438864442867282e-05, + "loss": 0.06972458958625793, + "step": 39550 + }, + { + "epoch": 5.615330021291697, + "grad_norm": 16.335681915283203, + "learning_rate": 9.438722498225692e-05, + "loss": 0.11342012882232666, + "step": 39560 + }, + { + "epoch": 5.616749467707594, + "grad_norm": 0.9312990307807922, + "learning_rate": 9.438580553584103e-05, + "loss": 0.09171445965766907, + "step": 39570 + }, + { + "epoch": 5.618168914123492, + "grad_norm": 0.5279941558837891, + "learning_rate": 9.438438608942513e-05, + "loss": 0.0824202299118042, + "step": 39580 + }, + { + "epoch": 5.61958836053939, + "grad_norm": 5.812867164611816, + "learning_rate": 9.438296664300923e-05, + "loss": 0.03917689919471741, + "step": 39590 + }, + { + "epoch": 5.621007806955287, + "grad_norm": 4.642354965209961, + "learning_rate": 9.438154719659334e-05, + "loss": 0.05176680088043213, + "step": 39600 + }, + { + "epoch": 5.622427253371185, + "grad_norm": 4.812618255615234, + "learning_rate": 9.438012775017744e-05, + "loss": 0.05726593136787415, + "step": 39610 + }, + { + "epoch": 5.623846699787083, + "grad_norm": 6.474045276641846, + "learning_rate": 9.437870830376155e-05, + "loss": 0.06338353157043457, + "step": 39620 + }, + { + "epoch": 5.625266146202981, + "grad_norm": 3.1899099349975586, + "learning_rate": 9.437728885734563e-05, + "loss": 0.03615102469921112, + "step": 39630 + }, + 
{ + "epoch": 5.626685592618879, + "grad_norm": 1.5344411134719849, + "learning_rate": 9.437586941092974e-05, + "loss": 0.052175390720367434, + "step": 39640 + }, + { + "epoch": 5.628105039034777, + "grad_norm": 4.2242255210876465, + "learning_rate": 9.437444996451384e-05, + "loss": 0.02579593062400818, + "step": 39650 + }, + { + "epoch": 5.629524485450674, + "grad_norm": 2.9231820106506348, + "learning_rate": 9.437303051809795e-05, + "loss": 0.058068424463272095, + "step": 39660 + }, + { + "epoch": 5.630943931866572, + "grad_norm": 2.0201687812805176, + "learning_rate": 9.437161107168206e-05, + "loss": 0.038857880234718326, + "step": 39670 + }, + { + "epoch": 5.63236337828247, + "grad_norm": 7.435208797454834, + "learning_rate": 9.437019162526614e-05, + "loss": 0.05277242660522461, + "step": 39680 + }, + { + "epoch": 5.633782824698367, + "grad_norm": 0.2021070271730423, + "learning_rate": 9.436877217885025e-05, + "loss": 0.07399315237998963, + "step": 39690 + }, + { + "epoch": 5.635202271114266, + "grad_norm": 2.765399932861328, + "learning_rate": 9.436735273243435e-05, + "loss": 0.030564799904823303, + "step": 39700 + }, + { + "epoch": 5.636621717530163, + "grad_norm": 6.345180511474609, + "learning_rate": 9.436593328601846e-05, + "loss": 0.03482165336608887, + "step": 39710 + }, + { + "epoch": 5.6380411639460615, + "grad_norm": 12.207220077514648, + "learning_rate": 9.436451383960256e-05, + "loss": 0.07414867281913758, + "step": 39720 + }, + { + "epoch": 5.639460610361959, + "grad_norm": 1.6320226192474365, + "learning_rate": 9.436309439318666e-05, + "loss": 0.03807423412799835, + "step": 39730 + }, + { + "epoch": 5.640880056777856, + "grad_norm": 10.701844215393066, + "learning_rate": 9.436167494677076e-05, + "loss": 0.1101304531097412, + "step": 39740 + }, + { + "epoch": 5.642299503193755, + "grad_norm": 8.42212963104248, + "learning_rate": 9.436025550035487e-05, + "loss": 0.06819977760314941, + "step": 39750 + }, + { + "epoch": 5.643718949609652, + 
"grad_norm": 3.634274959564209, + "learning_rate": 9.435883605393898e-05, + "loss": 0.032803896069526675, + "step": 39760 + }, + { + "epoch": 5.64513839602555, + "grad_norm": 6.771927356719971, + "learning_rate": 9.435741660752307e-05, + "loss": 0.07954181432723999, + "step": 39770 + }, + { + "epoch": 5.646557842441448, + "grad_norm": 2.6635067462921143, + "learning_rate": 9.435599716110717e-05, + "loss": 0.04880297780036926, + "step": 39780 + }, + { + "epoch": 5.647977288857346, + "grad_norm": 5.2400922775268555, + "learning_rate": 9.435457771469127e-05, + "loss": 0.09463647603988648, + "step": 39790 + }, + { + "epoch": 5.6493967352732435, + "grad_norm": 0.019930781796574593, + "learning_rate": 9.435315826827538e-05, + "loss": 0.03061905801296234, + "step": 39800 + }, + { + "epoch": 5.650816181689141, + "grad_norm": 0.7077277302742004, + "learning_rate": 9.435173882185948e-05, + "loss": 0.060614013671875, + "step": 39810 + }, + { + "epoch": 5.652235628105039, + "grad_norm": 6.0713348388671875, + "learning_rate": 9.435031937544359e-05, + "loss": 0.025630703568458556, + "step": 39820 + }, + { + "epoch": 5.653655074520937, + "grad_norm": 4.086087226867676, + "learning_rate": 9.434889992902767e-05, + "loss": 0.018952296674251558, + "step": 39830 + }, + { + "epoch": 5.655074520936835, + "grad_norm": 1.135500431060791, + "learning_rate": 9.434748048261178e-05, + "loss": 0.03191319704055786, + "step": 39840 + }, + { + "epoch": 5.656493967352732, + "grad_norm": 0.2785182595252991, + "learning_rate": 9.43460610361959e-05, + "loss": 0.02202431410551071, + "step": 39850 + }, + { + "epoch": 5.657913413768631, + "grad_norm": 0.11208788305521011, + "learning_rate": 9.434464158977999e-05, + "loss": 0.03365016877651215, + "step": 39860 + }, + { + "epoch": 5.659332860184528, + "grad_norm": 0.42555415630340576, + "learning_rate": 9.43432221433641e-05, + "loss": 0.04779731035232544, + "step": 39870 + }, + { + "epoch": 5.6607523066004255, + "grad_norm": 0.16418029367923737, + 
"learning_rate": 9.434180269694819e-05, + "loss": 0.03143316805362702, + "step": 39880 + }, + { + "epoch": 5.662171753016324, + "grad_norm": 8.50043773651123, + "learning_rate": 9.43403832505323e-05, + "loss": 0.09453284740447998, + "step": 39890 + }, + { + "epoch": 5.663591199432221, + "grad_norm": 11.144037246704102, + "learning_rate": 9.43389638041164e-05, + "loss": 0.015583939850330353, + "step": 39900 + }, + { + "epoch": 5.6650106458481195, + "grad_norm": 3.8421754837036133, + "learning_rate": 9.43375443577005e-05, + "loss": 0.04787985980510712, + "step": 39910 + }, + { + "epoch": 5.666430092264017, + "grad_norm": 2.839329481124878, + "learning_rate": 9.43361249112846e-05, + "loss": 0.05071015357971191, + "step": 39920 + }, + { + "epoch": 5.667849538679915, + "grad_norm": 1.3259540796279907, + "learning_rate": 9.433470546486871e-05, + "loss": 0.06693935990333558, + "step": 39930 + }, + { + "epoch": 5.669268985095813, + "grad_norm": 1.9949698448181152, + "learning_rate": 9.43332860184528e-05, + "loss": 0.09938514232635498, + "step": 39940 + }, + { + "epoch": 5.67068843151171, + "grad_norm": 5.349208831787109, + "learning_rate": 9.433186657203691e-05, + "loss": 0.07142342925071717, + "step": 39950 + }, + { + "epoch": 5.672107877927608, + "grad_norm": 6.508642673492432, + "learning_rate": 9.433044712562102e-05, + "loss": 0.07783754467964173, + "step": 39960 + }, + { + "epoch": 5.673527324343506, + "grad_norm": 6.658568859100342, + "learning_rate": 9.432902767920512e-05, + "loss": 0.08317623138427735, + "step": 39970 + }, + { + "epoch": 5.674946770759404, + "grad_norm": 0.1687992364168167, + "learning_rate": 9.432760823278923e-05, + "loss": 0.0768187940120697, + "step": 39980 + }, + { + "epoch": 5.6763662171753015, + "grad_norm": 2.4580390453338623, + "learning_rate": 9.432618878637331e-05, + "loss": 0.08780956864356995, + "step": 39990 + }, + { + "epoch": 5.6777856635912, + "grad_norm": 0.24730490148067474, + "learning_rate": 9.432476933995742e-05, + "loss": 
0.06389739513397216, + "step": 40000 + }, + { + "epoch": 5.6777856635912, + "eval_accuracy": 0.9646467857824124, + "eval_loss": 0.1065235510468483, + "eval_runtime": 35.9304, + "eval_samples_per_second": 437.708, + "eval_steps_per_second": 13.693, + "step": 40000 + }, + { + "epoch": 5.679205110007097, + "grad_norm": 1.2596027851104736, + "learning_rate": 9.432334989354152e-05, + "loss": 0.025333791971206665, + "step": 40010 + }, + { + "epoch": 5.680624556422995, + "grad_norm": 4.28132963180542, + "learning_rate": 9.432193044712563e-05, + "loss": 0.07308688759803772, + "step": 40020 + }, + { + "epoch": 5.682044002838893, + "grad_norm": 2.235490322113037, + "learning_rate": 9.432051100070973e-05, + "loss": 0.04388102889060974, + "step": 40030 + }, + { + "epoch": 5.68346344925479, + "grad_norm": 0.5646092891693115, + "learning_rate": 9.431909155429383e-05, + "loss": 0.07124125361442565, + "step": 40040 + }, + { + "epoch": 5.684882895670689, + "grad_norm": 0.07894527912139893, + "learning_rate": 9.431767210787794e-05, + "loss": 0.06131964325904846, + "step": 40050 + }, + { + "epoch": 5.686302342086586, + "grad_norm": 1.2791597843170166, + "learning_rate": 9.431625266146203e-05, + "loss": 0.05140694975852966, + "step": 40060 + }, + { + "epoch": 5.687721788502484, + "grad_norm": 0.9377229809761047, + "learning_rate": 9.431483321504614e-05, + "loss": 0.07500240802764893, + "step": 40070 + }, + { + "epoch": 5.689141234918382, + "grad_norm": 5.653379440307617, + "learning_rate": 9.431341376863024e-05, + "loss": 0.05739631056785584, + "step": 40080 + }, + { + "epoch": 5.690560681334279, + "grad_norm": 5.261422634124756, + "learning_rate": 9.431199432221434e-05, + "loss": 0.06874457597732545, + "step": 40090 + }, + { + "epoch": 5.691980127750178, + "grad_norm": 9.214577674865723, + "learning_rate": 9.431057487579844e-05, + "loss": 0.044308590888977054, + "step": 40100 + }, + { + "epoch": 5.693399574166075, + "grad_norm": 1.361165165901184, + "learning_rate": 
9.430915542938255e-05, + "loss": 0.022246035933494567, + "step": 40110 + }, + { + "epoch": 5.694819020581973, + "grad_norm": 8.207006454467773, + "learning_rate": 9.430773598296665e-05, + "loss": 0.047158649563789366, + "step": 40120 + }, + { + "epoch": 5.696238466997871, + "grad_norm": 4.108800411224365, + "learning_rate": 9.430631653655076e-05, + "loss": 0.07500581741333008, + "step": 40130 + }, + { + "epoch": 5.697657913413769, + "grad_norm": 8.759567260742188, + "learning_rate": 9.430489709013485e-05, + "loss": 0.038922271132469176, + "step": 40140 + }, + { + "epoch": 5.6990773598296665, + "grad_norm": 4.76708459854126, + "learning_rate": 9.430347764371895e-05, + "loss": 0.030303937196731568, + "step": 40150 + }, + { + "epoch": 5.700496806245564, + "grad_norm": 0.7679837942123413, + "learning_rate": 9.430205819730306e-05, + "loss": 0.04279916882514954, + "step": 40160 + }, + { + "epoch": 5.701916252661462, + "grad_norm": 1.9951245784759521, + "learning_rate": 9.430063875088716e-05, + "loss": 0.04763171076774597, + "step": 40170 + }, + { + "epoch": 5.70333569907736, + "grad_norm": 1.4225386381149292, + "learning_rate": 9.429921930447127e-05, + "loss": 0.059112942218780516, + "step": 40180 + }, + { + "epoch": 5.704755145493258, + "grad_norm": 5.624825477600098, + "learning_rate": 9.429779985805535e-05, + "loss": 0.08224546909332275, + "step": 40190 + }, + { + "epoch": 5.706174591909155, + "grad_norm": 4.557068824768066, + "learning_rate": 9.429638041163946e-05, + "loss": 0.07699850797653199, + "step": 40200 + }, + { + "epoch": 5.707594038325054, + "grad_norm": 2.6185569763183594, + "learning_rate": 9.429496096522356e-05, + "loss": 0.09195753931999207, + "step": 40210 + }, + { + "epoch": 5.709013484740951, + "grad_norm": 9.599045753479004, + "learning_rate": 9.429354151880767e-05, + "loss": 0.07318518757820129, + "step": 40220 + }, + { + "epoch": 5.7104329311568485, + "grad_norm": 8.182303428649902, + "learning_rate": 9.429212207239177e-05, + "loss": 
0.06409326791763306, + "step": 40230 + }, + { + "epoch": 5.711852377572747, + "grad_norm": 1.4199198484420776, + "learning_rate": 9.429070262597587e-05, + "loss": 0.05758379697799683, + "step": 40240 + }, + { + "epoch": 5.713271823988644, + "grad_norm": 0.9231958389282227, + "learning_rate": 9.428928317955998e-05, + "loss": 0.04129588007926941, + "step": 40250 + }, + { + "epoch": 5.7146912704045425, + "grad_norm": 4.508418560028076, + "learning_rate": 9.428786373314408e-05, + "loss": 0.06449969410896302, + "step": 40260 + }, + { + "epoch": 5.71611071682044, + "grad_norm": 7.092264652252197, + "learning_rate": 9.428644428672819e-05, + "loss": 0.045121192932128906, + "step": 40270 + }, + { + "epoch": 5.717530163236338, + "grad_norm": 0.6932799816131592, + "learning_rate": 9.428502484031228e-05, + "loss": 0.03739106059074402, + "step": 40280 + }, + { + "epoch": 5.718949609652236, + "grad_norm": 0.5188174247741699, + "learning_rate": 9.42836053938964e-05, + "loss": 0.042255252599716187, + "step": 40290 + }, + { + "epoch": 5.720369056068133, + "grad_norm": 1.1262272596359253, + "learning_rate": 9.428218594748048e-05, + "loss": 0.03085188865661621, + "step": 40300 + }, + { + "epoch": 5.721788502484031, + "grad_norm": 1.6696752309799194, + "learning_rate": 9.428076650106459e-05, + "loss": 0.02104969471693039, + "step": 40310 + }, + { + "epoch": 5.723207948899929, + "grad_norm": 1.4816834926605225, + "learning_rate": 9.427934705464869e-05, + "loss": 0.03856719136238098, + "step": 40320 + }, + { + "epoch": 5.724627395315827, + "grad_norm": 2.560551643371582, + "learning_rate": 9.42779276082328e-05, + "loss": 0.06555436849594116, + "step": 40330 + }, + { + "epoch": 5.7260468417317245, + "grad_norm": 6.567645072937012, + "learning_rate": 9.42765081618169e-05, + "loss": 0.05767791271209717, + "step": 40340 + }, + { + "epoch": 5.727466288147623, + "grad_norm": 7.098581790924072, + "learning_rate": 9.4275088715401e-05, + "loss": 0.08214937448501587, + "step": 40350 + }, + { + 
"epoch": 5.72888573456352, + "grad_norm": 0.1502898931503296, + "learning_rate": 9.42736692689851e-05, + "loss": 0.07196863293647766, + "step": 40360 + }, + { + "epoch": 5.730305180979418, + "grad_norm": 7.313935279846191, + "learning_rate": 9.42722498225692e-05, + "loss": 0.04690050184726715, + "step": 40370 + }, + { + "epoch": 5.731724627395316, + "grad_norm": 4.765439510345459, + "learning_rate": 9.427083037615331e-05, + "loss": 0.11581621170043946, + "step": 40380 + }, + { + "epoch": 5.733144073811213, + "grad_norm": 0.062072690576314926, + "learning_rate": 9.426941092973741e-05, + "loss": 0.04716021716594696, + "step": 40390 + }, + { + "epoch": 5.734563520227112, + "grad_norm": 6.344238758087158, + "learning_rate": 9.426799148332151e-05, + "loss": 0.07715264558792115, + "step": 40400 + }, + { + "epoch": 5.735982966643009, + "grad_norm": 4.1125593185424805, + "learning_rate": 9.42665720369056e-05, + "loss": 0.1007968544960022, + "step": 40410 + }, + { + "epoch": 5.737402413058907, + "grad_norm": 0.2826620936393738, + "learning_rate": 9.426515259048972e-05, + "loss": 0.05582694411277771, + "step": 40420 + }, + { + "epoch": 5.738821859474805, + "grad_norm": 8.204134941101074, + "learning_rate": 9.426373314407381e-05, + "loss": 0.03588265776634216, + "step": 40430 + }, + { + "epoch": 5.740241305890702, + "grad_norm": 3.2922136783599854, + "learning_rate": 9.426231369765792e-05, + "loss": 0.06283365488052368, + "step": 40440 + }, + { + "epoch": 5.741660752306601, + "grad_norm": 5.972446441650391, + "learning_rate": 9.426089425124202e-05, + "loss": 0.059499341249465945, + "step": 40450 + }, + { + "epoch": 5.743080198722498, + "grad_norm": 3.592957019805908, + "learning_rate": 9.425947480482612e-05, + "loss": 0.07895704507827758, + "step": 40460 + }, + { + "epoch": 5.744499645138396, + "grad_norm": 2.2135820388793945, + "learning_rate": 9.425805535841023e-05, + "loss": 0.045309117436408995, + "step": 40470 + }, + { + "epoch": 5.745919091554294, + "grad_norm": 
7.3867340087890625, + "learning_rate": 9.425663591199433e-05, + "loss": 0.04623596966266632, + "step": 40480 + }, + { + "epoch": 5.747338537970192, + "grad_norm": 0.20550206303596497, + "learning_rate": 9.425521646557844e-05, + "loss": 0.05645106434822082, + "step": 40490 + }, + { + "epoch": 5.748757984386089, + "grad_norm": 0.5938138365745544, + "learning_rate": 9.425379701916252e-05, + "loss": 0.03962576985359192, + "step": 40500 + }, + { + "epoch": 5.748757984386089, + "eval_accuracy": 0.9722133909836587, + "eval_loss": 0.08429045230150223, + "eval_runtime": 33.4191, + "eval_samples_per_second": 470.6, + "eval_steps_per_second": 14.722, + "step": 40500 + }, + { + "epoch": 5.750177430801987, + "grad_norm": 2.9440619945526123, + "learning_rate": 9.425237757274663e-05, + "loss": 0.06737836599349975, + "step": 40510 + }, + { + "epoch": 5.751596877217885, + "grad_norm": 13.42500114440918, + "learning_rate": 9.425095812633073e-05, + "loss": 0.14449608325958252, + "step": 40520 + }, + { + "epoch": 5.753016323633783, + "grad_norm": 0.19540852308273315, + "learning_rate": 9.424953867991484e-05, + "loss": 0.11976728439331055, + "step": 40530 + }, + { + "epoch": 5.754435770049681, + "grad_norm": 5.771694660186768, + "learning_rate": 9.424811923349894e-05, + "loss": 0.0465570330619812, + "step": 40540 + }, + { + "epoch": 5.755855216465578, + "grad_norm": 0.7781568169593811, + "learning_rate": 9.424669978708304e-05, + "loss": 0.06043409109115601, + "step": 40550 + }, + { + "epoch": 5.757274662881477, + "grad_norm": 4.123456954956055, + "learning_rate": 9.424528034066715e-05, + "loss": 0.02993989586830139, + "step": 40560 + }, + { + "epoch": 5.758694109297374, + "grad_norm": 5.118284225463867, + "learning_rate": 9.424386089425124e-05, + "loss": 0.0467894971370697, + "step": 40570 + }, + { + "epoch": 5.760113555713271, + "grad_norm": 2.537217378616333, + "learning_rate": 9.424244144783535e-05, + "loss": 0.028548663854598998, + "step": 40580 + }, + { + "epoch": 
5.76153300212917, + "grad_norm": 7.4742112159729, + "learning_rate": 9.424102200141945e-05, + "loss": 0.0495083749294281, + "step": 40590 + }, + { + "epoch": 5.762952448545067, + "grad_norm": 6.256778240203857, + "learning_rate": 9.423960255500355e-05, + "loss": 0.05922043919563293, + "step": 40600 + }, + { + "epoch": 5.7643718949609655, + "grad_norm": 5.0860490798950195, + "learning_rate": 9.423818310858765e-05, + "loss": 0.05987945795059204, + "step": 40610 + }, + { + "epoch": 5.765791341376863, + "grad_norm": 12.065763473510742, + "learning_rate": 9.423676366217176e-05, + "loss": 0.05129314064979553, + "step": 40620 + }, + { + "epoch": 5.767210787792761, + "grad_norm": 2.2333362102508545, + "learning_rate": 9.423534421575586e-05, + "loss": 0.025675442814826966, + "step": 40630 + }, + { + "epoch": 5.768630234208659, + "grad_norm": 0.7619284987449646, + "learning_rate": 9.423392476933997e-05, + "loss": 0.03333350419998169, + "step": 40640 + }, + { + "epoch": 5.770049680624556, + "grad_norm": 3.0292410850524902, + "learning_rate": 9.423250532292406e-05, + "loss": 0.032198408246040346, + "step": 40650 + }, + { + "epoch": 5.771469127040454, + "grad_norm": 1.155750036239624, + "learning_rate": 9.423108587650816e-05, + "loss": 0.03320641815662384, + "step": 40660 + }, + { + "epoch": 5.772888573456352, + "grad_norm": 2.6847808361053467, + "learning_rate": 9.422966643009227e-05, + "loss": 0.05150899887084961, + "step": 40670 + }, + { + "epoch": 5.77430801987225, + "grad_norm": 3.288931369781494, + "learning_rate": 9.422824698367637e-05, + "loss": 0.06830101013183594, + "step": 40680 + }, + { + "epoch": 5.7757274662881475, + "grad_norm": 6.511233806610107, + "learning_rate": 9.422682753726048e-05, + "loss": 0.059491848945617674, + "step": 40690 + }, + { + "epoch": 5.777146912704046, + "grad_norm": 0.7103070616722107, + "learning_rate": 9.422540809084458e-05, + "loss": 0.06547984480857849, + "step": 40700 + }, + { + "epoch": 5.778566359119943, + "grad_norm": 
0.3059501647949219, + "learning_rate": 9.422398864442868e-05, + "loss": 0.05114136934280396, + "step": 40710 + }, + { + "epoch": 5.779985805535841, + "grad_norm": 5.036358833312988, + "learning_rate": 9.422256919801277e-05, + "loss": 0.04435153007507324, + "step": 40720 + }, + { + "epoch": 5.781405251951739, + "grad_norm": 1.1257925033569336, + "learning_rate": 9.422114975159688e-05, + "loss": 0.07170879244804382, + "step": 40730 + }, + { + "epoch": 5.782824698367636, + "grad_norm": 2.28365421295166, + "learning_rate": 9.421973030518098e-05, + "loss": 0.08844174146652221, + "step": 40740 + }, + { + "epoch": 5.784244144783535, + "grad_norm": 6.227928638458252, + "learning_rate": 9.421845280340668e-05, + "loss": 0.03891546130180359, + "step": 40750 + }, + { + "epoch": 5.785663591199432, + "grad_norm": 7.569578170776367, + "learning_rate": 9.421703335699078e-05, + "loss": 0.07715065479278564, + "step": 40760 + }, + { + "epoch": 5.78708303761533, + "grad_norm": 3.209096670150757, + "learning_rate": 9.421561391057489e-05, + "loss": 0.05780371427536011, + "step": 40770 + }, + { + "epoch": 5.788502484031228, + "grad_norm": 5.905817031860352, + "learning_rate": 9.421419446415897e-05, + "loss": 0.06400578618049621, + "step": 40780 + }, + { + "epoch": 5.789921930447125, + "grad_norm": 1.9129750728607178, + "learning_rate": 9.421277501774308e-05, + "loss": 0.07512367963790893, + "step": 40790 + }, + { + "epoch": 5.7913413768630235, + "grad_norm": 1.1263295412063599, + "learning_rate": 9.42113555713272e-05, + "loss": 0.038415607810020444, + "step": 40800 + }, + { + "epoch": 5.792760823278921, + "grad_norm": 0.9057685732841492, + "learning_rate": 9.420993612491129e-05, + "loss": 0.025888726115226746, + "step": 40810 + }, + { + "epoch": 5.794180269694819, + "grad_norm": 5.407680988311768, + "learning_rate": 9.42085166784954e-05, + "loss": 0.08174926638603211, + "step": 40820 + }, + { + "epoch": 5.795599716110717, + "grad_norm": 2.7758450508117676, + "learning_rate": 
9.420709723207949e-05, + "loss": 0.04903877377510071, + "step": 40830 + }, + { + "epoch": 5.797019162526615, + "grad_norm": 3.875502109527588, + "learning_rate": 9.42056777856636e-05, + "loss": 0.0529244601726532, + "step": 40840 + }, + { + "epoch": 5.798438608942512, + "grad_norm": 2.0841777324676514, + "learning_rate": 9.42042583392477e-05, + "loss": 0.03719224333763123, + "step": 40850 + }, + { + "epoch": 5.79985805535841, + "grad_norm": 0.20633159577846527, + "learning_rate": 9.42028388928318e-05, + "loss": 0.0415228933095932, + "step": 40860 + }, + { + "epoch": 5.801277501774308, + "grad_norm": 1.6225624084472656, + "learning_rate": 9.42014194464159e-05, + "loss": 0.07819415926933289, + "step": 40870 + }, + { + "epoch": 5.8026969481902055, + "grad_norm": 3.071683883666992, + "learning_rate": 9.42e-05, + "loss": 0.037554305791854856, + "step": 40880 + }, + { + "epoch": 5.804116394606104, + "grad_norm": 0.5660277009010315, + "learning_rate": 9.419858055358411e-05, + "loss": 0.0829436719417572, + "step": 40890 + }, + { + "epoch": 5.805535841022001, + "grad_norm": 0.9265013933181763, + "learning_rate": 9.419716110716821e-05, + "loss": 0.035265910625457766, + "step": 40900 + }, + { + "epoch": 5.8069552874379, + "grad_norm": 0.10838378220796585, + "learning_rate": 9.419574166075232e-05, + "loss": 0.03172871470451355, + "step": 40910 + }, + { + "epoch": 5.808374733853797, + "grad_norm": 11.150486946105957, + "learning_rate": 9.419432221433642e-05, + "loss": 0.05358134508132935, + "step": 40920 + }, + { + "epoch": 5.809794180269694, + "grad_norm": 5.653853416442871, + "learning_rate": 9.419290276792051e-05, + "loss": 0.053334379196166994, + "step": 40930 + }, + { + "epoch": 5.811213626685593, + "grad_norm": 0.3218175172805786, + "learning_rate": 9.419148332150461e-05, + "loss": 0.04683685302734375, + "step": 40940 + }, + { + "epoch": 5.81263307310149, + "grad_norm": 2.429058313369751, + "learning_rate": 9.419006387508872e-05, + "loss": 0.034245806932449344, + "step": 
40950 + }, + { + "epoch": 5.814052519517388, + "grad_norm": 5.519045829772949, + "learning_rate": 9.418864442867282e-05, + "loss": 0.09080212116241455, + "step": 40960 + }, + { + "epoch": 5.815471965933286, + "grad_norm": 2.262676477432251, + "learning_rate": 9.418722498225693e-05, + "loss": 0.04305407404899597, + "step": 40970 + }, + { + "epoch": 5.816891412349184, + "grad_norm": 6.300297260284424, + "learning_rate": 9.418580553584103e-05, + "loss": 0.08451374769210815, + "step": 40980 + }, + { + "epoch": 5.818310858765082, + "grad_norm": 6.759078502655029, + "learning_rate": 9.418438608942512e-05, + "loss": 0.040090644359588624, + "step": 40990 + }, + { + "epoch": 5.819730305180979, + "grad_norm": 0.04662134125828743, + "learning_rate": 9.418296664300924e-05, + "loss": 0.0369631290435791, + "step": 41000 + }, + { + "epoch": 5.819730305180979, + "eval_accuracy": 0.9738665988427545, + "eval_loss": 0.07505597919225693, + "eval_runtime": 33.4209, + "eval_samples_per_second": 470.574, + "eval_steps_per_second": 14.721, + "step": 41000 + }, + { + "epoch": 5.821149751596877, + "grad_norm": 1.048794150352478, + "learning_rate": 9.418154719659333e-05, + "loss": 0.03393426835536957, + "step": 41010 + }, + { + "epoch": 5.822569198012775, + "grad_norm": 1.4327489137649536, + "learning_rate": 9.418012775017744e-05, + "loss": 0.03338783085346222, + "step": 41020 + }, + { + "epoch": 5.823988644428673, + "grad_norm": 0.037851158529520035, + "learning_rate": 9.417870830376154e-05, + "loss": 0.021252821385860442, + "step": 41030 + }, + { + "epoch": 5.8254080908445705, + "grad_norm": 2.149754285812378, + "learning_rate": 9.417728885734564e-05, + "loss": 0.06314542889595032, + "step": 41040 + }, + { + "epoch": 5.826827537260469, + "grad_norm": 0.4033157229423523, + "learning_rate": 9.417586941092974e-05, + "loss": 0.02093043476343155, + "step": 41050 + }, + { + "epoch": 5.828246983676366, + "grad_norm": 2.183093547821045, + "learning_rate": 9.417444996451385e-05, + "loss": 
0.02726356089115143, + "step": 41060 + }, + { + "epoch": 5.829666430092264, + "grad_norm": 4.68568229675293, + "learning_rate": 9.417303051809794e-05, + "loss": 0.05667162537574768, + "step": 41070 + }, + { + "epoch": 5.831085876508162, + "grad_norm": 0.24591878056526184, + "learning_rate": 9.417161107168206e-05, + "loss": 0.046596580743789674, + "step": 41080 + }, + { + "epoch": 5.832505322924059, + "grad_norm": 2.6592085361480713, + "learning_rate": 9.417019162526615e-05, + "loss": 0.05997686982154846, + "step": 41090 + }, + { + "epoch": 5.833924769339958, + "grad_norm": 15.260762214660645, + "learning_rate": 9.416877217885025e-05, + "loss": 0.0799644410610199, + "step": 41100 + }, + { + "epoch": 5.835344215755855, + "grad_norm": 6.797810077667236, + "learning_rate": 9.416735273243436e-05, + "loss": 0.04322269558906555, + "step": 41110 + }, + { + "epoch": 5.836763662171753, + "grad_norm": 0.5926470160484314, + "learning_rate": 9.416593328601846e-05, + "loss": 0.029363343119621278, + "step": 41120 + }, + { + "epoch": 5.838183108587651, + "grad_norm": 0.012577028945088387, + "learning_rate": 9.416451383960257e-05, + "loss": 0.038662773370742795, + "step": 41130 + }, + { + "epoch": 5.839602555003548, + "grad_norm": 11.325472831726074, + "learning_rate": 9.416309439318665e-05, + "loss": 0.11654891967773437, + "step": 41140 + }, + { + "epoch": 5.8410220014194465, + "grad_norm": 3.3514904975891113, + "learning_rate": 9.416167494677076e-05, + "loss": 0.03298133611679077, + "step": 41150 + }, + { + "epoch": 5.842441447835344, + "grad_norm": 4.035382270812988, + "learning_rate": 9.416025550035486e-05, + "loss": 0.0738753318786621, + "step": 41160 + }, + { + "epoch": 5.843860894251242, + "grad_norm": 0.33365610241889954, + "learning_rate": 9.415883605393897e-05, + "loss": 0.06353077292442322, + "step": 41170 + }, + { + "epoch": 5.84528034066714, + "grad_norm": 0.7018740773200989, + "learning_rate": 9.415741660752307e-05, + "loss": 0.08425546288490296, + "step": 41180 + }, 
+ { + "epoch": 5.846699787083038, + "grad_norm": 6.3598151206970215, + "learning_rate": 9.415599716110717e-05, + "loss": 0.06130687594413757, + "step": 41190 + }, + { + "epoch": 5.848119233498935, + "grad_norm": 7.904303550720215, + "learning_rate": 9.415457771469128e-05, + "loss": 0.0328709602355957, + "step": 41200 + }, + { + "epoch": 5.849538679914833, + "grad_norm": 4.881137371063232, + "learning_rate": 9.415315826827538e-05, + "loss": 0.03672412037849426, + "step": 41210 + }, + { + "epoch": 5.850958126330731, + "grad_norm": 5.240328311920166, + "learning_rate": 9.415173882185949e-05, + "loss": 0.04333122968673706, + "step": 41220 + }, + { + "epoch": 5.8523775727466285, + "grad_norm": 6.439934253692627, + "learning_rate": 9.415031937544358e-05, + "loss": 0.06000009775161743, + "step": 41230 + }, + { + "epoch": 5.853797019162527, + "grad_norm": 2.4920706748962402, + "learning_rate": 9.414889992902768e-05, + "loss": 0.05984145402908325, + "step": 41240 + }, + { + "epoch": 5.855216465578424, + "grad_norm": 4.731122970581055, + "learning_rate": 9.414748048261178e-05, + "loss": 0.059812134504318236, + "step": 41250 + }, + { + "epoch": 5.8566359119943225, + "grad_norm": 0.4531116187572479, + "learning_rate": 9.414606103619589e-05, + "loss": 0.07219678163528442, + "step": 41260 + }, + { + "epoch": 5.85805535841022, + "grad_norm": 1.3103761672973633, + "learning_rate": 9.414464158977999e-05, + "loss": 0.022967004776000978, + "step": 41270 + }, + { + "epoch": 5.859474804826117, + "grad_norm": 3.049083709716797, + "learning_rate": 9.41432221433641e-05, + "loss": 0.04617577791213989, + "step": 41280 + }, + { + "epoch": 5.860894251242016, + "grad_norm": 6.9003472328186035, + "learning_rate": 9.41418026969482e-05, + "loss": 0.05837984681129456, + "step": 41290 + }, + { + "epoch": 5.862313697657913, + "grad_norm": 4.321074485778809, + "learning_rate": 9.414038325053229e-05, + "loss": 0.05662268400192261, + "step": 41300 + }, + { + "epoch": 5.863733144073811, + "grad_norm": 
1.8019779920578003, + "learning_rate": 9.41389638041164e-05, + "loss": 0.04414930045604706, + "step": 41310 + }, + { + "epoch": 5.865152590489709, + "grad_norm": 4.754904270172119, + "learning_rate": 9.41375443577005e-05, + "loss": 0.0796053946018219, + "step": 41320 + }, + { + "epoch": 5.866572036905607, + "grad_norm": 0.6924619078636169, + "learning_rate": 9.413612491128461e-05, + "loss": 0.04946174323558807, + "step": 41330 + }, + { + "epoch": 5.8679914833215046, + "grad_norm": 0.18079739809036255, + "learning_rate": 9.413470546486871e-05, + "loss": 0.10200127363204955, + "step": 41340 + }, + { + "epoch": 5.869410929737402, + "grad_norm": 9.690866470336914, + "learning_rate": 9.41332860184528e-05, + "loss": 0.060866284370422366, + "step": 41350 + }, + { + "epoch": 5.8708303761533, + "grad_norm": 3.259993076324463, + "learning_rate": 9.41318665720369e-05, + "loss": 0.05056840181350708, + "step": 41360 + }, + { + "epoch": 5.872249822569198, + "grad_norm": 0.3155812919139862, + "learning_rate": 9.413044712562101e-05, + "loss": 0.058551359176635745, + "step": 41370 + }, + { + "epoch": 5.873669268985096, + "grad_norm": 6.132554531097412, + "learning_rate": 9.412902767920511e-05, + "loss": 0.059990334510803225, + "step": 41380 + }, + { + "epoch": 5.875088715400993, + "grad_norm": 0.24582448601722717, + "learning_rate": 9.412760823278922e-05, + "loss": 0.06940883994102479, + "step": 41390 + }, + { + "epoch": 5.876508161816892, + "grad_norm": 2.3477296829223633, + "learning_rate": 9.412618878637332e-05, + "loss": 0.041541433334350585, + "step": 41400 + }, + { + "epoch": 5.877927608232789, + "grad_norm": 1.636675477027893, + "learning_rate": 9.412476933995742e-05, + "loss": 0.0951632797718048, + "step": 41410 + }, + { + "epoch": 5.879347054648687, + "grad_norm": 1.0382994413375854, + "learning_rate": 9.412334989354153e-05, + "loss": 0.04100165367126465, + "step": 41420 + }, + { + "epoch": 5.880766501064585, + "grad_norm": 3.1466052532196045, + "learning_rate": 
9.412193044712563e-05, + "loss": 0.024977406859397887, + "step": 41430 + }, + { + "epoch": 5.882185947480482, + "grad_norm": 3.585829019546509, + "learning_rate": 9.412051100070974e-05, + "loss": 0.06898729801177979, + "step": 41440 + }, + { + "epoch": 5.883605393896381, + "grad_norm": 8.466485977172852, + "learning_rate": 9.411909155429382e-05, + "loss": 0.10010728836059571, + "step": 41450 + }, + { + "epoch": 5.885024840312278, + "grad_norm": 0.8343414068222046, + "learning_rate": 9.411767210787793e-05, + "loss": 0.05715740919113159, + "step": 41460 + }, + { + "epoch": 5.886444286728176, + "grad_norm": 9.334309577941895, + "learning_rate": 9.411625266146203e-05, + "loss": 0.12129504680633545, + "step": 41470 + }, + { + "epoch": 5.887863733144074, + "grad_norm": 4.06323766708374, + "learning_rate": 9.411483321504614e-05, + "loss": 0.030147609114646912, + "step": 41480 + }, + { + "epoch": 5.889283179559971, + "grad_norm": 2.0144970417022705, + "learning_rate": 9.411341376863024e-05, + "loss": 0.06269176602363587, + "step": 41490 + }, + { + "epoch": 5.8907026259758695, + "grad_norm": 1.7438490390777588, + "learning_rate": 9.411199432221433e-05, + "loss": 0.0560301661491394, + "step": 41500 + }, + { + "epoch": 5.8907026259758695, + "eval_accuracy": 0.9686526355948369, + "eval_loss": 0.09519174695014954, + "eval_runtime": 35.0451, + "eval_samples_per_second": 448.765, + "eval_steps_per_second": 14.039, + "step": 41500 + }, + { + "epoch": 5.892122072391767, + "grad_norm": 15.525542259216309, + "learning_rate": 9.411057487579845e-05, + "loss": 0.13895368576049805, + "step": 41510 + }, + { + "epoch": 5.893541518807665, + "grad_norm": 8.046239852905273, + "learning_rate": 9.410915542938254e-05, + "loss": 0.1090732455253601, + "step": 41520 + }, + { + "epoch": 5.894960965223563, + "grad_norm": 0.7156932353973389, + "learning_rate": 9.410773598296665e-05, + "loss": 0.07319698333740235, + "step": 41530 + }, + { + "epoch": 5.896380411639461, + "grad_norm": 4.401820659637451, 
+ "learning_rate": 9.410631653655075e-05, + "loss": 0.034980499744415285, + "step": 41540 + }, + { + "epoch": 5.897799858055358, + "grad_norm": 0.6378658413887024, + "learning_rate": 9.410489709013485e-05, + "loss": 0.03048483729362488, + "step": 41550 + }, + { + "epoch": 5.899219304471256, + "grad_norm": 6.762858867645264, + "learning_rate": 9.410347764371895e-05, + "loss": 0.11323305368423461, + "step": 41560 + }, + { + "epoch": 5.900638750887154, + "grad_norm": 2.8695895671844482, + "learning_rate": 9.410205819730306e-05, + "loss": 0.07545459866523743, + "step": 41570 + }, + { + "epoch": 5.9020581973030515, + "grad_norm": 3.977581024169922, + "learning_rate": 9.410063875088715e-05, + "loss": 0.031920188665390016, + "step": 41580 + }, + { + "epoch": 5.90347764371895, + "grad_norm": 0.39675480127334595, + "learning_rate": 9.409921930447127e-05, + "loss": 0.022741490602493288, + "step": 41590 + }, + { + "epoch": 5.904897090134847, + "grad_norm": 6.030241012573242, + "learning_rate": 9.409779985805536e-05, + "loss": 0.044648933410644534, + "step": 41600 + }, + { + "epoch": 5.9063165365507455, + "grad_norm": 2.218195676803589, + "learning_rate": 9.409638041163946e-05, + "loss": 0.02160325050354004, + "step": 41610 + }, + { + "epoch": 5.907735982966643, + "grad_norm": 1.204745888710022, + "learning_rate": 9.409496096522357e-05, + "loss": 0.06839647889137268, + "step": 41620 + }, + { + "epoch": 5.90915542938254, + "grad_norm": 0.15682987868785858, + "learning_rate": 9.409354151880767e-05, + "loss": 0.033197081089019774, + "step": 41630 + }, + { + "epoch": 5.910574875798439, + "grad_norm": 3.0369253158569336, + "learning_rate": 9.409212207239178e-05, + "loss": 0.06591796278953552, + "step": 41640 + }, + { + "epoch": 5.911994322214336, + "grad_norm": 5.717238426208496, + "learning_rate": 9.409070262597586e-05, + "loss": 0.06664312481880189, + "step": 41650 + }, + { + "epoch": 5.913413768630234, + "grad_norm": 0.033254224807024, + "learning_rate": 9.408928317955997e-05, + 
"loss": 0.0270003616809845, + "step": 41660 + }, + { + "epoch": 5.914833215046132, + "grad_norm": 0.2468717098236084, + "learning_rate": 9.408786373314407e-05, + "loss": 0.034814268350601196, + "step": 41670 + }, + { + "epoch": 5.91625266146203, + "grad_norm": 9.007941246032715, + "learning_rate": 9.408644428672818e-05, + "loss": 0.07450242042541504, + "step": 41680 + }, + { + "epoch": 5.9176721078779275, + "grad_norm": 11.96431827545166, + "learning_rate": 9.408502484031228e-05, + "loss": 0.047266215085983276, + "step": 41690 + }, + { + "epoch": 5.919091554293825, + "grad_norm": 4.716080188751221, + "learning_rate": 9.408360539389639e-05, + "loss": 0.04762662053108215, + "step": 41700 + }, + { + "epoch": 5.920511000709723, + "grad_norm": 7.223570823669434, + "learning_rate": 9.408218594748049e-05, + "loss": 0.06301666498184204, + "step": 41710 + }, + { + "epoch": 5.921930447125621, + "grad_norm": 9.651657104492188, + "learning_rate": 9.408076650106459e-05, + "loss": 0.052734434604644775, + "step": 41720 + }, + { + "epoch": 5.923349893541519, + "grad_norm": 0.2429005652666092, + "learning_rate": 9.40793470546487e-05, + "loss": 0.02907339930534363, + "step": 41730 + }, + { + "epoch": 5.924769339957416, + "grad_norm": 6.333399772644043, + "learning_rate": 9.40779276082328e-05, + "loss": 0.11444522142410278, + "step": 41740 + }, + { + "epoch": 5.926188786373315, + "grad_norm": 5.317239761352539, + "learning_rate": 9.40765081618169e-05, + "loss": 0.06607084274291992, + "step": 41750 + }, + { + "epoch": 5.927608232789212, + "grad_norm": 0.490640252828598, + "learning_rate": 9.407508871540099e-05, + "loss": 0.03365239799022675, + "step": 41760 + }, + { + "epoch": 5.9290276792051095, + "grad_norm": 8.204052925109863, + "learning_rate": 9.40736692689851e-05, + "loss": 0.0700852930545807, + "step": 41770 + }, + { + "epoch": 5.930447125621008, + "grad_norm": 0.6442670822143555, + "learning_rate": 9.40722498225692e-05, + "loss": 0.045039495825767516, + "step": 41780 + }, + { 
+ "epoch": 5.931866572036905, + "grad_norm": 7.721465587615967, + "learning_rate": 9.407083037615331e-05, + "loss": 0.06508231163024902, + "step": 41790 + }, + { + "epoch": 5.933286018452804, + "grad_norm": 1.8151298761367798, + "learning_rate": 9.40694109297374e-05, + "loss": 0.04082694351673126, + "step": 41800 + }, + { + "epoch": 5.934705464868701, + "grad_norm": 0.27777761220932007, + "learning_rate": 9.40679914833215e-05, + "loss": 0.03647947609424591, + "step": 41810 + }, + { + "epoch": 5.936124911284599, + "grad_norm": 2.4215736389160156, + "learning_rate": 9.406657203690561e-05, + "loss": 0.06282040476799011, + "step": 41820 + }, + { + "epoch": 5.937544357700497, + "grad_norm": 5.381174564361572, + "learning_rate": 9.406515259048971e-05, + "loss": 0.07924935817718506, + "step": 41830 + }, + { + "epoch": 5.938963804116394, + "grad_norm": 0.8996217250823975, + "learning_rate": 9.406373314407382e-05, + "loss": 0.0540692925453186, + "step": 41840 + }, + { + "epoch": 5.940383250532292, + "grad_norm": 0.28388017416000366, + "learning_rate": 9.406231369765792e-05, + "loss": 0.0712361752986908, + "step": 41850 + }, + { + "epoch": 5.94180269694819, + "grad_norm": 0.06892205774784088, + "learning_rate": 9.406089425124202e-05, + "loss": 0.08142430782318115, + "step": 41860 + }, + { + "epoch": 5.943222143364088, + "grad_norm": 4.6068434715271, + "learning_rate": 9.405947480482611e-05, + "loss": 0.07882195711135864, + "step": 41870 + }, + { + "epoch": 5.944641589779986, + "grad_norm": 7.996845722198486, + "learning_rate": 9.405805535841022e-05, + "loss": 0.06462631225585938, + "step": 41880 + }, + { + "epoch": 5.946061036195884, + "grad_norm": 5.027513027191162, + "learning_rate": 9.405663591199432e-05, + "loss": 0.03442648947238922, + "step": 41890 + }, + { + "epoch": 5.947480482611781, + "grad_norm": 10.417757034301758, + "learning_rate": 9.405521646557843e-05, + "loss": 0.05618232488632202, + "step": 41900 + }, + { + "epoch": 5.948899929027679, + "grad_norm": 
5.782607078552246, + "learning_rate": 9.405379701916253e-05, + "loss": 0.06721282005310059, + "step": 41910 + }, + { + "epoch": 5.950319375443577, + "grad_norm": 0.7739474773406982, + "learning_rate": 9.405237757274663e-05, + "loss": 0.03563763499259949, + "step": 41920 + }, + { + "epoch": 5.9517388218594744, + "grad_norm": 9.562399864196777, + "learning_rate": 9.405095812633074e-05, + "loss": 0.07549918293952942, + "step": 41930 + }, + { + "epoch": 5.953158268275373, + "grad_norm": 3.9365859031677246, + "learning_rate": 9.404953867991484e-05, + "loss": 0.052819907665252686, + "step": 41940 + }, + { + "epoch": 5.95457771469127, + "grad_norm": 0.7143135070800781, + "learning_rate": 9.404811923349895e-05, + "loss": 0.07506571412086487, + "step": 41950 + }, + { + "epoch": 5.9559971611071685, + "grad_norm": 1.248931646347046, + "learning_rate": 9.404669978708303e-05, + "loss": 0.06863842606544494, + "step": 41960 + }, + { + "epoch": 5.957416607523066, + "grad_norm": 0.1544966995716095, + "learning_rate": 9.404528034066714e-05, + "loss": 0.04283129572868347, + "step": 41970 + }, + { + "epoch": 5.958836053938963, + "grad_norm": 3.303541660308838, + "learning_rate": 9.404386089425124e-05, + "loss": 0.03427127003669739, + "step": 41980 + }, + { + "epoch": 5.960255500354862, + "grad_norm": 9.369416236877441, + "learning_rate": 9.404244144783535e-05, + "loss": 0.12133831977844238, + "step": 41990 + }, + { + "epoch": 5.961674946770759, + "grad_norm": 1.9785155057907104, + "learning_rate": 9.404102200141946e-05, + "loss": 0.06636718511581421, + "step": 42000 + }, + { + "epoch": 5.961674946770759, + "eval_accuracy": 0.9723405608189737, + "eval_loss": 0.08659365773200989, + "eval_runtime": 32.6749, + "eval_samples_per_second": 481.317, + "eval_steps_per_second": 15.057, + "step": 42000 + }, + { + "epoch": 5.963094393186657, + "grad_norm": 4.923141002655029, + "learning_rate": 9.403960255500356e-05, + "loss": 0.06877887845039368, + "step": 42010 + }, + { + "epoch": 
5.964513839602555, + "grad_norm": 2.2776553630828857, + "learning_rate": 9.403818310858766e-05, + "loss": 0.0712451994419098, + "step": 42020 + }, + { + "epoch": 5.965933286018453, + "grad_norm": 1.6378326416015625, + "learning_rate": 9.403676366217175e-05, + "loss": 0.03723881542682648, + "step": 42030 + }, + { + "epoch": 5.9673527324343505, + "grad_norm": 2.101365089416504, + "learning_rate": 9.403534421575586e-05, + "loss": 0.056721025705337526, + "step": 42040 + }, + { + "epoch": 5.968772178850248, + "grad_norm": 0.6855022311210632, + "learning_rate": 9.403392476933996e-05, + "loss": 0.08039049506187439, + "step": 42050 + }, + { + "epoch": 5.970191625266146, + "grad_norm": 4.207023620605469, + "learning_rate": 9.403250532292407e-05, + "loss": 0.10304387807846069, + "step": 42060 + }, + { + "epoch": 5.971611071682044, + "grad_norm": 9.549059867858887, + "learning_rate": 9.403108587650816e-05, + "loss": 0.05132551789283753, + "step": 42070 + }, + { + "epoch": 5.973030518097942, + "grad_norm": 3.993290662765503, + "learning_rate": 9.402966643009227e-05, + "loss": 0.0616031289100647, + "step": 42080 + }, + { + "epoch": 5.974449964513839, + "grad_norm": 7.922762393951416, + "learning_rate": 9.402824698367636e-05, + "loss": 0.06184162497520447, + "step": 42090 + }, + { + "epoch": 5.975869410929738, + "grad_norm": 0.07029788196086884, + "learning_rate": 9.402682753726048e-05, + "loss": 0.03704347312450409, + "step": 42100 + }, + { + "epoch": 5.977288857345635, + "grad_norm": 0.35138562321662903, + "learning_rate": 9.402540809084459e-05, + "loss": 0.016092486679553986, + "step": 42110 + }, + { + "epoch": 5.9787083037615325, + "grad_norm": 0.23122110962867737, + "learning_rate": 9.402398864442867e-05, + "loss": 0.030700623989105225, + "step": 42120 + }, + { + "epoch": 5.980127750177431, + "grad_norm": 9.871964454650879, + "learning_rate": 9.402256919801278e-05, + "loss": 0.05316352844238281, + "step": 42130 + }, + { + "epoch": 5.981547196593328, + "grad_norm": 
8.557596206665039, + "learning_rate": 9.402114975159688e-05, + "loss": 0.08352534174919128, + "step": 42140 + }, + { + "epoch": 5.9829666430092265, + "grad_norm": 0.055390872061252594, + "learning_rate": 9.401973030518099e-05, + "loss": 0.021974930167198183, + "step": 42150 + }, + { + "epoch": 5.984386089425124, + "grad_norm": 0.12106958776712418, + "learning_rate": 9.401831085876509e-05, + "loss": 0.09234058260917663, + "step": 42160 + }, + { + "epoch": 5.985805535841022, + "grad_norm": 1.0915229320526123, + "learning_rate": 9.401689141234918e-05, + "loss": 0.039049354195594785, + "step": 42170 + }, + { + "epoch": 5.98722498225692, + "grad_norm": 1.639233112335205, + "learning_rate": 9.401547196593328e-05, + "loss": 0.07456170916557311, + "step": 42180 + }, + { + "epoch": 5.988644428672818, + "grad_norm": 0.9587175846099854, + "learning_rate": 9.401405251951739e-05, + "loss": 0.06777811646461487, + "step": 42190 + }, + { + "epoch": 5.990063875088715, + "grad_norm": 4.586569309234619, + "learning_rate": 9.40126330731015e-05, + "loss": 0.0436800479888916, + "step": 42200 + }, + { + "epoch": 5.991483321504613, + "grad_norm": 0.17483319342136383, + "learning_rate": 9.40112136266856e-05, + "loss": 0.10744675397872924, + "step": 42210 + }, + { + "epoch": 5.992902767920511, + "grad_norm": 0.6297892332077026, + "learning_rate": 9.40097941802697e-05, + "loss": 0.016437722742557524, + "step": 42220 + }, + { + "epoch": 5.9943222143364085, + "grad_norm": 10.479232788085938, + "learning_rate": 9.40083747338538e-05, + "loss": 0.04590970277786255, + "step": 42230 + }, + { + "epoch": 5.995741660752307, + "grad_norm": 0.8099613189697266, + "learning_rate": 9.40069552874379e-05, + "loss": 0.04706493616104126, + "step": 42240 + }, + { + "epoch": 5.997161107168204, + "grad_norm": 0.6547835469245911, + "learning_rate": 9.4005535841022e-05, + "loss": 0.04878454804420471, + "step": 42250 + }, + { + "epoch": 5.998580553584103, + "grad_norm": 8.429951667785645, + "learning_rate": 
9.400411639460611e-05, + "loss": 0.03529610633850098, + "step": 42260 + }, + { + "epoch": 6.0, + "grad_norm": 1.0936342477798462, + "learning_rate": 9.40026969481902e-05, + "loss": 0.058073770999908444, + "step": 42270 + }, + { + "epoch": 6.001419446415897, + "grad_norm": 10.419304847717285, + "learning_rate": 9.400127750177431e-05, + "loss": 0.04838542342185974, + "step": 42280 + }, + { + "epoch": 6.002838892831796, + "grad_norm": 1.3970303535461426, + "learning_rate": 9.399985805535842e-05, + "loss": 0.027141714096069337, + "step": 42290 + }, + { + "epoch": 6.004258339247693, + "grad_norm": 0.9260740280151367, + "learning_rate": 9.399843860894252e-05, + "loss": 0.05527122020721435, + "step": 42300 + }, + { + "epoch": 6.0056777856635915, + "grad_norm": 3.39192533493042, + "learning_rate": 9.399701916252663e-05, + "loss": 0.08901798725128174, + "step": 42310 + }, + { + "epoch": 6.007097232079489, + "grad_norm": 2.0796215534210205, + "learning_rate": 9.399559971611071e-05, + "loss": 0.0270698219537735, + "step": 42320 + }, + { + "epoch": 6.008516678495387, + "grad_norm": 2.704911708831787, + "learning_rate": 9.399418026969482e-05, + "loss": 0.021138927340507506, + "step": 42330 + }, + { + "epoch": 6.009936124911285, + "grad_norm": 0.3152889609336853, + "learning_rate": 9.399276082327892e-05, + "loss": 0.02813498079776764, + "step": 42340 + }, + { + "epoch": 6.011355571327182, + "grad_norm": 0.4241684675216675, + "learning_rate": 9.399134137686303e-05, + "loss": 0.030684193968772887, + "step": 42350 + }, + { + "epoch": 6.01277501774308, + "grad_norm": 5.618993759155273, + "learning_rate": 9.398992193044713e-05, + "loss": 0.035936284065246585, + "step": 42360 + }, + { + "epoch": 6.014194464158978, + "grad_norm": 2.3576912879943848, + "learning_rate": 9.398850248403124e-05, + "loss": 0.021441501379013062, + "step": 42370 + }, + { + "epoch": 6.015613910574876, + "grad_norm": 1.1104201078414917, + "learning_rate": 9.398708303761534e-05, + "loss": 0.03485158383846283, + 
"step": 42380 + }, + { + "epoch": 6.0170333569907735, + "grad_norm": 0.14437326788902283, + "learning_rate": 9.398566359119943e-05, + "loss": 0.0569730281829834, + "step": 42390 + }, + { + "epoch": 6.018452803406672, + "grad_norm": 3.9557926654815674, + "learning_rate": 9.398424414478355e-05, + "loss": 0.04000087082386017, + "step": 42400 + }, + { + "epoch": 6.019872249822569, + "grad_norm": 0.8551321029663086, + "learning_rate": 9.398282469836764e-05, + "loss": 0.02508898377418518, + "step": 42410 + }, + { + "epoch": 6.021291696238467, + "grad_norm": 0.5500660538673401, + "learning_rate": 9.398140525195175e-05, + "loss": 0.040706342458724974, + "step": 42420 + }, + { + "epoch": 6.022711142654365, + "grad_norm": 1.740173101425171, + "learning_rate": 9.397998580553584e-05, + "loss": 0.05620036721229553, + "step": 42430 + }, + { + "epoch": 6.024130589070262, + "grad_norm": 0.16710884869098663, + "learning_rate": 9.397856635911995e-05, + "loss": 0.00892709344625473, + "step": 42440 + }, + { + "epoch": 6.025550035486161, + "grad_norm": 0.6740273237228394, + "learning_rate": 9.397714691270405e-05, + "loss": 0.06913689970970154, + "step": 42450 + }, + { + "epoch": 6.026969481902058, + "grad_norm": 0.17985455691814423, + "learning_rate": 9.397572746628816e-05, + "loss": 0.03412851691246033, + "step": 42460 + }, + { + "epoch": 6.028388928317956, + "grad_norm": 8.147322654724121, + "learning_rate": 9.397430801987225e-05, + "loss": 0.03205571174621582, + "step": 42470 + }, + { + "epoch": 6.029808374733854, + "grad_norm": 0.2391805201768875, + "learning_rate": 9.397288857345635e-05, + "loss": 0.03885909616947174, + "step": 42480 + }, + { + "epoch": 6.031227821149751, + "grad_norm": 2.3706560134887695, + "learning_rate": 9.397146912704046e-05, + "loss": 0.059897488355636595, + "step": 42490 + }, + { + "epoch": 6.0326472675656495, + "grad_norm": 1.3437626361846924, + "learning_rate": 9.397004968062456e-05, + "loss": 0.026122891902923585, + "step": 42500 + }, + { + "epoch": 
6.0326472675656495, + "eval_accuracy": 0.9801615056908501, + "eval_loss": 0.06414638459682465, + "eval_runtime": 32.6309, + "eval_samples_per_second": 481.966, + "eval_steps_per_second": 15.078, + "step": 42500 + }, + { + "epoch": 6.034066713981547, + "grad_norm": 4.8015360832214355, + "learning_rate": 9.396863023420867e-05, + "loss": 0.032011619210243224, + "step": 42510 + }, + { + "epoch": 6.035486160397445, + "grad_norm": 4.332185745239258, + "learning_rate": 9.396721078779277e-05, + "loss": 0.056959223747253415, + "step": 42520 + }, + { + "epoch": 6.036905606813343, + "grad_norm": 0.11899875849485397, + "learning_rate": 9.396579134137687e-05, + "loss": 0.02158723771572113, + "step": 42530 + }, + { + "epoch": 6.038325053229241, + "grad_norm": 6.497653961181641, + "learning_rate": 9.396437189496096e-05, + "loss": 0.025063958764076234, + "step": 42540 + }, + { + "epoch": 6.039744499645138, + "grad_norm": 10.640429496765137, + "learning_rate": 9.396295244854507e-05, + "loss": 0.07477115392684937, + "step": 42550 + }, + { + "epoch": 6.041163946061036, + "grad_norm": 3.3736305236816406, + "learning_rate": 9.396153300212917e-05, + "loss": 0.028403592109680176, + "step": 42560 + }, + { + "epoch": 6.042583392476934, + "grad_norm": 0.13938064873218536, + "learning_rate": 9.396011355571328e-05, + "loss": 0.0490555077791214, + "step": 42570 + }, + { + "epoch": 6.0440028388928315, + "grad_norm": 3.9250857830047607, + "learning_rate": 9.395869410929738e-05, + "loss": 0.02079556733369827, + "step": 42580 + }, + { + "epoch": 6.04542228530873, + "grad_norm": 8.848977088928223, + "learning_rate": 9.395727466288148e-05, + "loss": 0.04536510109901428, + "step": 42590 + }, + { + "epoch": 6.046841731724627, + "grad_norm": 2.9003498554229736, + "learning_rate": 9.395585521646559e-05, + "loss": 0.06284236311912536, + "step": 42600 + }, + { + "epoch": 6.0482611781405256, + "grad_norm": 2.06571626663208, + "learning_rate": 9.395443577004969e-05, + "loss": 0.10079717636108398, + "step": 
42610 + }, + { + "epoch": 6.049680624556423, + "grad_norm": 1.5646941661834717, + "learning_rate": 9.39530163236338e-05, + "loss": 0.0747799277305603, + "step": 42620 + }, + { + "epoch": 6.05110007097232, + "grad_norm": 4.782958030700684, + "learning_rate": 9.395159687721788e-05, + "loss": 0.10376496315002441, + "step": 42630 + }, + { + "epoch": 6.052519517388219, + "grad_norm": 0.5791422128677368, + "learning_rate": 9.395017743080199e-05, + "loss": 0.059558308124542235, + "step": 42640 + }, + { + "epoch": 6.053938963804116, + "grad_norm": 6.751038551330566, + "learning_rate": 9.394875798438609e-05, + "loss": 0.05839126706123352, + "step": 42650 + }, + { + "epoch": 6.055358410220014, + "grad_norm": 7.07871675491333, + "learning_rate": 9.39473385379702e-05, + "loss": 0.059094560146331784, + "step": 42660 + }, + { + "epoch": 6.056777856635912, + "grad_norm": 1.6905990839004517, + "learning_rate": 9.39459190915543e-05, + "loss": 0.015537199378013612, + "step": 42670 + }, + { + "epoch": 6.05819730305181, + "grad_norm": 6.017456531524658, + "learning_rate": 9.39444996451384e-05, + "loss": 0.08799818754196168, + "step": 42680 + }, + { + "epoch": 6.059616749467708, + "grad_norm": 4.168159008026123, + "learning_rate": 9.39430801987225e-05, + "loss": 0.035238003730773924, + "step": 42690 + }, + { + "epoch": 6.061036195883605, + "grad_norm": 3.4934043884277344, + "learning_rate": 9.39416607523066e-05, + "loss": 0.04012168049812317, + "step": 42700 + }, + { + "epoch": 6.062455642299503, + "grad_norm": 0.6561540365219116, + "learning_rate": 9.394024130589071e-05, + "loss": 0.03983815610408783, + "step": 42710 + }, + { + "epoch": 6.063875088715401, + "grad_norm": 5.77907133102417, + "learning_rate": 9.393882185947481e-05, + "loss": 0.0537925660610199, + "step": 42720 + }, + { + "epoch": 6.065294535131299, + "grad_norm": 0.7920040488243103, + "learning_rate": 9.393740241305892e-05, + "loss": 0.040977182984352115, + "step": 42730 + }, + { + "epoch": 6.066713981547196, + 
"grad_norm": 0.8078078031539917, + "learning_rate": 9.3935982966643e-05, + "loss": 0.02283947616815567, + "step": 42740 + }, + { + "epoch": 6.068133427963095, + "grad_norm": 6.577816963195801, + "learning_rate": 9.393456352022712e-05, + "loss": 0.048980104923248294, + "step": 42750 + }, + { + "epoch": 6.069552874378992, + "grad_norm": 2.022977113723755, + "learning_rate": 9.393314407381121e-05, + "loss": 0.01717734932899475, + "step": 42760 + }, + { + "epoch": 6.07097232079489, + "grad_norm": 3.7534005641937256, + "learning_rate": 9.393172462739532e-05, + "loss": 0.034478670358657836, + "step": 42770 + }, + { + "epoch": 6.072391767210788, + "grad_norm": 0.1850811243057251, + "learning_rate": 9.393030518097942e-05, + "loss": 0.0779535412788391, + "step": 42780 + }, + { + "epoch": 6.073811213626685, + "grad_norm": 2.3731038570404053, + "learning_rate": 9.392888573456352e-05, + "loss": 0.054844236373901366, + "step": 42790 + }, + { + "epoch": 6.075230660042584, + "grad_norm": 2.8397276401519775, + "learning_rate": 9.392746628814763e-05, + "loss": 0.03636707365512848, + "step": 42800 + }, + { + "epoch": 6.076650106458481, + "grad_norm": 1.0015398263931274, + "learning_rate": 9.392604684173173e-05, + "loss": 0.02905575931072235, + "step": 42810 + }, + { + "epoch": 6.078069552874379, + "grad_norm": 4.600225448608398, + "learning_rate": 9.392462739531584e-05, + "loss": 0.044324475526809695, + "step": 42820 + }, + { + "epoch": 6.079488999290277, + "grad_norm": 0.05996633321046829, + "learning_rate": 9.392320794889994e-05, + "loss": 0.022943411767482758, + "step": 42830 + }, + { + "epoch": 6.080908445706174, + "grad_norm": 5.939465522766113, + "learning_rate": 9.392178850248403e-05, + "loss": 0.032868221402168274, + "step": 42840 + }, + { + "epoch": 6.0823278921220725, + "grad_norm": 1.8107235431671143, + "learning_rate": 9.392036905606813e-05, + "loss": 0.04718858897686005, + "step": 42850 + }, + { + "epoch": 6.08374733853797, + "grad_norm": 7.665246486663818, + 
"learning_rate": 9.391894960965224e-05, + "loss": 0.056220120191574095, + "step": 42860 + }, + { + "epoch": 6.085166784953868, + "grad_norm": 2.0348501205444336, + "learning_rate": 9.391753016323634e-05, + "loss": 0.06356436610221863, + "step": 42870 + }, + { + "epoch": 6.086586231369766, + "grad_norm": 6.74163293838501, + "learning_rate": 9.391611071682045e-05, + "loss": 0.027032476663589478, + "step": 42880 + }, + { + "epoch": 6.088005677785664, + "grad_norm": 9.711258888244629, + "learning_rate": 9.391469127040455e-05, + "loss": 0.02918843924999237, + "step": 42890 + }, + { + "epoch": 6.089425124201561, + "grad_norm": 1.9372355937957764, + "learning_rate": 9.391327182398864e-05, + "loss": 0.027305248379707336, + "step": 42900 + }, + { + "epoch": 6.090844570617459, + "grad_norm": 3.3903517723083496, + "learning_rate": 9.391185237757276e-05, + "loss": 0.035347151756286624, + "step": 42910 + }, + { + "epoch": 6.092264017033357, + "grad_norm": 2.7463488578796387, + "learning_rate": 9.391043293115685e-05, + "loss": 0.06535101532936097, + "step": 42920 + }, + { + "epoch": 6.0936834634492545, + "grad_norm": 6.837551593780518, + "learning_rate": 9.390901348474096e-05, + "loss": 0.08181880712509156, + "step": 42930 + }, + { + "epoch": 6.095102909865153, + "grad_norm": 0.08821488916873932, + "learning_rate": 9.390759403832505e-05, + "loss": 0.07695122361183167, + "step": 42940 + }, + { + "epoch": 6.09652235628105, + "grad_norm": 2.108302593231201, + "learning_rate": 9.390617459190916e-05, + "loss": 0.053022068738937375, + "step": 42950 + }, + { + "epoch": 6.0979418026969485, + "grad_norm": 1.4591866731643677, + "learning_rate": 9.390475514549326e-05, + "loss": 0.029264354705810548, + "step": 42960 + }, + { + "epoch": 6.099361249112846, + "grad_norm": 0.7254082560539246, + "learning_rate": 9.390333569907737e-05, + "loss": 0.023519128561019897, + "step": 42970 + }, + { + "epoch": 6.100780695528743, + "grad_norm": 3.487905740737915, + "learning_rate": 9.390191625266146e-05, 
+ "loss": 0.025741568207740782, + "step": 42980 + }, + { + "epoch": 6.102200141944642, + "grad_norm": 2.3520405292510986, + "learning_rate": 9.390049680624556e-05, + "loss": 0.03384661078453064, + "step": 42990 + }, + { + "epoch": 6.103619588360539, + "grad_norm": 12.140244483947754, + "learning_rate": 9.389907735982967e-05, + "loss": 0.047564372420310974, + "step": 43000 + }, + { + "epoch": 6.103619588360539, + "eval_accuracy": 0.9731671647485216, + "eval_loss": 0.10344529151916504, + "eval_runtime": 32.822, + "eval_samples_per_second": 479.161, + "eval_steps_per_second": 14.99, + "step": 43000 + }, + { + "epoch": 6.105039034776437, + "grad_norm": 9.412208557128906, + "learning_rate": 9.389779985805536e-05, + "loss": 0.08669753670692444, + "step": 43010 + }, + { + "epoch": 6.106458481192335, + "grad_norm": 7.122009754180908, + "learning_rate": 9.389638041163946e-05, + "loss": 0.0934341549873352, + "step": 43020 + }, + { + "epoch": 6.107877927608233, + "grad_norm": 1.6225032806396484, + "learning_rate": 9.389496096522357e-05, + "loss": 0.05215970277786255, + "step": 43030 + }, + { + "epoch": 6.1092973740241305, + "grad_norm": 6.039062023162842, + "learning_rate": 9.389354151880768e-05, + "loss": 0.08043327331542968, + "step": 43040 + }, + { + "epoch": 6.110716820440028, + "grad_norm": 0.20308256149291992, + "learning_rate": 9.389212207239177e-05, + "loss": 0.04331388473510742, + "step": 43050 + }, + { + "epoch": 6.112136266855926, + "grad_norm": 4.1182990074157715, + "learning_rate": 9.389070262597589e-05, + "loss": 0.09296801090240478, + "step": 43060 + }, + { + "epoch": 6.113555713271824, + "grad_norm": 2.675135374069214, + "learning_rate": 9.388928317955997e-05, + "loss": 0.04550227820873261, + "step": 43070 + }, + { + "epoch": 6.114975159687722, + "grad_norm": 6.068324565887451, + "learning_rate": 9.388786373314408e-05, + "loss": 0.06599665880203247, + "step": 43080 + }, + { + "epoch": 6.116394606103619, + "grad_norm": 1.9616451263427734, + "learning_rate": 
9.388644428672818e-05, + "loss": 0.07046244740486145, + "step": 43090 + }, + { + "epoch": 6.117814052519518, + "grad_norm": 4.773113250732422, + "learning_rate": 9.388502484031229e-05, + "loss": 0.05360671281814575, + "step": 43100 + }, + { + "epoch": 6.119233498935415, + "grad_norm": 0.3193398714065552, + "learning_rate": 9.388360539389639e-05, + "loss": 0.08010044693946838, + "step": 43110 + }, + { + "epoch": 6.120652945351313, + "grad_norm": 9.674460411071777, + "learning_rate": 9.388218594748048e-05, + "loss": 0.04749388694763183, + "step": 43120 + }, + { + "epoch": 6.122072391767211, + "grad_norm": 1.1620126962661743, + "learning_rate": 9.38807665010646e-05, + "loss": 0.033527106046676636, + "step": 43130 + }, + { + "epoch": 6.123491838183108, + "grad_norm": 1.2030847072601318, + "learning_rate": 9.387934705464869e-05, + "loss": 0.03352363109588623, + "step": 43140 + }, + { + "epoch": 6.124911284599007, + "grad_norm": 1.329487681388855, + "learning_rate": 9.38779276082328e-05, + "loss": 0.03268220722675323, + "step": 43150 + }, + { + "epoch": 6.126330731014904, + "grad_norm": 0.743346631526947, + "learning_rate": 9.38765081618169e-05, + "loss": 0.02008904367685318, + "step": 43160 + }, + { + "epoch": 6.127750177430802, + "grad_norm": 7.962668418884277, + "learning_rate": 9.3875088715401e-05, + "loss": 0.09379298686981201, + "step": 43170 + }, + { + "epoch": 6.1291696238467, + "grad_norm": 0.15104877948760986, + "learning_rate": 9.38736692689851e-05, + "loss": 0.06707976460456848, + "step": 43180 + }, + { + "epoch": 6.130589070262598, + "grad_norm": 1.7654162645339966, + "learning_rate": 9.38722498225692e-05, + "loss": 0.04772307276725769, + "step": 43190 + }, + { + "epoch": 6.1320085166784954, + "grad_norm": 4.709306240081787, + "learning_rate": 9.38708303761533e-05, + "loss": 0.03134104907512665, + "step": 43200 + }, + { + "epoch": 6.133427963094393, + "grad_norm": 7.34123420715332, + "learning_rate": 9.386941092973741e-05, + "loss": 0.0683401346206665, + 
"step": 43210 + }, + { + "epoch": 6.134847409510291, + "grad_norm": 2.9818601608276367, + "learning_rate": 9.386799148332151e-05, + "loss": 0.037702041864395144, + "step": 43220 + }, + { + "epoch": 6.136266855926189, + "grad_norm": 2.6485135555267334, + "learning_rate": 9.386657203690561e-05, + "loss": 0.03553232550621033, + "step": 43230 + }, + { + "epoch": 6.137686302342087, + "grad_norm": 0.658879280090332, + "learning_rate": 9.386515259048972e-05, + "loss": 0.05370528697967529, + "step": 43240 + }, + { + "epoch": 6.139105748757984, + "grad_norm": 1.2858973741531372, + "learning_rate": 9.386373314407382e-05, + "loss": 0.08532284498214722, + "step": 43250 + }, + { + "epoch": 6.140525195173883, + "grad_norm": 1.1662542819976807, + "learning_rate": 9.386231369765793e-05, + "loss": 0.035893863439559935, + "step": 43260 + }, + { + "epoch": 6.14194464158978, + "grad_norm": 0.8113408088684082, + "learning_rate": 9.386089425124201e-05, + "loss": 0.05194441676139831, + "step": 43270 + }, + { + "epoch": 6.1433640880056775, + "grad_norm": 1.8177664279937744, + "learning_rate": 9.385947480482612e-05, + "loss": 0.06852318644523621, + "step": 43280 + }, + { + "epoch": 6.144783534421576, + "grad_norm": 10.182692527770996, + "learning_rate": 9.385805535841022e-05, + "loss": 0.06881612539291382, + "step": 43290 + }, + { + "epoch": 6.146202980837473, + "grad_norm": 0.47980985045433044, + "learning_rate": 9.385663591199433e-05, + "loss": 0.044000831246376035, + "step": 43300 + }, + { + "epoch": 6.1476224272533715, + "grad_norm": 0.38547614216804504, + "learning_rate": 9.385521646557843e-05, + "loss": 0.030921560525894166, + "step": 43310 + }, + { + "epoch": 6.149041873669269, + "grad_norm": 4.417194843292236, + "learning_rate": 9.385379701916253e-05, + "loss": 0.02149178385734558, + "step": 43320 + }, + { + "epoch": 6.150461320085167, + "grad_norm": 3.865617513656616, + "learning_rate": 9.385237757274664e-05, + "loss": 0.01396312564611435, + "step": 43330 + }, + { + "epoch": 
6.151880766501065, + "grad_norm": 0.28896984457969666, + "learning_rate": 9.385095812633073e-05, + "loss": 0.059472233057022095, + "step": 43340 + }, + { + "epoch": 6.153300212916962, + "grad_norm": 0.8453249931335449, + "learning_rate": 9.384953867991484e-05, + "loss": 0.059427005052566526, + "step": 43350 + }, + { + "epoch": 6.15471965933286, + "grad_norm": 0.9897168278694153, + "learning_rate": 9.384811923349894e-05, + "loss": 0.047054699063301085, + "step": 43360 + }, + { + "epoch": 6.156139105748758, + "grad_norm": 10.11900806427002, + "learning_rate": 9.384669978708304e-05, + "loss": 0.06199625730514526, + "step": 43370 + }, + { + "epoch": 6.157558552164656, + "grad_norm": 6.863821029663086, + "learning_rate": 9.384528034066714e-05, + "loss": 0.03126347362995148, + "step": 43380 + }, + { + "epoch": 6.1589779985805535, + "grad_norm": 0.7479419112205505, + "learning_rate": 9.384386089425125e-05, + "loss": 0.022752903401851654, + "step": 43390 + }, + { + "epoch": 6.160397444996452, + "grad_norm": 7.677117347717285, + "learning_rate": 9.384244144783535e-05, + "loss": 0.06328274011611938, + "step": 43400 + }, + { + "epoch": 6.161816891412349, + "grad_norm": 0.43821343779563904, + "learning_rate": 9.384102200141946e-05, + "loss": 0.011660891026258469, + "step": 43410 + }, + { + "epoch": 6.163236337828247, + "grad_norm": 0.12804394960403442, + "learning_rate": 9.383960255500355e-05, + "loss": 0.11909148693084717, + "step": 43420 + }, + { + "epoch": 6.164655784244145, + "grad_norm": 2.8437774181365967, + "learning_rate": 9.383818310858765e-05, + "loss": 0.0267734169960022, + "step": 43430 + }, + { + "epoch": 6.166075230660042, + "grad_norm": 0.17364199459552765, + "learning_rate": 9.383676366217176e-05, + "loss": 0.014441606402397156, + "step": 43440 + }, + { + "epoch": 6.167494677075941, + "grad_norm": 5.291993618011475, + "learning_rate": 9.383534421575586e-05, + "loss": 0.04625110328197479, + "step": 43450 + }, + { + "epoch": 6.168914123491838, + "grad_norm": 
1.9384088516235352, + "learning_rate": 9.383392476933997e-05, + "loss": 0.03442354500293732, + "step": 43460 + }, + { + "epoch": 6.170333569907736, + "grad_norm": 5.928829669952393, + "learning_rate": 9.383250532292407e-05, + "loss": 0.052574223279953, + "step": 43470 + }, + { + "epoch": 6.171753016323634, + "grad_norm": 0.3862343728542328, + "learning_rate": 9.383108587650816e-05, + "loss": 0.10290310382843018, + "step": 43480 + }, + { + "epoch": 6.173172462739531, + "grad_norm": 4.460153102874756, + "learning_rate": 9.382966643009226e-05, + "loss": 0.04883348345756531, + "step": 43490 + }, + { + "epoch": 6.1745919091554295, + "grad_norm": 0.46098002791404724, + "learning_rate": 9.382824698367637e-05, + "loss": 0.024615487456321715, + "step": 43500 + }, + { + "epoch": 6.1745919091554295, + "eval_accuracy": 0.9751382971959052, + "eval_loss": 0.07891522347927094, + "eval_runtime": 32.9149, + "eval_samples_per_second": 477.808, + "eval_steps_per_second": 14.948, + "step": 43500 + }, + { + "epoch": 6.176011355571327, + "grad_norm": 7.657423496246338, + "learning_rate": 9.382682753726047e-05, + "loss": 0.02250729650259018, + "step": 43510 + }, + { + "epoch": 6.177430801987225, + "grad_norm": 3.3920164108276367, + "learning_rate": 9.382540809084458e-05, + "loss": 0.05920064449310303, + "step": 43520 + }, + { + "epoch": 6.178850248403123, + "grad_norm": 5.01832914352417, + "learning_rate": 9.382398864442868e-05, + "loss": 0.06701637506484985, + "step": 43530 + }, + { + "epoch": 6.180269694819021, + "grad_norm": 13.015891075134277, + "learning_rate": 9.382256919801278e-05, + "loss": 0.07436400651931763, + "step": 43540 + }, + { + "epoch": 6.181689141234918, + "grad_norm": 0.7317226529121399, + "learning_rate": 9.382114975159689e-05, + "loss": 0.0654987096786499, + "step": 43550 + }, + { + "epoch": 6.183108587650816, + "grad_norm": 2.2464566230773926, + "learning_rate": 9.381973030518098e-05, + "loss": 0.056348496675491334, + "step": 43560 + }, + { + "epoch": 
6.184528034066714, + "grad_norm": 3.364604949951172, + "learning_rate": 9.38183108587651e-05, + "loss": 0.054125458002090454, + "step": 43570 + }, + { + "epoch": 6.185947480482612, + "grad_norm": 0.46684959530830383, + "learning_rate": 9.381689141234918e-05, + "loss": 0.06286740899085999, + "step": 43580 + }, + { + "epoch": 6.18736692689851, + "grad_norm": 1.326682209968567, + "learning_rate": 9.381547196593329e-05, + "loss": 0.09077779650688171, + "step": 43590 + }, + { + "epoch": 6.188786373314407, + "grad_norm": 5.395484447479248, + "learning_rate": 9.381405251951739e-05, + "loss": 0.0627810776233673, + "step": 43600 + }, + { + "epoch": 6.190205819730306, + "grad_norm": 0.3197677433490753, + "learning_rate": 9.38126330731015e-05, + "loss": 0.07363017201423645, + "step": 43610 + }, + { + "epoch": 6.191625266146203, + "grad_norm": 0.048317801207304, + "learning_rate": 9.38112136266856e-05, + "loss": 0.04617903530597687, + "step": 43620 + }, + { + "epoch": 6.1930447125621, + "grad_norm": 0.30027082562446594, + "learning_rate": 9.38097941802697e-05, + "loss": 0.04771397709846496, + "step": 43630 + }, + { + "epoch": 6.194464158977999, + "grad_norm": 1.9492099285125732, + "learning_rate": 9.38083747338538e-05, + "loss": 0.05550530552864075, + "step": 43640 + }, + { + "epoch": 6.195883605393896, + "grad_norm": 0.49073126912117004, + "learning_rate": 9.38069552874379e-05, + "loss": 0.03370376825332642, + "step": 43650 + }, + { + "epoch": 6.1973030518097945, + "grad_norm": 4.336076736450195, + "learning_rate": 9.380553584102201e-05, + "loss": 0.04263518452644348, + "step": 43660 + }, + { + "epoch": 6.198722498225692, + "grad_norm": 1.0135289430618286, + "learning_rate": 9.380411639460611e-05, + "loss": 0.010483792424201966, + "step": 43670 + }, + { + "epoch": 6.20014194464159, + "grad_norm": 0.6504700183868408, + "learning_rate": 9.380269694819021e-05, + "loss": 0.10248892307281494, + "step": 43680 + }, + { + "epoch": 6.201561391057488, + "grad_norm": 0.7889754176139832, 
+ "learning_rate": 9.38012775017743e-05, + "loss": 0.02525465190410614, + "step": 43690 + }, + { + "epoch": 6.202980837473385, + "grad_norm": 11.922003746032715, + "learning_rate": 9.379985805535842e-05, + "loss": 0.12885262966156005, + "step": 43700 + }, + { + "epoch": 6.204400283889283, + "grad_norm": 8.441861152648926, + "learning_rate": 9.379843860894251e-05, + "loss": 0.062443208694458005, + "step": 43710 + }, + { + "epoch": 6.205819730305181, + "grad_norm": 0.432558536529541, + "learning_rate": 9.379701916252662e-05, + "loss": 0.016783684492111206, + "step": 43720 + }, + { + "epoch": 6.207239176721079, + "grad_norm": 3.7388827800750732, + "learning_rate": 9.379559971611072e-05, + "loss": 0.04627739787101746, + "step": 43730 + }, + { + "epoch": 6.2086586231369765, + "grad_norm": 5.352664470672607, + "learning_rate": 9.379418026969482e-05, + "loss": 0.04351229965686798, + "step": 43740 + }, + { + "epoch": 6.210078069552875, + "grad_norm": 0.5855856537818909, + "learning_rate": 9.379276082327893e-05, + "loss": 0.05773522257804871, + "step": 43750 + }, + { + "epoch": 6.211497515968772, + "grad_norm": 0.16541746258735657, + "learning_rate": 9.379134137686303e-05, + "loss": 0.04460527896881104, + "step": 43760 + }, + { + "epoch": 6.21291696238467, + "grad_norm": 0.3547366261482239, + "learning_rate": 9.378992193044714e-05, + "loss": 0.1184334397315979, + "step": 43770 + }, + { + "epoch": 6.214336408800568, + "grad_norm": 2.3118815422058105, + "learning_rate": 9.378850248403124e-05, + "loss": 0.05782003998756409, + "step": 43780 + }, + { + "epoch": 6.215755855216465, + "grad_norm": 0.15461857616901398, + "learning_rate": 9.378708303761533e-05, + "loss": 0.07659928202629089, + "step": 43790 + }, + { + "epoch": 6.217175301632364, + "grad_norm": 2.9949324131011963, + "learning_rate": 9.378566359119943e-05, + "loss": 0.06376568078994752, + "step": 43800 + }, + { + "epoch": 6.218594748048261, + "grad_norm": 0.9916458129882812, + "learning_rate": 9.378424414478354e-05, + 
"loss": 0.018292531371116638, + "step": 43810 + }, + { + "epoch": 6.220014194464159, + "grad_norm": 0.48116499185562134, + "learning_rate": 9.378282469836764e-05, + "loss": 0.024942028522491454, + "step": 43820 + }, + { + "epoch": 6.221433640880057, + "grad_norm": 1.3514341115951538, + "learning_rate": 9.378140525195175e-05, + "loss": 0.09017609357833863, + "step": 43830 + }, + { + "epoch": 6.222853087295954, + "grad_norm": 1.7869921922683716, + "learning_rate": 9.377998580553585e-05, + "loss": 0.06085496544837952, + "step": 43840 + }, + { + "epoch": 6.2242725337118525, + "grad_norm": 1.120748519897461, + "learning_rate": 9.377856635911994e-05, + "loss": 0.08365220427513123, + "step": 43850 + }, + { + "epoch": 6.22569198012775, + "grad_norm": 4.36846399307251, + "learning_rate": 9.377714691270405e-05, + "loss": 0.03777306079864502, + "step": 43860 + }, + { + "epoch": 6.227111426543648, + "grad_norm": 2.9369184970855713, + "learning_rate": 9.377572746628815e-05, + "loss": 0.08006559610366822, + "step": 43870 + }, + { + "epoch": 6.228530872959546, + "grad_norm": 0.3148958384990692, + "learning_rate": 9.377430801987226e-05, + "loss": 0.0385653018951416, + "step": 43880 + }, + { + "epoch": 6.229950319375444, + "grad_norm": 1.3210095167160034, + "learning_rate": 9.377288857345635e-05, + "loss": 0.033772128820419314, + "step": 43890 + }, + { + "epoch": 6.231369765791341, + "grad_norm": 2.804184675216675, + "learning_rate": 9.377146912704046e-05, + "loss": 0.0412954181432724, + "step": 43900 + }, + { + "epoch": 6.232789212207239, + "grad_norm": 0.18998447060585022, + "learning_rate": 9.377004968062456e-05, + "loss": 0.020071253180503845, + "step": 43910 + }, + { + "epoch": 6.234208658623137, + "grad_norm": 0.3029160499572754, + "learning_rate": 9.376863023420867e-05, + "loss": 0.06371122598648071, + "step": 43920 + }, + { + "epoch": 6.2356281050390345, + "grad_norm": 9.19083023071289, + "learning_rate": 9.376721078779276e-05, + "loss": 0.03808676302433014, + "step": 43930 
+ }, + { + "epoch": 6.237047551454933, + "grad_norm": 0.2272202968597412, + "learning_rate": 9.376579134137686e-05, + "loss": 0.04455348253250122, + "step": 43940 + }, + { + "epoch": 6.23846699787083, + "grad_norm": 0.23112483322620392, + "learning_rate": 9.376437189496097e-05, + "loss": 0.01939200460910797, + "step": 43950 + }, + { + "epoch": 6.239886444286729, + "grad_norm": 0.1665191948413849, + "learning_rate": 9.376295244854507e-05, + "loss": 0.03822802007198334, + "step": 43960 + }, + { + "epoch": 6.241305890702626, + "grad_norm": 11.432101249694824, + "learning_rate": 9.376153300212918e-05, + "loss": 0.10485298633575439, + "step": 43970 + }, + { + "epoch": 6.242725337118523, + "grad_norm": 1.8148565292358398, + "learning_rate": 9.376011355571328e-05, + "loss": 0.09240283370018006, + "step": 43980 + }, + { + "epoch": 6.244144783534422, + "grad_norm": 0.7837454676628113, + "learning_rate": 9.375869410929738e-05, + "loss": 0.05253263115882874, + "step": 43990 + }, + { + "epoch": 6.245564229950319, + "grad_norm": 5.624810695648193, + "learning_rate": 9.375727466288147e-05, + "loss": 0.049315616488456726, + "step": 44000 + }, + { + "epoch": 6.245564229950319, + "eval_accuracy": 0.9733579195014942, + "eval_loss": 0.0813438892364502, + "eval_runtime": 32.0503, + "eval_samples_per_second": 490.697, + "eval_steps_per_second": 15.351, + "step": 44000 + }, + { + "epoch": 6.246983676366217, + "grad_norm": 5.86858606338501, + "learning_rate": 9.375585521646558e-05, + "loss": 0.029601067304611206, + "step": 44010 + }, + { + "epoch": 6.248403122782115, + "grad_norm": 1.0253491401672363, + "learning_rate": 9.375443577004968e-05, + "loss": 0.05629914402961731, + "step": 44020 + }, + { + "epoch": 6.249822569198013, + "grad_norm": 0.475136399269104, + "learning_rate": 9.375301632363379e-05, + "loss": 0.038224822282791136, + "step": 44030 + }, + { + "epoch": 6.251242015613911, + "grad_norm": 10.579747200012207, + "learning_rate": 9.375159687721789e-05, + "loss": 
0.08462954759597778, + "step": 44040 + }, + { + "epoch": 6.252661462029808, + "grad_norm": 1.7066859006881714, + "learning_rate": 9.375017743080199e-05, + "loss": 0.07730778455734252, + "step": 44050 + }, + { + "epoch": 6.254080908445706, + "grad_norm": 0.373104065656662, + "learning_rate": 9.37487579843861e-05, + "loss": 0.04493587613105774, + "step": 44060 + }, + { + "epoch": 6.255500354861604, + "grad_norm": 2.209771156311035, + "learning_rate": 9.37473385379702e-05, + "loss": 0.031600701808929446, + "step": 44070 + }, + { + "epoch": 6.256919801277502, + "grad_norm": 1.3299434185028076, + "learning_rate": 9.37459190915543e-05, + "loss": 0.04354170858860016, + "step": 44080 + }, + { + "epoch": 6.258339247693399, + "grad_norm": 3.383254289627075, + "learning_rate": 9.374449964513839e-05, + "loss": 0.03865576386451721, + "step": 44090 + }, + { + "epoch": 6.259758694109298, + "grad_norm": 0.8928094506263733, + "learning_rate": 9.37430801987225e-05, + "loss": 0.11763886213302613, + "step": 44100 + }, + { + "epoch": 6.261178140525195, + "grad_norm": 0.7259202003479004, + "learning_rate": 9.37416607523066e-05, + "loss": 0.07595643401145935, + "step": 44110 + }, + { + "epoch": 6.262597586941093, + "grad_norm": 10.287557601928711, + "learning_rate": 9.374024130589071e-05, + "loss": 0.026817291975021362, + "step": 44120 + }, + { + "epoch": 6.264017033356991, + "grad_norm": 6.837195873260498, + "learning_rate": 9.37388218594748e-05, + "loss": 0.045735102891921994, + "step": 44130 + }, + { + "epoch": 6.265436479772888, + "grad_norm": 2.9552390575408936, + "learning_rate": 9.373740241305892e-05, + "loss": 0.06823272109031678, + "step": 44140 + }, + { + "epoch": 6.266855926188787, + "grad_norm": 7.999316692352295, + "learning_rate": 9.373598296664301e-05, + "loss": 0.08050004243850709, + "step": 44150 + }, + { + "epoch": 6.268275372604684, + "grad_norm": 6.604404926300049, + "learning_rate": 9.373456352022711e-05, + "loss": 0.04037375450134277, + "step": 44160 + }, + { + 
"epoch": 6.269694819020582, + "grad_norm": 0.303448885679245, + "learning_rate": 9.373314407381122e-05, + "loss": 0.06240311861038208, + "step": 44170 + }, + { + "epoch": 6.27111426543648, + "grad_norm": 0.028919706121087074, + "learning_rate": 9.373172462739532e-05, + "loss": 0.01785750240087509, + "step": 44180 + }, + { + "epoch": 6.272533711852377, + "grad_norm": 2.20166277885437, + "learning_rate": 9.373030518097943e-05, + "loss": 0.06402125954627991, + "step": 44190 + }, + { + "epoch": 6.2739531582682755, + "grad_norm": 9.297113418579102, + "learning_rate": 9.372888573456351e-05, + "loss": 0.031133198738098146, + "step": 44200 + }, + { + "epoch": 6.275372604684173, + "grad_norm": 2.6971702575683594, + "learning_rate": 9.372746628814763e-05, + "loss": 0.02306177020072937, + "step": 44210 + }, + { + "epoch": 6.276792051100071, + "grad_norm": 3.3070108890533447, + "learning_rate": 9.372604684173172e-05, + "loss": 0.08548479676246643, + "step": 44220 + }, + { + "epoch": 6.278211497515969, + "grad_norm": 7.981506824493408, + "learning_rate": 9.372462739531583e-05, + "loss": 0.013684302568435669, + "step": 44230 + }, + { + "epoch": 6.279630943931867, + "grad_norm": 8.1182222366333, + "learning_rate": 9.372320794889993e-05, + "loss": 0.03548404574394226, + "step": 44240 + }, + { + "epoch": 6.281050390347764, + "grad_norm": 0.4073677957057953, + "learning_rate": 9.372178850248403e-05, + "loss": 0.07523361444473267, + "step": 44250 + }, + { + "epoch": 6.282469836763662, + "grad_norm": 1.4130693674087524, + "learning_rate": 9.372036905606814e-05, + "loss": 0.06159330606460571, + "step": 44260 + }, + { + "epoch": 6.28388928317956, + "grad_norm": 0.19895227253437042, + "learning_rate": 9.371894960965224e-05, + "loss": 0.03507781326770783, + "step": 44270 + }, + { + "epoch": 6.2853087295954575, + "grad_norm": 0.9761980175971985, + "learning_rate": 9.371753016323635e-05, + "loss": 0.04177262783050537, + "step": 44280 + }, + { + "epoch": 6.286728176011356, + "grad_norm": 
1.4593634605407715, + "learning_rate": 9.371611071682045e-05, + "loss": 0.02598634660243988, + "step": 44290 + }, + { + "epoch": 6.288147622427253, + "grad_norm": 0.16853317618370056, + "learning_rate": 9.371469127040454e-05, + "loss": 0.030664128065109254, + "step": 44300 + }, + { + "epoch": 6.2895670688431515, + "grad_norm": 5.152888774871826, + "learning_rate": 9.371327182398864e-05, + "loss": 0.026159942150115967, + "step": 44310 + }, + { + "epoch": 6.290986515259049, + "grad_norm": 0.22693578898906708, + "learning_rate": 9.371185237757275e-05, + "loss": 0.011505614221096038, + "step": 44320 + }, + { + "epoch": 6.292405961674946, + "grad_norm": 0.12040051072835922, + "learning_rate": 9.371043293115685e-05, + "loss": 0.063713937997818, + "step": 44330 + }, + { + "epoch": 6.293825408090845, + "grad_norm": 8.595141410827637, + "learning_rate": 9.370901348474096e-05, + "loss": 0.052627182006835936, + "step": 44340 + }, + { + "epoch": 6.295244854506742, + "grad_norm": 0.4570486545562744, + "learning_rate": 9.370759403832506e-05, + "loss": 0.05868352651596069, + "step": 44350 + }, + { + "epoch": 6.29666430092264, + "grad_norm": 10.074518203735352, + "learning_rate": 9.370617459190915e-05, + "loss": 0.0731860876083374, + "step": 44360 + }, + { + "epoch": 6.298083747338538, + "grad_norm": 5.729341506958008, + "learning_rate": 9.370475514549327e-05, + "loss": 0.04211472272872925, + "step": 44370 + }, + { + "epoch": 6.299503193754436, + "grad_norm": 7.382556438446045, + "learning_rate": 9.370333569907736e-05, + "loss": 0.041568410396575925, + "step": 44380 + }, + { + "epoch": 6.3009226401703335, + "grad_norm": 7.665040016174316, + "learning_rate": 9.370191625266147e-05, + "loss": 0.08681845664978027, + "step": 44390 + }, + { + "epoch": 6.302342086586231, + "grad_norm": 5.031284332275391, + "learning_rate": 9.370049680624556e-05, + "loss": 0.06689251661300659, + "step": 44400 + }, + { + "epoch": 6.303761533002129, + "grad_norm": 8.009147644042969, + "learning_rate": 
9.369907735982967e-05, + "loss": 0.05348163843154907, + "step": 44410 + }, + { + "epoch": 6.305180979418027, + "grad_norm": 0.7823144793510437, + "learning_rate": 9.369765791341377e-05, + "loss": 0.025225034356117247, + "step": 44420 + }, + { + "epoch": 6.306600425833925, + "grad_norm": 6.241203784942627, + "learning_rate": 9.369623846699788e-05, + "loss": 0.055314040184020995, + "step": 44430 + }, + { + "epoch": 6.308019872249822, + "grad_norm": 0.48145613074302673, + "learning_rate": 9.369481902058199e-05, + "loss": 0.046424245834350585, + "step": 44440 + }, + { + "epoch": 6.309439318665721, + "grad_norm": 8.973401069641113, + "learning_rate": 9.369339957416608e-05, + "loss": 0.05323241949081421, + "step": 44450 + }, + { + "epoch": 6.310858765081618, + "grad_norm": 10.027241706848145, + "learning_rate": 9.369198012775018e-05, + "loss": 0.027989843487739564, + "step": 44460 + }, + { + "epoch": 6.312278211497516, + "grad_norm": 0.4566883146762848, + "learning_rate": 9.369056068133428e-05, + "loss": 0.010857632756233216, + "step": 44470 + }, + { + "epoch": 6.313697657913414, + "grad_norm": 3.899604320526123, + "learning_rate": 9.368914123491839e-05, + "loss": 0.03426201343536377, + "step": 44480 + }, + { + "epoch": 6.315117104329311, + "grad_norm": 0.21963563561439514, + "learning_rate": 9.368772178850249e-05, + "loss": 0.021139997243881225, + "step": 44490 + }, + { + "epoch": 6.31653655074521, + "grad_norm": 0.3161865472793579, + "learning_rate": 9.36863023420866e-05, + "loss": 0.06483102440834046, + "step": 44500 + }, + { + "epoch": 6.31653655074521, + "eval_accuracy": 0.9754562217841928, + "eval_loss": 0.07596415281295776, + "eval_runtime": 32.4521, + "eval_samples_per_second": 484.622, + "eval_steps_per_second": 15.161, + "step": 44500 + }, + { + "epoch": 6.317955997161107, + "grad_norm": 3.4797637462615967, + "learning_rate": 9.368488289567068e-05, + "loss": 0.046000164747238156, + "step": 44510 + }, + { + "epoch": 6.319375443577005, + "grad_norm": 
3.0673649311065674, + "learning_rate": 9.36834634492548e-05, + "loss": 0.06700726747512817, + "step": 44520 + }, + { + "epoch": 6.320794889992903, + "grad_norm": 0.4138965308666229, + "learning_rate": 9.36820440028389e-05, + "loss": 0.03780338764190674, + "step": 44530 + }, + { + "epoch": 6.3222143364088, + "grad_norm": 0.016734696924686432, + "learning_rate": 9.3680624556423e-05, + "loss": 0.059049326181411746, + "step": 44540 + }, + { + "epoch": 6.3236337828246985, + "grad_norm": 4.171092987060547, + "learning_rate": 9.367920511000711e-05, + "loss": 0.021105588972568513, + "step": 44550 + }, + { + "epoch": 6.325053229240596, + "grad_norm": 0.40709081292152405, + "learning_rate": 9.36777856635912e-05, + "loss": 0.031069639325141906, + "step": 44560 + }, + { + "epoch": 6.326472675656494, + "grad_norm": 1.556479811668396, + "learning_rate": 9.367636621717531e-05, + "loss": 0.022256243228912353, + "step": 44570 + }, + { + "epoch": 6.327892122072392, + "grad_norm": 0.566481351852417, + "learning_rate": 9.36749467707594e-05, + "loss": 0.06047871708869934, + "step": 44580 + }, + { + "epoch": 6.32931156848829, + "grad_norm": 0.029307467862963676, + "learning_rate": 9.367352732434352e-05, + "loss": 0.0970751941204071, + "step": 44590 + }, + { + "epoch": 6.330731014904187, + "grad_norm": 0.20486246049404144, + "learning_rate": 9.367210787792761e-05, + "loss": 0.015537744760513306, + "step": 44600 + }, + { + "epoch": 6.332150461320085, + "grad_norm": 7.652834415435791, + "learning_rate": 9.367068843151171e-05, + "loss": 0.031410837173461915, + "step": 44610 + }, + { + "epoch": 6.333569907735983, + "grad_norm": 0.8402873277664185, + "learning_rate": 9.366926898509582e-05, + "loss": 0.02272775173187256, + "step": 44620 + }, + { + "epoch": 6.3349893541518805, + "grad_norm": 0.1902949959039688, + "learning_rate": 9.366784953867992e-05, + "loss": 0.06183580756187439, + "step": 44630 + }, + { + "epoch": 6.336408800567779, + "grad_norm": 1.0105608701705933, + "learning_rate": 
9.366643009226403e-05, + "loss": 0.027782031893730165, + "step": 44640 + }, + { + "epoch": 6.337828246983676, + "grad_norm": 2.3518152236938477, + "learning_rate": 9.366501064584813e-05, + "loss": 0.05995446443557739, + "step": 44650 + }, + { + "epoch": 6.3392476933995745, + "grad_norm": 0.39905598759651184, + "learning_rate": 9.366359119943222e-05, + "loss": 0.061394399404525755, + "step": 44660 + }, + { + "epoch": 6.340667139815472, + "grad_norm": 7.838604927062988, + "learning_rate": 9.366217175301632e-05, + "loss": 0.06700649261474609, + "step": 44670 + }, + { + "epoch": 6.342086586231369, + "grad_norm": 0.11704706400632858, + "learning_rate": 9.366075230660043e-05, + "loss": 0.03449685275554657, + "step": 44680 + }, + { + "epoch": 6.343506032647268, + "grad_norm": 1.6874034404754639, + "learning_rate": 9.365933286018453e-05, + "loss": 0.08206239938735962, + "step": 44690 + }, + { + "epoch": 6.344925479063165, + "grad_norm": 7.237381935119629, + "learning_rate": 9.365791341376864e-05, + "loss": 0.08659184575080872, + "step": 44700 + }, + { + "epoch": 6.346344925479063, + "grad_norm": 2.569840908050537, + "learning_rate": 9.365649396735274e-05, + "loss": 0.10804457664489746, + "step": 44710 + }, + { + "epoch": 6.347764371894961, + "grad_norm": 12.174409866333008, + "learning_rate": 9.365507452093684e-05, + "loss": 0.08026717901229859, + "step": 44720 + }, + { + "epoch": 6.349183818310859, + "grad_norm": 0.8195727467536926, + "learning_rate": 9.365365507452095e-05, + "loss": 0.024174678325653075, + "step": 44730 + }, + { + "epoch": 6.3506032647267565, + "grad_norm": 1.0109519958496094, + "learning_rate": 9.365223562810504e-05, + "loss": 0.08143213391304016, + "step": 44740 + }, + { + "epoch": 6.352022711142654, + "grad_norm": 6.605328559875488, + "learning_rate": 9.365081618168916e-05, + "loss": 0.03471195697784424, + "step": 44750 + }, + { + "epoch": 6.353442157558552, + "grad_norm": 0.9406673908233643, + "learning_rate": 9.364939673527324e-05, + "loss": 
0.0571575939655304, + "step": 44760 + }, + { + "epoch": 6.35486160397445, + "grad_norm": 0.23806260526180267, + "learning_rate": 9.364797728885735e-05, + "loss": 0.04274870157241821, + "step": 44770 + }, + { + "epoch": 6.356281050390348, + "grad_norm": 1.739245057106018, + "learning_rate": 9.364655784244145e-05, + "loss": 0.033979329466819766, + "step": 44780 + }, + { + "epoch": 6.357700496806245, + "grad_norm": 2.6521658897399902, + "learning_rate": 9.364513839602556e-05, + "loss": 0.0800173044204712, + "step": 44790 + }, + { + "epoch": 6.359119943222144, + "grad_norm": 9.809504508972168, + "learning_rate": 9.364371894960966e-05, + "loss": 0.11794828176498413, + "step": 44800 + }, + { + "epoch": 6.360539389638041, + "grad_norm": 0.32209938764572144, + "learning_rate": 9.364229950319377e-05, + "loss": 0.037167853116989134, + "step": 44810 + }, + { + "epoch": 6.3619588360539385, + "grad_norm": 1.5642523765563965, + "learning_rate": 9.364088005677786e-05, + "loss": 0.02572680115699768, + "step": 44820 + }, + { + "epoch": 6.363378282469837, + "grad_norm": 0.2237132340669632, + "learning_rate": 9.363946061036196e-05, + "loss": 0.022832623124122618, + "step": 44830 + }, + { + "epoch": 6.364797728885734, + "grad_norm": 2.831024646759033, + "learning_rate": 9.363804116394607e-05, + "loss": 0.05402438640594483, + "step": 44840 + }, + { + "epoch": 6.366217175301633, + "grad_norm": 5.034896373748779, + "learning_rate": 9.363662171753017e-05, + "loss": 0.07991594672203065, + "step": 44850 + }, + { + "epoch": 6.36763662171753, + "grad_norm": 1.0471001863479614, + "learning_rate": 9.363520227111428e-05, + "loss": 0.01694260984659195, + "step": 44860 + }, + { + "epoch": 6.369056068133428, + "grad_norm": 4.997382164001465, + "learning_rate": 9.363378282469836e-05, + "loss": 0.04651702046394348, + "step": 44870 + }, + { + "epoch": 6.370475514549326, + "grad_norm": 3.6034419536590576, + "learning_rate": 9.363236337828248e-05, + "loss": 0.029604411125183104, + "step": 44880 + }, + { 
+ "epoch": 6.371894960965223, + "grad_norm": 5.520668029785156, + "learning_rate": 9.363094393186657e-05, + "loss": 0.05422980189323425, + "step": 44890 + }, + { + "epoch": 6.373314407381121, + "grad_norm": 4.492794990539551, + "learning_rate": 9.362952448545068e-05, + "loss": 0.057867521047592164, + "step": 44900 + }, + { + "epoch": 6.374733853797019, + "grad_norm": 9.890128135681152, + "learning_rate": 9.362810503903478e-05, + "loss": 0.048404908180236815, + "step": 44910 + }, + { + "epoch": 6.376153300212917, + "grad_norm": 1.9544626474380493, + "learning_rate": 9.362668559261888e-05, + "loss": 0.052456903457641604, + "step": 44920 + }, + { + "epoch": 6.377572746628815, + "grad_norm": 0.8930409550666809, + "learning_rate": 9.362526614620299e-05, + "loss": 0.04241478145122528, + "step": 44930 + }, + { + "epoch": 6.378992193044713, + "grad_norm": 8.498833656311035, + "learning_rate": 9.362384669978709e-05, + "loss": 0.023489537835121154, + "step": 44940 + }, + { + "epoch": 6.38041163946061, + "grad_norm": 1.9117820262908936, + "learning_rate": 9.36224272533712e-05, + "loss": 0.050252276659011844, + "step": 44950 + }, + { + "epoch": 6.381831085876508, + "grad_norm": 0.12219048291444778, + "learning_rate": 9.36210078069553e-05, + "loss": 0.031068319082260133, + "step": 44960 + }, + { + "epoch": 6.383250532292406, + "grad_norm": 1.1104416847229004, + "learning_rate": 9.361958836053939e-05, + "loss": 0.034421283006668094, + "step": 44970 + }, + { + "epoch": 6.384669978708303, + "grad_norm": 5.63448429107666, + "learning_rate": 9.361816891412349e-05, + "loss": 0.06306658387184143, + "step": 44980 + }, + { + "epoch": 6.386089425124202, + "grad_norm": 0.5965878963470459, + "learning_rate": 9.36167494677076e-05, + "loss": 0.09757879376411438, + "step": 44990 + }, + { + "epoch": 6.387508871540099, + "grad_norm": 7.923148155212402, + "learning_rate": 9.36153300212917e-05, + "loss": 0.02036636769771576, + "step": 45000 + }, + { + "epoch": 6.387508871540099, + 
"eval_accuracy": 0.9795892414319324, + "eval_loss": 0.05749217048287392, + "eval_runtime": 32.5022, + "eval_samples_per_second": 483.874, + "eval_steps_per_second": 15.137, + "step": 45000 + }, + { + "epoch": 6.3889283179559975, + "grad_norm": 9.941558837890625, + "learning_rate": 9.361391057487581e-05, + "loss": 0.05102572441101074, + "step": 45010 + }, + { + "epoch": 6.390347764371895, + "grad_norm": 3.2416279315948486, + "learning_rate": 9.36124911284599e-05, + "loss": 0.07383667826652526, + "step": 45020 + }, + { + "epoch": 6.391767210787792, + "grad_norm": 5.702998638153076, + "learning_rate": 9.3611071682044e-05, + "loss": 0.028665339946746825, + "step": 45030 + }, + { + "epoch": 6.393186657203691, + "grad_norm": 0.4869743883609772, + "learning_rate": 9.360965223562811e-05, + "loss": 0.037399545311927795, + "step": 45040 + }, + { + "epoch": 6.394606103619588, + "grad_norm": 1.0424598455429077, + "learning_rate": 9.360823278921221e-05, + "loss": 0.04913428127765655, + "step": 45050 + }, + { + "epoch": 6.396025550035486, + "grad_norm": 4.111805438995361, + "learning_rate": 9.360681334279632e-05, + "loss": 0.019465672969818115, + "step": 45060 + }, + { + "epoch": 6.397444996451384, + "grad_norm": 0.8496167659759521, + "learning_rate": 9.360553584102201e-05, + "loss": 0.08575330376625061, + "step": 45070 + }, + { + "epoch": 6.398864442867282, + "grad_norm": 2.862640857696533, + "learning_rate": 9.36041163946061e-05, + "loss": 0.022041980922222138, + "step": 45080 + }, + { + "epoch": 6.4002838892831795, + "grad_norm": 0.3247391879558563, + "learning_rate": 9.36026969481902e-05, + "loss": 0.040409648418426515, + "step": 45090 + }, + { + "epoch": 6.401703335699077, + "grad_norm": 4.703793525695801, + "learning_rate": 9.360127750177431e-05, + "loss": 0.023367772996425628, + "step": 45100 + }, + { + "epoch": 6.403122782114975, + "grad_norm": 0.1681506484746933, + "learning_rate": 9.359985805535841e-05, + "loss": 0.024120521545410157, + "step": 45110 + }, + { + 
"epoch": 6.404542228530873, + "grad_norm": 5.247531890869141, + "learning_rate": 9.359843860894252e-05, + "loss": 0.028122204542160033, + "step": 45120 + }, + { + "epoch": 6.405961674946771, + "grad_norm": 0.015556308440864086, + "learning_rate": 9.359701916252662e-05, + "loss": 0.014227265119552612, + "step": 45130 + }, + { + "epoch": 6.407381121362668, + "grad_norm": 3.447943687438965, + "learning_rate": 9.359559971611073e-05, + "loss": 0.05484868288040161, + "step": 45140 + }, + { + "epoch": 6.408800567778567, + "grad_norm": 0.591742217540741, + "learning_rate": 9.359418026969481e-05, + "loss": 0.052353084087371826, + "step": 45150 + }, + { + "epoch": 6.410220014194464, + "grad_norm": 1.4653022289276123, + "learning_rate": 9.359276082327892e-05, + "loss": 0.04562208354473114, + "step": 45160 + }, + { + "epoch": 6.4116394606103615, + "grad_norm": 4.289706230163574, + "learning_rate": 9.359134137686302e-05, + "loss": 0.04674837589263916, + "step": 45170 + }, + { + "epoch": 6.41305890702626, + "grad_norm": 7.5176682472229, + "learning_rate": 9.358992193044713e-05, + "loss": 0.048534101247787474, + "step": 45180 + }, + { + "epoch": 6.414478353442157, + "grad_norm": 0.7810305953025818, + "learning_rate": 9.358850248403124e-05, + "loss": 0.00921451896429062, + "step": 45190 + }, + { + "epoch": 6.4158977998580555, + "grad_norm": 1.4035773277282715, + "learning_rate": 9.358708303761533e-05, + "loss": 0.033289432525634766, + "step": 45200 + }, + { + "epoch": 6.417317246273953, + "grad_norm": 1.1093297004699707, + "learning_rate": 9.358566359119944e-05, + "loss": 0.025805479288101195, + "step": 45210 + }, + { + "epoch": 6.418736692689851, + "grad_norm": 0.6371444463729858, + "learning_rate": 9.358424414478354e-05, + "loss": 0.03949221968650818, + "step": 45220 + }, + { + "epoch": 6.420156139105749, + "grad_norm": 5.708765983581543, + "learning_rate": 9.358282469836765e-05, + "loss": 0.04703320562839508, + "step": 45230 + }, + { + "epoch": 6.421575585521646, + "grad_norm": 
7.504692077636719, + "learning_rate": 9.358140525195174e-05, + "loss": 0.027354171872138976, + "step": 45240 + }, + { + "epoch": 6.422995031937544, + "grad_norm": 6.347846984863281, + "learning_rate": 9.357998580553584e-05, + "loss": 0.07207931280136108, + "step": 45250 + }, + { + "epoch": 6.424414478353442, + "grad_norm": 4.845231533050537, + "learning_rate": 9.357856635911994e-05, + "loss": 0.029147011041641236, + "step": 45260 + }, + { + "epoch": 6.42583392476934, + "grad_norm": 2.5489513874053955, + "learning_rate": 9.357714691270405e-05, + "loss": 0.05189456939697266, + "step": 45270 + }, + { + "epoch": 6.4272533711852375, + "grad_norm": 3.6041786670684814, + "learning_rate": 9.357572746628816e-05, + "loss": 0.056130462884902955, + "step": 45280 + }, + { + "epoch": 6.428672817601136, + "grad_norm": 2.5728187561035156, + "learning_rate": 9.357430801987226e-05, + "loss": 0.02223515808582306, + "step": 45290 + }, + { + "epoch": 6.430092264017033, + "grad_norm": 5.534434795379639, + "learning_rate": 9.357288857345636e-05, + "loss": 0.1306079149246216, + "step": 45300 + }, + { + "epoch": 6.431511710432932, + "grad_norm": 2.4969444274902344, + "learning_rate": 9.357146912704045e-05, + "loss": 0.030625393986701964, + "step": 45310 + }, + { + "epoch": 6.432931156848829, + "grad_norm": 0.8693184852600098, + "learning_rate": 9.357004968062456e-05, + "loss": 0.058841168880462646, + "step": 45320 + }, + { + "epoch": 6.434350603264726, + "grad_norm": 0.7950196266174316, + "learning_rate": 9.356863023420866e-05, + "loss": 0.07491456866264343, + "step": 45330 + }, + { + "epoch": 6.435770049680625, + "grad_norm": 8.494572639465332, + "learning_rate": 9.356721078779277e-05, + "loss": 0.05608614683151245, + "step": 45340 + }, + { + "epoch": 6.437189496096522, + "grad_norm": 9.173154830932617, + "learning_rate": 9.356579134137686e-05, + "loss": 0.10091447830200195, + "step": 45350 + }, + { + "epoch": 6.43860894251242, + "grad_norm": 6.76547384262085, + "learning_rate": 
9.356437189496097e-05, + "loss": 0.027798345685005187, + "step": 45360 + }, + { + "epoch": 6.440028388928318, + "grad_norm": 2.0051448345184326, + "learning_rate": 9.356295244854508e-05, + "loss": 0.07456052303314209, + "step": 45370 + }, + { + "epoch": 6.441447835344216, + "grad_norm": 2.5545055866241455, + "learning_rate": 9.356153300212918e-05, + "loss": 0.05242663621902466, + "step": 45380 + }, + { + "epoch": 6.442867281760114, + "grad_norm": 0.8713223338127136, + "learning_rate": 9.356011355571329e-05, + "loss": 0.01972261071205139, + "step": 45390 + }, + { + "epoch": 6.444286728176011, + "grad_norm": 2.4405908584594727, + "learning_rate": 9.355869410929737e-05, + "loss": 0.021471349895000456, + "step": 45400 + }, + { + "epoch": 6.445706174591909, + "grad_norm": 0.43845289945602417, + "learning_rate": 9.355727466288148e-05, + "loss": 0.041665560007095336, + "step": 45410 + }, + { + "epoch": 6.447125621007807, + "grad_norm": 4.887557029724121, + "learning_rate": 9.355585521646558e-05, + "loss": 0.021108832955360413, + "step": 45420 + }, + { + "epoch": 6.448545067423705, + "grad_norm": 13.41622543334961, + "learning_rate": 9.355443577004969e-05, + "loss": 0.030141952633857726, + "step": 45430 + }, + { + "epoch": 6.4499645138396025, + "grad_norm": 3.250065565109253, + "learning_rate": 9.355301632363379e-05, + "loss": 0.021225135028362273, + "step": 45440 + }, + { + "epoch": 6.451383960255501, + "grad_norm": 12.573553085327148, + "learning_rate": 9.355159687721788e-05, + "loss": 0.040314275026321414, + "step": 45450 + }, + { + "epoch": 6.452803406671398, + "grad_norm": 8.960000038146973, + "learning_rate": 9.3550177430802e-05, + "loss": 0.04356703162193298, + "step": 45460 + }, + { + "epoch": 6.454222853087296, + "grad_norm": 7.876071929931641, + "learning_rate": 9.354875798438609e-05, + "loss": 0.0315957635641098, + "step": 45470 + }, + { + "epoch": 6.455642299503194, + "grad_norm": 0.33070969581604004, + "learning_rate": 9.35473385379702e-05, + "loss": 
0.04888508915901184, + "step": 45480 + }, + { + "epoch": 6.457061745919091, + "grad_norm": 7.309976100921631, + "learning_rate": 9.35459190915543e-05, + "loss": 0.03390091061592102, + "step": 45490 + }, + { + "epoch": 6.45848119233499, + "grad_norm": 10.262081146240234, + "learning_rate": 9.354449964513841e-05, + "loss": 0.05524118542671204, + "step": 45500 + }, + { + "epoch": 6.45848119233499, + "eval_accuracy": 0.9663635785591658, + "eval_loss": 0.11470869183540344, + "eval_runtime": 32.1395, + "eval_samples_per_second": 489.336, + "eval_steps_per_second": 15.308, + "step": 45500 + }, + { + "epoch": 6.459900638750887, + "grad_norm": 0.6667046546936035, + "learning_rate": 9.35430801987225e-05, + "loss": 0.027115851640701294, + "step": 45510 + }, + { + "epoch": 6.461320085166785, + "grad_norm": 2.887190580368042, + "learning_rate": 9.35416607523066e-05, + "loss": 0.06599999070167542, + "step": 45520 + }, + { + "epoch": 6.462739531582683, + "grad_norm": 4.560837268829346, + "learning_rate": 9.35402413058907e-05, + "loss": 0.09502153396606446, + "step": 45530 + }, + { + "epoch": 6.46415897799858, + "grad_norm": 3.5318081378936768, + "learning_rate": 9.353882185947481e-05, + "loss": 0.03770635426044464, + "step": 45540 + }, + { + "epoch": 6.4655784244144785, + "grad_norm": 1.6946613788604736, + "learning_rate": 9.353740241305891e-05, + "loss": 0.04129364490509033, + "step": 45550 + }, + { + "epoch": 6.466997870830376, + "grad_norm": 1.8913307189941406, + "learning_rate": 9.353598296664301e-05, + "loss": 0.054656922817230225, + "step": 45560 + }, + { + "epoch": 6.468417317246274, + "grad_norm": 0.312080442905426, + "learning_rate": 9.353456352022712e-05, + "loss": 0.04530891180038452, + "step": 45570 + }, + { + "epoch": 6.469836763662172, + "grad_norm": 5.941243648529053, + "learning_rate": 9.353314407381122e-05, + "loss": 0.02797747552394867, + "step": 45580 + }, + { + "epoch": 6.47125621007807, + "grad_norm": 0.10799313336610794, + "learning_rate": 
9.353172462739533e-05, + "loss": 0.040154564380645755, + "step": 45590 + }, + { + "epoch": 6.472675656493967, + "grad_norm": 8.12649154663086, + "learning_rate": 9.353030518097943e-05, + "loss": 0.0917394757270813, + "step": 45600 + }, + { + "epoch": 6.474095102909865, + "grad_norm": 0.055603571236133575, + "learning_rate": 9.352888573456352e-05, + "loss": 0.021613481640815734, + "step": 45610 + }, + { + "epoch": 6.475514549325763, + "grad_norm": 3.251713275909424, + "learning_rate": 9.352746628814762e-05, + "loss": 0.06455045938491821, + "step": 45620 + }, + { + "epoch": 6.4769339957416605, + "grad_norm": 11.580523490905762, + "learning_rate": 9.352604684173173e-05, + "loss": 0.06646577119827271, + "step": 45630 + }, + { + "epoch": 6.478353442157559, + "grad_norm": 5.402514457702637, + "learning_rate": 9.352462739531583e-05, + "loss": 0.06263988018035889, + "step": 45640 + }, + { + "epoch": 6.479772888573456, + "grad_norm": 0.2940421998500824, + "learning_rate": 9.352320794889994e-05, + "loss": 0.06655374765396119, + "step": 45650 + }, + { + "epoch": 6.4811923349893545, + "grad_norm": 4.967324256896973, + "learning_rate": 9.352178850248404e-05, + "loss": 0.04391606450080872, + "step": 45660 + }, + { + "epoch": 6.482611781405252, + "grad_norm": 11.9403715133667, + "learning_rate": 9.352036905606813e-05, + "loss": 0.06087355017662048, + "step": 45670 + }, + { + "epoch": 6.484031227821149, + "grad_norm": 6.026821136474609, + "learning_rate": 9.351894960965225e-05, + "loss": 0.02247100919485092, + "step": 45680 + }, + { + "epoch": 6.485450674237048, + "grad_norm": 2.9908483028411865, + "learning_rate": 9.351753016323634e-05, + "loss": 0.0618190348148346, + "step": 45690 + }, + { + "epoch": 6.486870120652945, + "grad_norm": 0.9543291926383972, + "learning_rate": 9.351611071682045e-05, + "loss": 0.0716430127620697, + "step": 45700 + }, + { + "epoch": 6.488289567068843, + "grad_norm": 0.33498528599739075, + "learning_rate": 9.351469127040454e-05, + "loss": 
0.0465453565120697, + "step": 45710 + }, + { + "epoch": 6.489709013484741, + "grad_norm": 6.390286922454834, + "learning_rate": 9.351327182398865e-05, + "loss": 0.07091315388679505, + "step": 45720 + }, + { + "epoch": 6.491128459900639, + "grad_norm": 1.1601217985153198, + "learning_rate": 9.351185237757275e-05, + "loss": 0.024645933508872987, + "step": 45730 + }, + { + "epoch": 6.4925479063165366, + "grad_norm": 11.113912582397461, + "learning_rate": 9.351043293115686e-05, + "loss": 0.11422479152679443, + "step": 45740 + }, + { + "epoch": 6.493967352732434, + "grad_norm": 2.370166540145874, + "learning_rate": 9.350901348474095e-05, + "loss": 0.05117689967155457, + "step": 45750 + }, + { + "epoch": 6.495386799148332, + "grad_norm": 0.16023661196231842, + "learning_rate": 9.350759403832505e-05, + "loss": 0.10015047788619995, + "step": 45760 + }, + { + "epoch": 6.49680624556423, + "grad_norm": 0.25259077548980713, + "learning_rate": 9.350617459190916e-05, + "loss": 0.05752279162406922, + "step": 45770 + }, + { + "epoch": 6.498225691980128, + "grad_norm": 5.102621078491211, + "learning_rate": 9.350475514549326e-05, + "loss": 0.04667982161045074, + "step": 45780 + }, + { + "epoch": 6.499645138396025, + "grad_norm": 8.65860652923584, + "learning_rate": 9.350333569907737e-05, + "loss": 0.07225523591041565, + "step": 45790 + }, + { + "epoch": 6.501064584811924, + "grad_norm": 1.2118887901306152, + "learning_rate": 9.350191625266147e-05, + "loss": 0.09330646991729737, + "step": 45800 + }, + { + "epoch": 6.502484031227821, + "grad_norm": 3.675856590270996, + "learning_rate": 9.350049680624557e-05, + "loss": 0.09584462642669678, + "step": 45810 + }, + { + "epoch": 6.503903477643719, + "grad_norm": 4.621379375457764, + "learning_rate": 9.349907735982966e-05, + "loss": 0.026826804876327513, + "step": 45820 + }, + { + "epoch": 6.505322924059617, + "grad_norm": 6.8167033195495605, + "learning_rate": 9.349765791341377e-05, + "loss": 0.04220779240131378, + "step": 45830 + }, + { + 
"epoch": 6.506742370475514, + "grad_norm": 7.388572692871094, + "learning_rate": 9.349623846699787e-05, + "loss": 0.06925356388092041, + "step": 45840 + }, + { + "epoch": 6.508161816891413, + "grad_norm": 1.5208756923675537, + "learning_rate": 9.349481902058198e-05, + "loss": 0.0498742550611496, + "step": 45850 + }, + { + "epoch": 6.50958126330731, + "grad_norm": 1.2147730588912964, + "learning_rate": 9.349339957416608e-05, + "loss": 0.05823368430137634, + "step": 45860 + }, + { + "epoch": 6.511000709723208, + "grad_norm": 7.438129901885986, + "learning_rate": 9.349198012775018e-05, + "loss": 0.02405182123184204, + "step": 45870 + }, + { + "epoch": 6.512420156139106, + "grad_norm": 3.8440439701080322, + "learning_rate": 9.349056068133429e-05, + "loss": 0.08130161166191101, + "step": 45880 + }, + { + "epoch": 6.513839602555003, + "grad_norm": 5.825738906860352, + "learning_rate": 9.348914123491839e-05, + "loss": 0.05340722799301147, + "step": 45890 + }, + { + "epoch": 6.5152590489709015, + "grad_norm": 0.20723451673984528, + "learning_rate": 9.34877217885025e-05, + "loss": 0.08683145642280579, + "step": 45900 + }, + { + "epoch": 6.516678495386799, + "grad_norm": 0.07636448740959167, + "learning_rate": 9.34863023420866e-05, + "loss": 0.013025203347206115, + "step": 45910 + }, + { + "epoch": 6.518097941802697, + "grad_norm": 6.467600345611572, + "learning_rate": 9.348488289567069e-05, + "loss": 0.07896758913993836, + "step": 45920 + }, + { + "epoch": 6.519517388218595, + "grad_norm": 0.15048600733280182, + "learning_rate": 9.348346344925479e-05, + "loss": 0.05304072499275207, + "step": 45930 + }, + { + "epoch": 6.520936834634493, + "grad_norm": 1.0183674097061157, + "learning_rate": 9.34820440028389e-05, + "loss": 0.02095807492733002, + "step": 45940 + }, + { + "epoch": 6.52235628105039, + "grad_norm": 0.6236469149589539, + "learning_rate": 9.3480624556423e-05, + "loss": 0.026838436722755432, + "step": 45950 + }, + { + "epoch": 6.523775727466289, + "grad_norm": 
3.4434401988983154, + "learning_rate": 9.347920511000711e-05, + "loss": 0.052097213268280027, + "step": 45960 + }, + { + "epoch": 6.525195173882186, + "grad_norm": 2.725377321243286, + "learning_rate": 9.34777856635912e-05, + "loss": 0.03362095654010773, + "step": 45970 + }, + { + "epoch": 6.5266146202980835, + "grad_norm": 0.5739490985870361, + "learning_rate": 9.34763662171753e-05, + "loss": 0.03694147765636444, + "step": 45980 + }, + { + "epoch": 6.528034066713982, + "grad_norm": 2.8594954013824463, + "learning_rate": 9.347494677075941e-05, + "loss": 0.022614985704421997, + "step": 45990 + }, + { + "epoch": 6.529453513129879, + "grad_norm": 1.5943201780319214, + "learning_rate": 9.347352732434351e-05, + "loss": 0.028483986854553223, + "step": 46000 + }, + { + "epoch": 6.529453513129879, + "eval_accuracy": 0.9761556558784257, + "eval_loss": 0.0719020664691925, + "eval_runtime": 32.803, + "eval_samples_per_second": 479.438, + "eval_steps_per_second": 14.999, + "step": 46000 + }, + { + "epoch": 6.5308729595457775, + "grad_norm": 0.6581725478172302, + "learning_rate": 9.347210787792762e-05, + "loss": 0.024489733576774596, + "step": 46010 + }, + { + "epoch": 6.532292405961675, + "grad_norm": 0.812021791934967, + "learning_rate": 9.34706884315117e-05, + "loss": 0.047544506192207334, + "step": 46020 + }, + { + "epoch": 6.533711852377573, + "grad_norm": 0.18070675432682037, + "learning_rate": 9.346926898509582e-05, + "loss": 0.05682721734046936, + "step": 46030 + }, + { + "epoch": 6.535131298793471, + "grad_norm": 1.2470442056655884, + "learning_rate": 9.346784953867991e-05, + "loss": 0.03914521634578705, + "step": 46040 + }, + { + "epoch": 6.536550745209368, + "grad_norm": 0.18507327139377594, + "learning_rate": 9.346643009226402e-05, + "loss": 0.05161336660385132, + "step": 46050 + }, + { + "epoch": 6.537970191625266, + "grad_norm": 2.305793523788452, + "learning_rate": 9.346501064584812e-05, + "loss": 0.04021240770816803, + "step": 46060 + }, + { + "epoch": 
6.539389638041164, + "grad_norm": 0.20650917291641235, + "learning_rate": 9.346359119943222e-05, + "loss": 0.02591737508773804, + "step": 46070 + }, + { + "epoch": 6.540809084457062, + "grad_norm": 1.8644980192184448, + "learning_rate": 9.346217175301633e-05, + "loss": 0.022021229565143585, + "step": 46080 + }, + { + "epoch": 6.5422285308729595, + "grad_norm": 2.503336191177368, + "learning_rate": 9.346075230660043e-05, + "loss": 0.040047654509544374, + "step": 46090 + }, + { + "epoch": 6.543647977288858, + "grad_norm": 0.6199862957000732, + "learning_rate": 9.345933286018454e-05, + "loss": 0.01799771934747696, + "step": 46100 + }, + { + "epoch": 6.545067423704755, + "grad_norm": 0.29369279742240906, + "learning_rate": 9.345791341376864e-05, + "loss": 0.020970551669597624, + "step": 46110 + }, + { + "epoch": 6.546486870120653, + "grad_norm": 0.7996075749397278, + "learning_rate": 9.345649396735273e-05, + "loss": 0.043533599376678465, + "step": 46120 + }, + { + "epoch": 6.547906316536551, + "grad_norm": 0.3149011731147766, + "learning_rate": 9.345507452093683e-05, + "loss": 0.027598875761032104, + "step": 46130 + }, + { + "epoch": 6.549325762952448, + "grad_norm": 0.8261258602142334, + "learning_rate": 9.345365507452094e-05, + "loss": 0.04715876281261444, + "step": 46140 + }, + { + "epoch": 6.550745209368347, + "grad_norm": 4.057319164276123, + "learning_rate": 9.345223562810504e-05, + "loss": 0.03853771984577179, + "step": 46150 + }, + { + "epoch": 6.552164655784244, + "grad_norm": 8.443818092346191, + "learning_rate": 9.345081618168915e-05, + "loss": 0.07661219835281372, + "step": 46160 + }, + { + "epoch": 6.553584102200142, + "grad_norm": 7.326854705810547, + "learning_rate": 9.344939673527325e-05, + "loss": 0.08085020184516907, + "step": 46170 + }, + { + "epoch": 6.55500354861604, + "grad_norm": 0.49740973114967346, + "learning_rate": 9.344797728885734e-05, + "loss": 0.042351937294006346, + "step": 46180 + }, + { + "epoch": 6.556422995031937, + "grad_norm": 
2.507627487182617, + "learning_rate": 9.344655784244146e-05, + "loss": 0.08018359541893005, + "step": 46190 + }, + { + "epoch": 6.557842441447836, + "grad_norm": 3.4351646900177, + "learning_rate": 9.344513839602555e-05, + "loss": 0.09477327466011047, + "step": 46200 + }, + { + "epoch": 6.559261887863733, + "grad_norm": 1.8519635200500488, + "learning_rate": 9.344371894960966e-05, + "loss": 0.04426932036876678, + "step": 46210 + }, + { + "epoch": 6.560681334279631, + "grad_norm": 1.091602087020874, + "learning_rate": 9.344229950319376e-05, + "loss": 0.02118045687675476, + "step": 46220 + }, + { + "epoch": 6.562100780695529, + "grad_norm": 4.351092338562012, + "learning_rate": 9.344088005677786e-05, + "loss": 0.039005580544471743, + "step": 46230 + }, + { + "epoch": 6.563520227111427, + "grad_norm": 0.24212029576301575, + "learning_rate": 9.343946061036196e-05, + "loss": 0.05372491478919983, + "step": 46240 + }, + { + "epoch": 6.564939673527324, + "grad_norm": 3.965920925140381, + "learning_rate": 9.343804116394607e-05, + "loss": 0.04061869978904724, + "step": 46250 + }, + { + "epoch": 6.566359119943222, + "grad_norm": 0.806727945804596, + "learning_rate": 9.343662171753016e-05, + "loss": 0.03351776301860809, + "step": 46260 + }, + { + "epoch": 6.56777856635912, + "grad_norm": 0.08710307627916336, + "learning_rate": 9.343520227111428e-05, + "loss": 0.03745451867580414, + "step": 46270 + }, + { + "epoch": 6.569198012775018, + "grad_norm": 2.581657648086548, + "learning_rate": 9.343378282469837e-05, + "loss": 0.03488259315490723, + "step": 46280 + }, + { + "epoch": 6.570617459190916, + "grad_norm": 5.0455427169799805, + "learning_rate": 9.343236337828247e-05, + "loss": 0.04453178346157074, + "step": 46290 + }, + { + "epoch": 6.572036905606813, + "grad_norm": 3.029026985168457, + "learning_rate": 9.343094393186658e-05, + "loss": 0.03993709683418274, + "step": 46300 + }, + { + "epoch": 6.573456352022712, + "grad_norm": 3.476870536804199, + "learning_rate": 
9.342952448545068e-05, + "loss": 0.0863077163696289, + "step": 46310 + }, + { + "epoch": 6.574875798438609, + "grad_norm": 13.0468111038208, + "learning_rate": 9.342810503903479e-05, + "loss": 0.0779941201210022, + "step": 46320 + }, + { + "epoch": 6.5762952448545064, + "grad_norm": 0.7732180953025818, + "learning_rate": 9.342668559261887e-05, + "loss": 0.04824472963809967, + "step": 46330 + }, + { + "epoch": 6.577714691270405, + "grad_norm": 2.9980263710021973, + "learning_rate": 9.342526614620298e-05, + "loss": 0.051023286581039426, + "step": 46340 + }, + { + "epoch": 6.579134137686302, + "grad_norm": 3.4411685466766357, + "learning_rate": 9.342384669978708e-05, + "loss": 0.029833096265792846, + "step": 46350 + }, + { + "epoch": 6.5805535841022005, + "grad_norm": 0.820574939250946, + "learning_rate": 9.342242725337119e-05, + "loss": 0.05238626003265381, + "step": 46360 + }, + { + "epoch": 6.581973030518098, + "grad_norm": 0.5479278564453125, + "learning_rate": 9.342100780695529e-05, + "loss": 0.03393109440803528, + "step": 46370 + }, + { + "epoch": 6.583392476933996, + "grad_norm": 1.2850233316421509, + "learning_rate": 9.341958836053939e-05, + "loss": 0.029635608196258545, + "step": 46380 + }, + { + "epoch": 6.584811923349894, + "grad_norm": 0.3429562747478485, + "learning_rate": 9.34181689141235e-05, + "loss": 0.07307572960853577, + "step": 46390 + }, + { + "epoch": 6.586231369765791, + "grad_norm": 0.2704232633113861, + "learning_rate": 9.34167494677076e-05, + "loss": 0.03701426386833191, + "step": 46400 + }, + { + "epoch": 6.587650816181689, + "grad_norm": 1.3456661701202393, + "learning_rate": 9.34153300212917e-05, + "loss": 0.040149462223052976, + "step": 46410 + }, + { + "epoch": 6.589070262597587, + "grad_norm": 2.917856454849243, + "learning_rate": 9.34139105748758e-05, + "loss": 0.038611260056495664, + "step": 46420 + }, + { + "epoch": 6.590489709013485, + "grad_norm": 4.476409912109375, + "learning_rate": 9.34124911284599e-05, + "loss": 
0.035803604125976565, + "step": 46430 + }, + { + "epoch": 6.5919091554293825, + "grad_norm": 0.41631996631622314, + "learning_rate": 9.3411071682044e-05, + "loss": 0.023559017479419707, + "step": 46440 + }, + { + "epoch": 6.593328601845281, + "grad_norm": 6.439150810241699, + "learning_rate": 9.340965223562811e-05, + "loss": 0.046225354075431824, + "step": 46450 + }, + { + "epoch": 6.594748048261178, + "grad_norm": 7.897381782531738, + "learning_rate": 9.340823278921221e-05, + "loss": 0.14436500072479247, + "step": 46460 + }, + { + "epoch": 6.596167494677076, + "grad_norm": 3.0115458965301514, + "learning_rate": 9.340681334279632e-05, + "loss": 0.02835048735141754, + "step": 46470 + }, + { + "epoch": 6.597586941092974, + "grad_norm": 1.3637899160385132, + "learning_rate": 9.340539389638042e-05, + "loss": 0.06278788447380065, + "step": 46480 + }, + { + "epoch": 6.599006387508871, + "grad_norm": 0.22502969205379486, + "learning_rate": 9.340397444996451e-05, + "loss": 0.02985972762107849, + "step": 46490 + }, + { + "epoch": 6.60042583392477, + "grad_norm": 4.472122669219971, + "learning_rate": 9.340255500354862e-05, + "loss": 0.07923436164855957, + "step": 46500 + }, + { + "epoch": 6.60042583392477, + "eval_accuracy": 0.9771730145609462, + "eval_loss": 0.07765964418649673, + "eval_runtime": 34.8533, + "eval_samples_per_second": 451.234, + "eval_steps_per_second": 14.116, + "step": 46500 + }, + { + "epoch": 6.601845280340667, + "grad_norm": 1.4962148666381836, + "learning_rate": 9.340113555713272e-05, + "loss": 0.05791752338409424, + "step": 46510 + }, + { + "epoch": 6.603264726756565, + "grad_norm": 9.11119556427002, + "learning_rate": 9.339971611071683e-05, + "loss": 0.09500344395637512, + "step": 46520 + }, + { + "epoch": 6.604684173172463, + "grad_norm": 0.042891647666692734, + "learning_rate": 9.339829666430092e-05, + "loss": 0.07049063444137574, + "step": 46530 + }, + { + "epoch": 6.60610361958836, + "grad_norm": 0.38025587797164917, + "learning_rate": 
9.339687721788503e-05, + "loss": 0.022993910312652587, + "step": 46540 + }, + { + "epoch": 6.6075230660042585, + "grad_norm": 1.9691075086593628, + "learning_rate": 9.339545777146912e-05, + "loss": 0.028483304381370544, + "step": 46550 + }, + { + "epoch": 6.608942512420156, + "grad_norm": 4.02490234375, + "learning_rate": 9.339403832505323e-05, + "loss": 0.031239235401153566, + "step": 46560 + }, + { + "epoch": 6.610361958836054, + "grad_norm": 4.263495922088623, + "learning_rate": 9.339261887863733e-05, + "loss": 0.08727318644523621, + "step": 46570 + }, + { + "epoch": 6.611781405251952, + "grad_norm": 0.8540966510772705, + "learning_rate": 9.339119943222144e-05, + "loss": 0.040844264626502993, + "step": 46580 + }, + { + "epoch": 6.61320085166785, + "grad_norm": 7.359906196594238, + "learning_rate": 9.338977998580554e-05, + "loss": 0.09770232439041138, + "step": 46590 + }, + { + "epoch": 6.614620298083747, + "grad_norm": 0.7775536775588989, + "learning_rate": 9.338836053938964e-05, + "loss": 0.04570707976818085, + "step": 46600 + }, + { + "epoch": 6.616039744499645, + "grad_norm": 0.3048425614833832, + "learning_rate": 9.338694109297375e-05, + "loss": 0.013830628991127015, + "step": 46610 + }, + { + "epoch": 6.617459190915543, + "grad_norm": 0.7367076873779297, + "learning_rate": 9.338552164655785e-05, + "loss": 0.05104082226753235, + "step": 46620 + }, + { + "epoch": 6.6188786373314406, + "grad_norm": 0.09017914533615112, + "learning_rate": 9.338410220014196e-05, + "loss": 0.0412735253572464, + "step": 46630 + }, + { + "epoch": 6.620298083747339, + "grad_norm": 5.469369411468506, + "learning_rate": 9.338268275372604e-05, + "loss": 0.015447242558002472, + "step": 46640 + }, + { + "epoch": 6.621717530163236, + "grad_norm": 2.02463436126709, + "learning_rate": 9.338126330731015e-05, + "loss": 0.04360540807247162, + "step": 46650 + }, + { + "epoch": 6.623136976579135, + "grad_norm": 2.3378515243530273, + "learning_rate": 9.337984386089425e-05, + "loss": 
0.044150394201278684, + "step": 46660 + }, + { + "epoch": 6.624556422995032, + "grad_norm": 2.2478842735290527, + "learning_rate": 9.337842441447836e-05, + "loss": 0.04635085165500641, + "step": 46670 + }, + { + "epoch": 6.625975869410929, + "grad_norm": 2.4840340614318848, + "learning_rate": 9.337700496806247e-05, + "loss": 0.04768993556499481, + "step": 46680 + }, + { + "epoch": 6.627395315826828, + "grad_norm": 2.0236635208129883, + "learning_rate": 9.337558552164656e-05, + "loss": 0.031044638156890868, + "step": 46690 + }, + { + "epoch": 6.628814762242725, + "grad_norm": 4.0689897537231445, + "learning_rate": 9.337416607523067e-05, + "loss": 0.03407057225704193, + "step": 46700 + }, + { + "epoch": 6.6302342086586235, + "grad_norm": 4.269835948944092, + "learning_rate": 9.337274662881476e-05, + "loss": 0.025110429525375365, + "step": 46710 + }, + { + "epoch": 6.631653655074521, + "grad_norm": 0.36590269207954407, + "learning_rate": 9.337132718239887e-05, + "loss": 0.0714455485343933, + "step": 46720 + }, + { + "epoch": 6.633073101490419, + "grad_norm": 0.7490295767784119, + "learning_rate": 9.336990773598297e-05, + "loss": 0.03289024829864502, + "step": 46730 + }, + { + "epoch": 6.634492547906317, + "grad_norm": 7.103208065032959, + "learning_rate": 9.336848828956707e-05, + "loss": 0.05486240386962891, + "step": 46740 + }, + { + "epoch": 6.635911994322214, + "grad_norm": 0.6095759272575378, + "learning_rate": 9.336706884315117e-05, + "loss": 0.02849574089050293, + "step": 46750 + }, + { + "epoch": 6.637331440738112, + "grad_norm": 8.099491119384766, + "learning_rate": 9.336564939673528e-05, + "loss": 0.06593834161758423, + "step": 46760 + }, + { + "epoch": 6.63875088715401, + "grad_norm": 5.979064464569092, + "learning_rate": 9.336422995031939e-05, + "loss": 0.058841896057128903, + "step": 46770 + }, + { + "epoch": 6.640170333569908, + "grad_norm": 0.5812722444534302, + "learning_rate": 9.336281050390349e-05, + "loss": 0.07408539652824402, + "step": 46780 + }, + 
{ + "epoch": 6.6415897799858055, + "grad_norm": 8.559142112731934, + "learning_rate": 9.336139105748758e-05, + "loss": 0.05026623606681824, + "step": 46790 + }, + { + "epoch": 6.643009226401704, + "grad_norm": 1.433610200881958, + "learning_rate": 9.335997161107168e-05, + "loss": 0.024886251986026765, + "step": 46800 + }, + { + "epoch": 6.644428672817601, + "grad_norm": 5.033099174499512, + "learning_rate": 9.335855216465579e-05, + "loss": 0.04658404290676117, + "step": 46810 + }, + { + "epoch": 6.645848119233499, + "grad_norm": 1.879032850265503, + "learning_rate": 9.335713271823989e-05, + "loss": 0.031316140294075014, + "step": 46820 + }, + { + "epoch": 6.647267565649397, + "grad_norm": 1.1462465524673462, + "learning_rate": 9.3355713271824e-05, + "loss": 0.05361767411231995, + "step": 46830 + }, + { + "epoch": 6.648687012065294, + "grad_norm": 0.19135117530822754, + "learning_rate": 9.335429382540808e-05, + "loss": 0.04055591523647308, + "step": 46840 + }, + { + "epoch": 6.650106458481193, + "grad_norm": 0.23918968439102173, + "learning_rate": 9.33528743789922e-05, + "loss": 0.03483697474002838, + "step": 46850 + }, + { + "epoch": 6.65152590489709, + "grad_norm": 1.0027947425842285, + "learning_rate": 9.33514549325763e-05, + "loss": 0.03233981728553772, + "step": 46860 + }, + { + "epoch": 6.652945351312988, + "grad_norm": 4.6202473640441895, + "learning_rate": 9.33500354861604e-05, + "loss": 0.057700860500335696, + "step": 46870 + }, + { + "epoch": 6.654364797728886, + "grad_norm": 0.49680638313293457, + "learning_rate": 9.334861603974451e-05, + "loss": 0.013076686859130859, + "step": 46880 + }, + { + "epoch": 6.655784244144783, + "grad_norm": 12.804496765136719, + "learning_rate": 9.33471965933286e-05, + "loss": 0.07863889336585998, + "step": 46890 + }, + { + "epoch": 6.6572036905606815, + "grad_norm": 0.31738927960395813, + "learning_rate": 9.334577714691271e-05, + "loss": 0.03100692629814148, + "step": 46900 + }, + { + "epoch": 6.658623136976579, + 
"grad_norm": 5.837845802307129, + "learning_rate": 9.33443577004968e-05, + "loss": 0.04397173523902893, + "step": 46910 + }, + { + "epoch": 6.660042583392477, + "grad_norm": 3.81868314743042, + "learning_rate": 9.334293825408092e-05, + "loss": 0.033652427792549136, + "step": 46920 + }, + { + "epoch": 6.661462029808375, + "grad_norm": 0.06779835373163223, + "learning_rate": 9.334151880766501e-05, + "loss": 0.022059588134288786, + "step": 46930 + }, + { + "epoch": 6.662881476224273, + "grad_norm": 1.363473653793335, + "learning_rate": 9.334009936124912e-05, + "loss": 0.025214645266532897, + "step": 46940 + }, + { + "epoch": 6.66430092264017, + "grad_norm": 1.0777561664581299, + "learning_rate": 9.333867991483322e-05, + "loss": 0.02584805488586426, + "step": 46950 + }, + { + "epoch": 6.665720369056068, + "grad_norm": 5.158640384674072, + "learning_rate": 9.333726046841732e-05, + "loss": 0.07380213737487792, + "step": 46960 + }, + { + "epoch": 6.667139815471966, + "grad_norm": 0.5082380771636963, + "learning_rate": 9.333584102200143e-05, + "loss": 0.015044075250625611, + "step": 46970 + }, + { + "epoch": 6.6685592618878635, + "grad_norm": 3.573641300201416, + "learning_rate": 9.333442157558553e-05, + "loss": 0.029354152083396912, + "step": 46980 + }, + { + "epoch": 6.669978708303762, + "grad_norm": 2.1343441009521484, + "learning_rate": 9.333300212916964e-05, + "loss": 0.02817715108394623, + "step": 46990 + }, + { + "epoch": 6.671398154719659, + "grad_norm": 1.2065638303756714, + "learning_rate": 9.333158268275372e-05, + "loss": 0.02622884213924408, + "step": 47000 + }, + { + "epoch": 6.671398154719659, + "eval_accuracy": 0.9733579195014942, + "eval_loss": 0.08096129447221756, + "eval_runtime": 32.7148, + "eval_samples_per_second": 480.73, + "eval_steps_per_second": 15.039, + "step": 47000 + }, + { + "epoch": 6.6728176011355576, + "grad_norm": 0.8675025105476379, + "learning_rate": 9.333016323633783e-05, + "loss": 0.0328928142786026, + "step": 47010 + }, + { + "epoch": 
6.674237047551455, + "grad_norm": 0.07285960763692856, + "learning_rate": 9.332874378992193e-05, + "loss": 0.09256799221038818, + "step": 47020 + }, + { + "epoch": 6.675656493967352, + "grad_norm": 4.019161224365234, + "learning_rate": 9.332732434350604e-05, + "loss": 0.07572144269943237, + "step": 47030 + }, + { + "epoch": 6.677075940383251, + "grad_norm": 7.3675150871276855, + "learning_rate": 9.332590489709014e-05, + "loss": 0.0726061463356018, + "step": 47040 + }, + { + "epoch": 6.678495386799148, + "grad_norm": 12.73004150390625, + "learning_rate": 9.332448545067424e-05, + "loss": 0.07878984212875366, + "step": 47050 + }, + { + "epoch": 6.679914833215046, + "grad_norm": 8.069070816040039, + "learning_rate": 9.332306600425835e-05, + "loss": 0.07341142892837524, + "step": 47060 + }, + { + "epoch": 6.681334279630944, + "grad_norm": 4.058375835418701, + "learning_rate": 9.332164655784245e-05, + "loss": 0.04533909559249878, + "step": 47070 + }, + { + "epoch": 6.682753726046842, + "grad_norm": 0.957751452922821, + "learning_rate": 9.332022711142656e-05, + "loss": 0.02306291162967682, + "step": 47080 + }, + { + "epoch": 6.68417317246274, + "grad_norm": 1.2759968042373657, + "learning_rate": 9.331880766501065e-05, + "loss": 0.03262047171592712, + "step": 47090 + }, + { + "epoch": 6.685592618878637, + "grad_norm": 3.3727896213531494, + "learning_rate": 9.331738821859475e-05, + "loss": 0.11750341653823852, + "step": 47100 + }, + { + "epoch": 6.687012065294535, + "grad_norm": 5.064836025238037, + "learning_rate": 9.331596877217885e-05, + "loss": 0.07212659120559692, + "step": 47110 + }, + { + "epoch": 6.688431511710433, + "grad_norm": 6.41402530670166, + "learning_rate": 9.331454932576296e-05, + "loss": 0.06090214252471924, + "step": 47120 + }, + { + "epoch": 6.689850958126331, + "grad_norm": 0.642575204372406, + "learning_rate": 9.331312987934706e-05, + "loss": 0.049497807025909425, + "step": 47130 + }, + { + "epoch": 6.691270404542228, + "grad_norm": 5.882664203643799, 
+ "learning_rate": 9.331171043293117e-05, + "loss": 0.04391663372516632, + "step": 47140 + }, + { + "epoch": 6.692689850958127, + "grad_norm": 0.44636282324790955, + "learning_rate": 9.331029098651526e-05, + "loss": 0.02593545913696289, + "step": 47150 + }, + { + "epoch": 6.694109297374024, + "grad_norm": 4.4043989181518555, + "learning_rate": 9.330887154009936e-05, + "loss": 0.03738081157207489, + "step": 47160 + }, + { + "epoch": 6.695528743789922, + "grad_norm": 0.9728304743766785, + "learning_rate": 9.330745209368347e-05, + "loss": 0.044902724027633664, + "step": 47170 + }, + { + "epoch": 6.69694819020582, + "grad_norm": 1.4055997133255005, + "learning_rate": 9.330603264726757e-05, + "loss": 0.05182392597198486, + "step": 47180 + }, + { + "epoch": 6.698367636621717, + "grad_norm": 0.5322553515434265, + "learning_rate": 9.330461320085168e-05, + "loss": 0.03811835050582886, + "step": 47190 + }, + { + "epoch": 6.699787083037616, + "grad_norm": 5.997743606567383, + "learning_rate": 9.330319375443577e-05, + "loss": 0.08122850656509399, + "step": 47200 + }, + { + "epoch": 6.701206529453513, + "grad_norm": 3.8824660778045654, + "learning_rate": 9.330177430801988e-05, + "loss": 0.10573461055755615, + "step": 47210 + }, + { + "epoch": 6.702625975869411, + "grad_norm": 2.0631566047668457, + "learning_rate": 9.330035486160397e-05, + "loss": 0.07894684672355652, + "step": 47220 + }, + { + "epoch": 6.704045422285309, + "grad_norm": 0.43265125155448914, + "learning_rate": 9.329893541518808e-05, + "loss": 0.00781625285744667, + "step": 47230 + }, + { + "epoch": 6.705464868701206, + "grad_norm": 5.595661640167236, + "learning_rate": 9.329751596877218e-05, + "loss": 0.02967623770236969, + "step": 47240 + }, + { + "epoch": 6.7068843151171045, + "grad_norm": 4.208871841430664, + "learning_rate": 9.329609652235629e-05, + "loss": 0.03154032528400421, + "step": 47250 + }, + { + "epoch": 6.708303761533002, + "grad_norm": 2.042327642440796, + "learning_rate": 9.329467707594039e-05, + 
"loss": 0.035281413793563844, + "step": 47260 + }, + { + "epoch": 6.7097232079489, + "grad_norm": 2.4885547161102295, + "learning_rate": 9.329325762952449e-05, + "loss": 0.06740244030952454, + "step": 47270 + }, + { + "epoch": 6.711142654364798, + "grad_norm": 3.6246352195739746, + "learning_rate": 9.32918381831086e-05, + "loss": 0.03529942333698273, + "step": 47280 + }, + { + "epoch": 6.712562100780696, + "grad_norm": 7.900681018829346, + "learning_rate": 9.32904187366927e-05, + "loss": 0.04457513988018036, + "step": 47290 + }, + { + "epoch": 6.713981547196593, + "grad_norm": 6.030803680419922, + "learning_rate": 9.32889992902768e-05, + "loss": 0.07301679849624634, + "step": 47300 + }, + { + "epoch": 6.715400993612491, + "grad_norm": 1.9671475887298584, + "learning_rate": 9.328757984386089e-05, + "loss": 0.03088918924331665, + "step": 47310 + }, + { + "epoch": 6.716820440028389, + "grad_norm": 0.9832845330238342, + "learning_rate": 9.3286160397445e-05, + "loss": 0.03263017535209656, + "step": 47320 + }, + { + "epoch": 6.7182398864442865, + "grad_norm": 1.1375794410705566, + "learning_rate": 9.32847409510291e-05, + "loss": 0.04314920902252197, + "step": 47330 + }, + { + "epoch": 6.719659332860185, + "grad_norm": 1.5910285711288452, + "learning_rate": 9.328332150461321e-05, + "loss": 0.03253903090953827, + "step": 47340 + }, + { + "epoch": 6.721078779276082, + "grad_norm": 9.413193702697754, + "learning_rate": 9.328190205819731e-05, + "loss": 0.03828516602516174, + "step": 47350 + }, + { + "epoch": 6.7224982256919805, + "grad_norm": 10.063383102416992, + "learning_rate": 9.32804826117814e-05, + "loss": 0.06823945045471191, + "step": 47360 + }, + { + "epoch": 6.723917672107878, + "grad_norm": 10.355716705322266, + "learning_rate": 9.327906316536552e-05, + "loss": 0.06583051681518555, + "step": 47370 + }, + { + "epoch": 6.725337118523775, + "grad_norm": 1.6706312894821167, + "learning_rate": 9.327764371894961e-05, + "loss": 0.050222575664520264, + "step": 47380 + }, + 
{ + "epoch": 6.726756564939674, + "grad_norm": 12.793781280517578, + "learning_rate": 9.327622427253372e-05, + "loss": 0.0696624755859375, + "step": 47390 + }, + { + "epoch": 6.728176011355571, + "grad_norm": 14.380304336547852, + "learning_rate": 9.327480482611782e-05, + "loss": 0.07547839879989623, + "step": 47400 + }, + { + "epoch": 6.729595457771469, + "grad_norm": 0.04411943256855011, + "learning_rate": 9.327338537970192e-05, + "loss": 0.05585261583328247, + "step": 47410 + }, + { + "epoch": 6.731014904187367, + "grad_norm": 5.027952194213867, + "learning_rate": 9.327196593328602e-05, + "loss": 0.0800262987613678, + "step": 47420 + }, + { + "epoch": 6.732434350603265, + "grad_norm": 3.2839534282684326, + "learning_rate": 9.327054648687013e-05, + "loss": 0.09744354486465454, + "step": 47430 + }, + { + "epoch": 6.7338537970191625, + "grad_norm": 1.8493952751159668, + "learning_rate": 9.326912704045422e-05, + "loss": 0.041766023635864256, + "step": 47440 + }, + { + "epoch": 6.73527324343506, + "grad_norm": 3.7212188243865967, + "learning_rate": 9.326770759403834e-05, + "loss": 0.03478267788887024, + "step": 47450 + }, + { + "epoch": 6.736692689850958, + "grad_norm": 2.638964891433716, + "learning_rate": 9.326628814762243e-05, + "loss": 0.03159240484237671, + "step": 47460 + }, + { + "epoch": 6.738112136266856, + "grad_norm": 6.593120574951172, + "learning_rate": 9.326486870120653e-05, + "loss": 0.07078714966773987, + "step": 47470 + }, + { + "epoch": 6.739531582682754, + "grad_norm": 4.137892246246338, + "learning_rate": 9.326344925479064e-05, + "loss": 0.044456595182418825, + "step": 47480 + }, + { + "epoch": 6.740951029098651, + "grad_norm": 1.0298532247543335, + "learning_rate": 9.326202980837474e-05, + "loss": 0.05453903079032898, + "step": 47490 + }, + { + "epoch": 6.74237047551455, + "grad_norm": 5.411275386810303, + "learning_rate": 9.326061036195885e-05, + "loss": 0.03052000105381012, + "step": 47500 + }, + { + "epoch": 6.74237047551455, + 
"eval_accuracy": 0.9678896165829465, + "eval_loss": 0.10287806391716003, + "eval_runtime": 33.2324, + "eval_samples_per_second": 473.244, + "eval_steps_per_second": 14.805, + "step": 47500 + }, + { + "epoch": 6.743789921930447, + "grad_norm": 0.06812303513288498, + "learning_rate": 9.325919091554293e-05, + "loss": 0.05741068124771118, + "step": 47510 + }, + { + "epoch": 6.7452093683463445, + "grad_norm": 7.734038829803467, + "learning_rate": 9.325777146912704e-05, + "loss": 0.07300693392753602, + "step": 47520 + }, + { + "epoch": 6.746628814762243, + "grad_norm": 4.815206050872803, + "learning_rate": 9.325635202271114e-05, + "loss": 0.02375160902738571, + "step": 47530 + }, + { + "epoch": 6.74804826117814, + "grad_norm": 0.18969817459583282, + "learning_rate": 9.325493257629525e-05, + "loss": 0.05244582295417786, + "step": 47540 + }, + { + "epoch": 6.749467707594039, + "grad_norm": 3.6226210594177246, + "learning_rate": 9.325351312987935e-05, + "loss": 0.0583953857421875, + "step": 47550 + }, + { + "epoch": 6.750887154009936, + "grad_norm": 9.285144805908203, + "learning_rate": 9.325209368346345e-05, + "loss": 0.07046167850494385, + "step": 47560 + }, + { + "epoch": 6.752306600425834, + "grad_norm": 6.898341655731201, + "learning_rate": 9.325067423704756e-05, + "loss": 0.024330171942710876, + "step": 47570 + }, + { + "epoch": 6.753726046841732, + "grad_norm": 1.6316328048706055, + "learning_rate": 9.324925479063166e-05, + "loss": 0.040696841478347776, + "step": 47580 + }, + { + "epoch": 6.755145493257629, + "grad_norm": 0.510357677936554, + "learning_rate": 9.324783534421577e-05, + "loss": 0.015253564715385437, + "step": 47590 + }, + { + "epoch": 6.7565649396735274, + "grad_norm": 4.652739524841309, + "learning_rate": 9.324641589779986e-05, + "loss": 0.03557385802268982, + "step": 47600 + }, + { + "epoch": 6.757984386089425, + "grad_norm": 0.3203953802585602, + "learning_rate": 9.324499645138397e-05, + "loss": 0.010098765790462493, + "step": 47610 + }, + { + 
"epoch": 6.759403832505323, + "grad_norm": 1.6275372505187988, + "learning_rate": 9.324357700496806e-05, + "loss": 0.03298323750495911, + "step": 47620 + }, + { + "epoch": 6.760823278921221, + "grad_norm": 11.40868854522705, + "learning_rate": 9.324215755855217e-05, + "loss": 0.07673094868659973, + "step": 47630 + }, + { + "epoch": 6.762242725337119, + "grad_norm": 7.8622212409973145, + "learning_rate": 9.324073811213627e-05, + "loss": 0.05273681879043579, + "step": 47640 + }, + { + "epoch": 6.763662171753016, + "grad_norm": 5.881760120391846, + "learning_rate": 9.323931866572038e-05, + "loss": 0.0287151038646698, + "step": 47650 + }, + { + "epoch": 6.765081618168914, + "grad_norm": 8.757052421569824, + "learning_rate": 9.323789921930447e-05, + "loss": 0.10142724514007569, + "step": 47660 + }, + { + "epoch": 6.766501064584812, + "grad_norm": 7.562860012054443, + "learning_rate": 9.323647977288857e-05, + "loss": 0.061179614067077635, + "step": 47670 + }, + { + "epoch": 6.7679205110007095, + "grad_norm": 0.293720006942749, + "learning_rate": 9.323506032647268e-05, + "loss": 0.0769286334514618, + "step": 47680 + }, + { + "epoch": 6.769339957416608, + "grad_norm": 5.228733062744141, + "learning_rate": 9.323364088005678e-05, + "loss": 0.03475149571895599, + "step": 47690 + }, + { + "epoch": 6.770759403832505, + "grad_norm": 0.8717983961105347, + "learning_rate": 9.323222143364089e-05, + "loss": 0.03800695240497589, + "step": 47700 + }, + { + "epoch": 6.7721788502484035, + "grad_norm": 16.308420181274414, + "learning_rate": 9.323080198722499e-05, + "loss": 0.04515612721443176, + "step": 47710 + }, + { + "epoch": 6.773598296664301, + "grad_norm": 5.392683029174805, + "learning_rate": 9.322938254080909e-05, + "loss": 0.030790060758590698, + "step": 47720 + }, + { + "epoch": 6.775017743080198, + "grad_norm": 4.039763927459717, + "learning_rate": 9.322796309439318e-05, + "loss": 0.03297184109687805, + "step": 47730 + }, + { + "epoch": 6.776437189496097, + "grad_norm": 
5.786772727966309, + "learning_rate": 9.32265436479773e-05, + "loss": 0.05468297004699707, + "step": 47740 + }, + { + "epoch": 6.777856635911994, + "grad_norm": 16.83050537109375, + "learning_rate": 9.322512420156139e-05, + "loss": 0.04757193326950073, + "step": 47750 + }, + { + "epoch": 6.779276082327892, + "grad_norm": 4.77114200592041, + "learning_rate": 9.32237047551455e-05, + "loss": 0.03624268174171448, + "step": 47760 + }, + { + "epoch": 6.78069552874379, + "grad_norm": 10.355727195739746, + "learning_rate": 9.32222853087296e-05, + "loss": 0.039849352836608884, + "step": 47770 + }, + { + "epoch": 6.782114975159688, + "grad_norm": 5.408930778503418, + "learning_rate": 9.32208658623137e-05, + "loss": 0.05064322948455811, + "step": 47780 + }, + { + "epoch": 6.7835344215755855, + "grad_norm": 1.2858765125274658, + "learning_rate": 9.321944641589781e-05, + "loss": 0.04928310811519623, + "step": 47790 + }, + { + "epoch": 6.784953867991483, + "grad_norm": 0.4276614487171173, + "learning_rate": 9.32180269694819e-05, + "loss": 0.051668965816497804, + "step": 47800 + }, + { + "epoch": 6.786373314407381, + "grad_norm": 0.45648816227912903, + "learning_rate": 9.321660752306602e-05, + "loss": 0.03476710915565491, + "step": 47810 + }, + { + "epoch": 6.787792760823279, + "grad_norm": 23.13014793395996, + "learning_rate": 9.32151880766501e-05, + "loss": 0.043744403123855594, + "step": 47820 + }, + { + "epoch": 6.789212207239177, + "grad_norm": 5.712381362915039, + "learning_rate": 9.321376863023421e-05, + "loss": 0.112934410572052, + "step": 47830 + }, + { + "epoch": 6.790631653655074, + "grad_norm": 0.8790338039398193, + "learning_rate": 9.321234918381831e-05, + "loss": 0.054661625623703004, + "step": 47840 + }, + { + "epoch": 6.792051100070973, + "grad_norm": 8.923171997070312, + "learning_rate": 9.321092973740242e-05, + "loss": 0.043182292580604555, + "step": 47850 + }, + { + "epoch": 6.79347054648687, + "grad_norm": 4.031979560852051, + "learning_rate": 
9.320951029098652e-05, + "loss": 0.06587361693382263, + "step": 47860 + }, + { + "epoch": 6.7948899929027675, + "grad_norm": 1.1798633337020874, + "learning_rate": 9.320809084457061e-05, + "loss": 0.06723747253417969, + "step": 47870 + }, + { + "epoch": 6.796309439318666, + "grad_norm": 0.15142561495304108, + "learning_rate": 9.320681334279631e-05, + "loss": 0.0663286030292511, + "step": 47880 + }, + { + "epoch": 6.797728885734563, + "grad_norm": 0.30359768867492676, + "learning_rate": 9.320539389638041e-05, + "loss": 0.028592944145202637, + "step": 47890 + }, + { + "epoch": 6.7991483321504615, + "grad_norm": 0.025870347395539284, + "learning_rate": 9.320397444996452e-05, + "loss": 0.03683372735977173, + "step": 47900 + }, + { + "epoch": 6.800567778566359, + "grad_norm": 1.0389518737792969, + "learning_rate": 9.320255500354862e-05, + "loss": 0.0489422082901001, + "step": 47910 + }, + { + "epoch": 6.801987224982257, + "grad_norm": 2.1409695148468018, + "learning_rate": 9.320113555713273e-05, + "loss": 0.07247651815414428, + "step": 47920 + }, + { + "epoch": 6.803406671398155, + "grad_norm": 5.814974784851074, + "learning_rate": 9.319971611071683e-05, + "loss": 0.06912614107131958, + "step": 47930 + }, + { + "epoch": 6.804826117814052, + "grad_norm": 0.1616591513156891, + "learning_rate": 9.319829666430094e-05, + "loss": 0.05542629361152649, + "step": 47940 + }, + { + "epoch": 6.80624556422995, + "grad_norm": 5.357841968536377, + "learning_rate": 9.319687721788502e-05, + "loss": 0.04025912284851074, + "step": 47950 + }, + { + "epoch": 6.807665010645848, + "grad_norm": 1.2836575508117676, + "learning_rate": 9.319545777146913e-05, + "loss": 0.0756769597530365, + "step": 47960 + }, + { + "epoch": 6.809084457061746, + "grad_norm": 0.03740439563989639, + "learning_rate": 9.319403832505323e-05, + "loss": 0.0715951144695282, + "step": 47970 + }, + { + "epoch": 6.810503903477644, + "grad_norm": 0.518546462059021, + "learning_rate": 9.319261887863734e-05, + "loss": 
0.04978099465370178, + "step": 47980 + }, + { + "epoch": 6.811923349893542, + "grad_norm": 0.18438740074634552, + "learning_rate": 9.319119943222144e-05, + "loss": 0.01751907467842102, + "step": 47990 + }, + { + "epoch": 6.813342796309439, + "grad_norm": 3.142836570739746, + "learning_rate": 9.318977998580554e-05, + "loss": 0.02677932381629944, + "step": 48000 + }, + { + "epoch": 6.813342796309439, + "eval_accuracy": 0.9809881096203981, + "eval_loss": 0.05622277408838272, + "eval_runtime": 32.8658, + "eval_samples_per_second": 478.522, + "eval_steps_per_second": 14.97, + "step": 48000 + }, + { + "epoch": 6.814762242725337, + "grad_norm": 2.0536131858825684, + "learning_rate": 9.318836053938965e-05, + "loss": 0.015174926817417144, + "step": 48010 + }, + { + "epoch": 6.816181689141235, + "grad_norm": 0.4630908966064453, + "learning_rate": 9.318694109297374e-05, + "loss": 0.030945992469787596, + "step": 48020 + }, + { + "epoch": 6.817601135557132, + "grad_norm": 4.387209415435791, + "learning_rate": 9.318552164655785e-05, + "loss": 0.03779844045639038, + "step": 48030 + }, + { + "epoch": 6.819020581973031, + "grad_norm": 2.2216169834136963, + "learning_rate": 9.318410220014195e-05, + "loss": 0.060065722465515135, + "step": 48040 + }, + { + "epoch": 6.820440028388928, + "grad_norm": 5.043936252593994, + "learning_rate": 9.318268275372605e-05, + "loss": 0.051817584037780764, + "step": 48050 + }, + { + "epoch": 6.8218594748048265, + "grad_norm": 5.093997001647949, + "learning_rate": 9.318126330731015e-05, + "loss": 0.0350986510515213, + "step": 48060 + }, + { + "epoch": 6.823278921220724, + "grad_norm": 0.955636203289032, + "learning_rate": 9.317984386089426e-05, + "loss": 0.06274473071098327, + "step": 48070 + }, + { + "epoch": 6.824698367636621, + "grad_norm": 0.037599656730890274, + "learning_rate": 9.317842441447836e-05, + "loss": 0.04869303405284882, + "step": 48080 + }, + { + "epoch": 6.82611781405252, + "grad_norm": 0.32732093334198, + "learning_rate": 
9.317700496806247e-05, + "loss": 0.03161635100841522, + "step": 48090 + }, + { + "epoch": 6.827537260468417, + "grad_norm": 2.418246030807495, + "learning_rate": 9.317558552164656e-05, + "loss": 0.029885494709014894, + "step": 48100 + }, + { + "epoch": 6.828956706884315, + "grad_norm": 6.135824203491211, + "learning_rate": 9.317416607523066e-05, + "loss": 0.03382058739662171, + "step": 48110 + }, + { + "epoch": 6.830376153300213, + "grad_norm": 3.056666374206543, + "learning_rate": 9.317274662881477e-05, + "loss": 0.030065348744392394, + "step": 48120 + }, + { + "epoch": 6.831795599716111, + "grad_norm": 2.694615602493286, + "learning_rate": 9.317132718239887e-05, + "loss": 0.08778796792030334, + "step": 48130 + }, + { + "epoch": 6.8332150461320085, + "grad_norm": 0.2775319218635559, + "learning_rate": 9.316990773598298e-05, + "loss": 0.07494470477104187, + "step": 48140 + }, + { + "epoch": 6.834634492547906, + "grad_norm": 4.01226282119751, + "learning_rate": 9.316848828956706e-05, + "loss": 0.02631029188632965, + "step": 48150 + }, + { + "epoch": 6.836053938963804, + "grad_norm": 6.277001857757568, + "learning_rate": 9.316706884315118e-05, + "loss": 0.046081721782684326, + "step": 48160 + }, + { + "epoch": 6.837473385379702, + "grad_norm": 6.287518501281738, + "learning_rate": 9.316564939673527e-05, + "loss": 0.07654207348823547, + "step": 48170 + }, + { + "epoch": 6.8388928317956, + "grad_norm": 2.8772833347320557, + "learning_rate": 9.316422995031938e-05, + "loss": 0.026699700951576234, + "step": 48180 + }, + { + "epoch": 6.840312278211497, + "grad_norm": 11.019613265991211, + "learning_rate": 9.316281050390348e-05, + "loss": 0.06047802567481995, + "step": 48190 + }, + { + "epoch": 6.841731724627396, + "grad_norm": 1.1007559299468994, + "learning_rate": 9.316139105748758e-05, + "loss": 0.01713552176952362, + "step": 48200 + }, + { + "epoch": 6.843151171043293, + "grad_norm": 1.7577693462371826, + "learning_rate": 9.315997161107169e-05, + "loss": 
0.07861257791519165, + "step": 48210 + }, + { + "epoch": 6.8445706174591905, + "grad_norm": 0.16370804607868195, + "learning_rate": 9.315855216465579e-05, + "loss": 0.02424170821905136, + "step": 48220 + }, + { + "epoch": 6.845990063875089, + "grad_norm": 0.40334025025367737, + "learning_rate": 9.31571327182399e-05, + "loss": 0.0454858660697937, + "step": 48230 + }, + { + "epoch": 6.847409510290986, + "grad_norm": 0.16940270364284515, + "learning_rate": 9.3155713271824e-05, + "loss": 0.020930516719818115, + "step": 48240 + }, + { + "epoch": 6.8488289567068845, + "grad_norm": 2.689833402633667, + "learning_rate": 9.315429382540809e-05, + "loss": 0.06456713676452637, + "step": 48250 + }, + { + "epoch": 6.850248403122782, + "grad_norm": 0.9107224345207214, + "learning_rate": 9.315287437899219e-05, + "loss": 0.0072081081569194795, + "step": 48260 + }, + { + "epoch": 6.85166784953868, + "grad_norm": 4.121362686157227, + "learning_rate": 9.31514549325763e-05, + "loss": 0.0435828447341919, + "step": 48270 + }, + { + "epoch": 6.853087295954578, + "grad_norm": 1.2586746215820312, + "learning_rate": 9.31500354861604e-05, + "loss": 0.09252724051475525, + "step": 48280 + }, + { + "epoch": 6.854506742370475, + "grad_norm": 1.9886664152145386, + "learning_rate": 9.314861603974451e-05, + "loss": 0.04249245822429657, + "step": 48290 + }, + { + "epoch": 6.855926188786373, + "grad_norm": 4.510488986968994, + "learning_rate": 9.31471965933286e-05, + "loss": 0.016424933075904848, + "step": 48300 + }, + { + "epoch": 6.857345635202271, + "grad_norm": 0.23531511425971985, + "learning_rate": 9.31457771469127e-05, + "loss": 0.02162374258041382, + "step": 48310 + }, + { + "epoch": 6.858765081618169, + "grad_norm": 7.018204689025879, + "learning_rate": 9.314435770049681e-05, + "loss": 0.05110843181610107, + "step": 48320 + }, + { + "epoch": 6.8601845280340665, + "grad_norm": 4.952686309814453, + "learning_rate": 9.314293825408091e-05, + "loss": 0.0650827169418335, + "step": 48330 + }, + { + 
"epoch": 6.861603974449965, + "grad_norm": 0.2199895679950714, + "learning_rate": 9.314151880766502e-05, + "loss": 0.018927814066410066, + "step": 48340 + }, + { + "epoch": 6.863023420865862, + "grad_norm": 6.606371879577637, + "learning_rate": 9.314009936124912e-05, + "loss": 0.032240912318229675, + "step": 48350 + }, + { + "epoch": 6.86444286728176, + "grad_norm": 1.6385270357131958, + "learning_rate": 9.313867991483322e-05, + "loss": 0.023154914379119873, + "step": 48360 + }, + { + "epoch": 6.865862313697658, + "grad_norm": 4.304486274719238, + "learning_rate": 9.313726046841731e-05, + "loss": 0.05492810606956482, + "step": 48370 + }, + { + "epoch": 6.867281760113555, + "grad_norm": 2.8906960487365723, + "learning_rate": 9.313584102200143e-05, + "loss": 0.06663978099822998, + "step": 48380 + }, + { + "epoch": 6.868701206529454, + "grad_norm": 0.6142754554748535, + "learning_rate": 9.313442157558552e-05, + "loss": 0.0662991464138031, + "step": 48390 + }, + { + "epoch": 6.870120652945351, + "grad_norm": 0.6287058591842651, + "learning_rate": 9.313300212916963e-05, + "loss": 0.06578343510627746, + "step": 48400 + }, + { + "epoch": 6.871540099361249, + "grad_norm": 6.606374740600586, + "learning_rate": 9.313158268275373e-05, + "loss": 0.10795985460281372, + "step": 48410 + }, + { + "epoch": 6.872959545777147, + "grad_norm": 3.5225772857666016, + "learning_rate": 9.313016323633783e-05, + "loss": 0.01849919557571411, + "step": 48420 + }, + { + "epoch": 6.874378992193044, + "grad_norm": 0.921964704990387, + "learning_rate": 9.312874378992194e-05, + "loss": 0.041754227876663205, + "step": 48430 + }, + { + "epoch": 6.875798438608943, + "grad_norm": 0.8441599011421204, + "learning_rate": 9.312732434350604e-05, + "loss": 0.11150503158569336, + "step": 48440 + }, + { + "epoch": 6.87721788502484, + "grad_norm": 0.2408447563648224, + "learning_rate": 9.312590489709015e-05, + "loss": 0.04510400295257568, + "step": 48450 + }, + { + "epoch": 6.878637331440738, + "grad_norm": 
4.951006889343262, + "learning_rate": 9.312448545067423e-05, + "loss": 0.06654103994369506, + "step": 48460 + }, + { + "epoch": 6.880056777856636, + "grad_norm": 0.20190109312534332, + "learning_rate": 9.312306600425834e-05, + "loss": 0.04683546721935272, + "step": 48470 + }, + { + "epoch": 6.881476224272534, + "grad_norm": 0.44238659739494324, + "learning_rate": 9.312164655784244e-05, + "loss": 0.07028831243515014, + "step": 48480 + }, + { + "epoch": 6.8828956706884314, + "grad_norm": 0.18961113691329956, + "learning_rate": 9.312022711142655e-05, + "loss": 0.0513995349407196, + "step": 48490 + }, + { + "epoch": 6.884315117104329, + "grad_norm": 1.5219275951385498, + "learning_rate": 9.311880766501065e-05, + "loss": 0.04622653126716614, + "step": 48500 + }, + { + "epoch": 6.884315117104329, + "eval_accuracy": 0.9761556558784257, + "eval_loss": 0.08012785017490387, + "eval_runtime": 33.1858, + "eval_samples_per_second": 473.908, + "eval_steps_per_second": 14.826, + "step": 48500 + }, + { + "epoch": 6.885734563520227, + "grad_norm": 7.168130397796631, + "learning_rate": 9.311738821859475e-05, + "loss": 0.0142391636967659, + "step": 48510 + }, + { + "epoch": 6.887154009936125, + "grad_norm": 0.36366888880729675, + "learning_rate": 9.311596877217886e-05, + "loss": 0.00685754343867302, + "step": 48520 + }, + { + "epoch": 6.888573456352023, + "grad_norm": 6.618060111999512, + "learning_rate": 9.311454932576295e-05, + "loss": 0.03779117166996002, + "step": 48530 + }, + { + "epoch": 6.88999290276792, + "grad_norm": 1.2769519090652466, + "learning_rate": 9.311312987934707e-05, + "loss": 0.05502520203590393, + "step": 48540 + }, + { + "epoch": 6.891412349183819, + "grad_norm": 2.1486284732818604, + "learning_rate": 9.311171043293116e-05, + "loss": 0.05189969539642334, + "step": 48550 + }, + { + "epoch": 6.892831795599716, + "grad_norm": 5.32498025894165, + "learning_rate": 9.311029098651526e-05, + "loss": 0.041870713233947754, + "step": 48560 + }, + { + "epoch": 
6.8942512420156135, + "grad_norm": 7.3712005615234375, + "learning_rate": 9.310887154009936e-05, + "loss": 0.05584554672241211, + "step": 48570 + }, + { + "epoch": 6.895670688431512, + "grad_norm": 0.048228669911623, + "learning_rate": 9.310745209368347e-05, + "loss": 0.025919014215469362, + "step": 48580 + }, + { + "epoch": 6.897090134847409, + "grad_norm": 0.2741209864616394, + "learning_rate": 9.310603264726757e-05, + "loss": 0.057539498805999754, + "step": 48590 + }, + { + "epoch": 6.8985095812633075, + "grad_norm": 0.655285656452179, + "learning_rate": 9.310461320085168e-05, + "loss": 0.10678447484970092, + "step": 48600 + }, + { + "epoch": 6.899929027679205, + "grad_norm": 0.24642018973827362, + "learning_rate": 9.310319375443577e-05, + "loss": 0.02704497575759888, + "step": 48610 + }, + { + "epoch": 6.901348474095103, + "grad_norm": 0.37187427282333374, + "learning_rate": 9.310177430801987e-05, + "loss": 0.02497977912425995, + "step": 48620 + }, + { + "epoch": 6.902767920511001, + "grad_norm": 1.8962280750274658, + "learning_rate": 9.310035486160398e-05, + "loss": 0.0649878740310669, + "step": 48630 + }, + { + "epoch": 6.904187366926898, + "grad_norm": 5.100073337554932, + "learning_rate": 9.309893541518808e-05, + "loss": 0.04320423901081085, + "step": 48640 + }, + { + "epoch": 6.905606813342796, + "grad_norm": 6.323354721069336, + "learning_rate": 9.309751596877219e-05, + "loss": 0.058730268478393556, + "step": 48650 + }, + { + "epoch": 6.907026259758694, + "grad_norm": 2.025700330734253, + "learning_rate": 9.309609652235629e-05, + "loss": 0.06454595327377319, + "step": 48660 + }, + { + "epoch": 6.908445706174592, + "grad_norm": 5.871410846710205, + "learning_rate": 9.309467707594039e-05, + "loss": 0.09823285341262818, + "step": 48670 + }, + { + "epoch": 6.9098651525904895, + "grad_norm": 11.58802604675293, + "learning_rate": 9.309325762952448e-05, + "loss": 0.07219201326370239, + "step": 48680 + }, + { + "epoch": 6.911284599006388, + "grad_norm": 
6.74500846862793, + "learning_rate": 9.30918381831086e-05, + "loss": 0.058211779594421385, + "step": 48690 + }, + { + "epoch": 6.912704045422285, + "grad_norm": 12.418977737426758, + "learning_rate": 9.309041873669269e-05, + "loss": 0.09747650027275086, + "step": 48700 + }, + { + "epoch": 6.914123491838183, + "grad_norm": 4.093213081359863, + "learning_rate": 9.30889992902768e-05, + "loss": 0.02640637755393982, + "step": 48710 + }, + { + "epoch": 6.915542938254081, + "grad_norm": 10.278468132019043, + "learning_rate": 9.30875798438609e-05, + "loss": 0.030793681740760803, + "step": 48720 + }, + { + "epoch": 6.916962384669978, + "grad_norm": 5.775954723358154, + "learning_rate": 9.3086160397445e-05, + "loss": 0.06745225191116333, + "step": 48730 + }, + { + "epoch": 6.918381831085877, + "grad_norm": 10.210089683532715, + "learning_rate": 9.308474095102911e-05, + "loss": 0.10631564855575562, + "step": 48740 + }, + { + "epoch": 6.919801277501774, + "grad_norm": 12.40491008758545, + "learning_rate": 9.30833215046132e-05, + "loss": 0.07292826771736145, + "step": 48750 + }, + { + "epoch": 6.921220723917672, + "grad_norm": 0.05215975269675255, + "learning_rate": 9.308190205819732e-05, + "loss": 0.051340538263320926, + "step": 48760 + }, + { + "epoch": 6.92264017033357, + "grad_norm": 0.206340029835701, + "learning_rate": 9.30804826117814e-05, + "loss": 0.017073613405227662, + "step": 48770 + }, + { + "epoch": 6.924059616749467, + "grad_norm": 1.0038329362869263, + "learning_rate": 9.307906316536551e-05, + "loss": 0.07039524912834168, + "step": 48780 + }, + { + "epoch": 6.9254790631653655, + "grad_norm": 0.03913341462612152, + "learning_rate": 9.307764371894961e-05, + "loss": 0.055343860387802125, + "step": 48790 + }, + { + "epoch": 6.926898509581263, + "grad_norm": 4.186570167541504, + "learning_rate": 9.307622427253372e-05, + "loss": 0.038757961988449094, + "step": 48800 + }, + { + "epoch": 6.928317955997161, + "grad_norm": 2.0609304904937744, + "learning_rate": 
9.307480482611782e-05, + "loss": 0.04881545305252075, + "step": 48810 + }, + { + "epoch": 6.929737402413059, + "grad_norm": 8.912321090698242, + "learning_rate": 9.307338537970191e-05, + "loss": 0.07206100821495057, + "step": 48820 + }, + { + "epoch": 6.931156848828957, + "grad_norm": 0.3043152689933777, + "learning_rate": 9.307196593328602e-05, + "loss": 0.0357146680355072, + "step": 48830 + }, + { + "epoch": 6.932576295244854, + "grad_norm": 1.626572847366333, + "learning_rate": 9.307054648687012e-05, + "loss": 0.03512680530548096, + "step": 48840 + }, + { + "epoch": 6.933995741660752, + "grad_norm": 0.24631650745868683, + "learning_rate": 9.306912704045423e-05, + "loss": 0.032503852248191835, + "step": 48850 + }, + { + "epoch": 6.93541518807665, + "grad_norm": 7.898240566253662, + "learning_rate": 9.306770759403833e-05, + "loss": 0.04666814804077148, + "step": 48860 + }, + { + "epoch": 6.936834634492548, + "grad_norm": 8.17198371887207, + "learning_rate": 9.306628814762243e-05, + "loss": 0.04198618233203888, + "step": 48870 + }, + { + "epoch": 6.938254080908446, + "grad_norm": 4.493419170379639, + "learning_rate": 9.306486870120652e-05, + "loss": 0.10232670307159424, + "step": 48880 + }, + { + "epoch": 6.939673527324343, + "grad_norm": 4.874084949493408, + "learning_rate": 9.306344925479064e-05, + "loss": 0.04881571829319, + "step": 48890 + }, + { + "epoch": 6.941092973740242, + "grad_norm": 0.2118980437517166, + "learning_rate": 9.306202980837473e-05, + "loss": 0.029319232702255248, + "step": 48900 + }, + { + "epoch": 6.942512420156139, + "grad_norm": 4.677948474884033, + "learning_rate": 9.306061036195884e-05, + "loss": 0.0549718976020813, + "step": 48910 + }, + { + "epoch": 6.943931866572036, + "grad_norm": 7.681908130645752, + "learning_rate": 9.305919091554294e-05, + "loss": 0.036868888139724734, + "step": 48920 + }, + { + "epoch": 6.945351312987935, + "grad_norm": 9.071455001831055, + "learning_rate": 9.305777146912704e-05, + "loss": 0.08308836221694946, + 
"step": 48930 + }, + { + "epoch": 6.946770759403832, + "grad_norm": 4.830014228820801, + "learning_rate": 9.305635202271115e-05, + "loss": 0.03471656739711761, + "step": 48940 + }, + { + "epoch": 6.9481902058197305, + "grad_norm": 2.677654504776001, + "learning_rate": 9.305493257629525e-05, + "loss": 0.03147899806499481, + "step": 48950 + }, + { + "epoch": 6.949609652235628, + "grad_norm": 8.488234519958496, + "learning_rate": 9.305351312987936e-05, + "loss": 0.040718674659729004, + "step": 48960 + }, + { + "epoch": 6.951029098651526, + "grad_norm": 1.8252172470092773, + "learning_rate": 9.305209368346344e-05, + "loss": 0.04164456129074097, + "step": 48970 + }, + { + "epoch": 6.952448545067424, + "grad_norm": 0.5312723517417908, + "learning_rate": 9.305067423704755e-05, + "loss": 0.04740914106369019, + "step": 48980 + }, + { + "epoch": 6.953867991483321, + "grad_norm": 4.89031982421875, + "learning_rate": 9.304925479063165e-05, + "loss": 0.07554548978805542, + "step": 48990 + }, + { + "epoch": 6.955287437899219, + "grad_norm": 1.1586805582046509, + "learning_rate": 9.304783534421576e-05, + "loss": 0.036046579480171204, + "step": 49000 + }, + { + "epoch": 6.955287437899219, + "eval_accuracy": 0.9715775418070833, + "eval_loss": 0.08274991810321808, + "eval_runtime": 33.2502, + "eval_samples_per_second": 472.99, + "eval_steps_per_second": 14.797, + "step": 49000 + }, + { + "epoch": 6.956706884315117, + "grad_norm": 3.4457671642303467, + "learning_rate": 9.304641589779987e-05, + "loss": 0.0518226146697998, + "step": 49010 + }, + { + "epoch": 6.958126330731015, + "grad_norm": 1.6784181594848633, + "learning_rate": 9.304499645138397e-05, + "loss": 0.08865286707878113, + "step": 49020 + }, + { + "epoch": 6.9595457771469125, + "grad_norm": 5.46137809753418, + "learning_rate": 9.304357700496807e-05, + "loss": 0.05337139964103699, + "step": 49030 + }, + { + "epoch": 6.960965223562811, + "grad_norm": 0.19309842586517334, + "learning_rate": 9.304215755855216e-05, + "loss": 
0.08255500793457031, + "step": 49040 + }, + { + "epoch": 6.962384669978708, + "grad_norm": 6.043724536895752, + "learning_rate": 9.304073811213628e-05, + "loss": 0.02880779802799225, + "step": 49050 + }, + { + "epoch": 6.963804116394606, + "grad_norm": 7.318505764007568, + "learning_rate": 9.303931866572037e-05, + "loss": 0.02839864492416382, + "step": 49060 + }, + { + "epoch": 6.965223562810504, + "grad_norm": 0.155448779463768, + "learning_rate": 9.303789921930448e-05, + "loss": 0.05681364536285401, + "step": 49070 + }, + { + "epoch": 6.966643009226401, + "grad_norm": 6.362581253051758, + "learning_rate": 9.303647977288857e-05, + "loss": 0.053470975160598753, + "step": 49080 + }, + { + "epoch": 6.9680624556423, + "grad_norm": 0.9104495644569397, + "learning_rate": 9.303506032647268e-05, + "loss": 0.042231276631355286, + "step": 49090 + }, + { + "epoch": 6.969481902058197, + "grad_norm": 0.3613532483577728, + "learning_rate": 9.303364088005679e-05, + "loss": 0.01866423785686493, + "step": 49100 + }, + { + "epoch": 6.970901348474095, + "grad_norm": 0.20154109597206116, + "learning_rate": 9.303222143364089e-05, + "loss": 0.020688405632972716, + "step": 49110 + }, + { + "epoch": 6.972320794889993, + "grad_norm": 0.06894083321094513, + "learning_rate": 9.3030801987225e-05, + "loss": 0.023301401734352113, + "step": 49120 + }, + { + "epoch": 6.97374024130589, + "grad_norm": 0.7792617678642273, + "learning_rate": 9.302938254080908e-05, + "loss": 0.035962840914726256, + "step": 49130 + }, + { + "epoch": 6.9751596877217885, + "grad_norm": 3.8199219703674316, + "learning_rate": 9.302796309439319e-05, + "loss": 0.05565328001976013, + "step": 49140 + }, + { + "epoch": 6.976579134137686, + "grad_norm": 4.88375186920166, + "learning_rate": 9.302654364797729e-05, + "loss": 0.018589270114898682, + "step": 49150 + }, + { + "epoch": 6.977998580553584, + "grad_norm": 2.5005977153778076, + "learning_rate": 9.30251242015614e-05, + "loss": 0.052044129371643065, + "step": 49160 + }, + { 
+ "epoch": 6.979418026969482, + "grad_norm": 0.2199069857597351, + "learning_rate": 9.30237047551455e-05, + "loss": 0.06527388095855713, + "step": 49170 + }, + { + "epoch": 6.98083747338538, + "grad_norm": 5.643954277038574, + "learning_rate": 9.30222853087296e-05, + "loss": 0.03402488529682159, + "step": 49180 + }, + { + "epoch": 6.982256919801277, + "grad_norm": 9.927678108215332, + "learning_rate": 9.30208658623137e-05, + "loss": 0.08523820042610168, + "step": 49190 + }, + { + "epoch": 6.983676366217175, + "grad_norm": 10.16280460357666, + "learning_rate": 9.30194464158978e-05, + "loss": 0.07182406187057495, + "step": 49200 + }, + { + "epoch": 6.985095812633073, + "grad_norm": 8.577272415161133, + "learning_rate": 9.301802696948191e-05, + "loss": 0.08476966619491577, + "step": 49210 + }, + { + "epoch": 6.9865152590489705, + "grad_norm": 3.291269063949585, + "learning_rate": 9.301660752306601e-05, + "loss": 0.05766218900680542, + "step": 49220 + }, + { + "epoch": 6.987934705464869, + "grad_norm": 0.36031633615493774, + "learning_rate": 9.301518807665011e-05, + "loss": 0.03990486562252045, + "step": 49230 + }, + { + "epoch": 6.989354151880766, + "grad_norm": 2.6701886653900146, + "learning_rate": 9.30137686302342e-05, + "loss": 0.04657737016677856, + "step": 49240 + }, + { + "epoch": 6.990773598296665, + "grad_norm": 6.119255065917969, + "learning_rate": 9.301234918381832e-05, + "loss": 0.058797252178192136, + "step": 49250 + }, + { + "epoch": 6.992193044712562, + "grad_norm": 11.877795219421387, + "learning_rate": 9.301092973740241e-05, + "loss": 0.07917346358299256, + "step": 49260 + }, + { + "epoch": 6.99361249112846, + "grad_norm": 5.940694332122803, + "learning_rate": 9.300951029098653e-05, + "loss": 0.048585915565490724, + "step": 49270 + }, + { + "epoch": 6.995031937544358, + "grad_norm": 0.06978671997785568, + "learning_rate": 9.300809084457062e-05, + "loss": 0.038115686178207396, + "step": 49280 + }, + { + "epoch": 6.996451383960255, + "grad_norm": 
4.792726039886475, + "learning_rate": 9.300667139815472e-05, + "loss": 0.04403967261314392, + "step": 49290 + }, + { + "epoch": 6.997870830376153, + "grad_norm": 0.2641933858394623, + "learning_rate": 9.300525195173883e-05, + "loss": 0.09480243921279907, + "step": 49300 + }, + { + "epoch": 6.999290276792051, + "grad_norm": 0.3769291043281555, + "learning_rate": 9.300383250532293e-05, + "loss": 0.04385235905647278, + "step": 49310 + }, + { + "epoch": 7.000709723207949, + "grad_norm": 1.2710357904434204, + "learning_rate": 9.300241305890704e-05, + "loss": 0.04902540445327759, + "step": 49320 + }, + { + "epoch": 7.002129169623847, + "grad_norm": 3.25337815284729, + "learning_rate": 9.300099361249112e-05, + "loss": 0.06324410438537598, + "step": 49330 + }, + { + "epoch": 7.003548616039745, + "grad_norm": 3.0381252765655518, + "learning_rate": 9.299957416607523e-05, + "loss": 0.02603963911533356, + "step": 49340 + }, + { + "epoch": 7.004968062455642, + "grad_norm": 5.802962303161621, + "learning_rate": 9.299815471965933e-05, + "loss": 0.07219669818878174, + "step": 49350 + }, + { + "epoch": 7.00638750887154, + "grad_norm": 1.1525763273239136, + "learning_rate": 9.299673527324344e-05, + "loss": 0.026941490173339844, + "step": 49360 + }, + { + "epoch": 7.007806955287438, + "grad_norm": 3.169665575027466, + "learning_rate": 9.299531582682754e-05, + "loss": 0.01637158840894699, + "step": 49370 + }, + { + "epoch": 7.009226401703335, + "grad_norm": 6.44553279876709, + "learning_rate": 9.299389638041165e-05, + "loss": 0.01533316820859909, + "step": 49380 + }, + { + "epoch": 7.010645848119234, + "grad_norm": 0.5937087535858154, + "learning_rate": 9.299247693399575e-05, + "loss": 0.03281160593032837, + "step": 49390 + }, + { + "epoch": 7.012065294535131, + "grad_norm": 0.40654799342155457, + "learning_rate": 9.299105748757985e-05, + "loss": 0.05567447543144226, + "step": 49400 + }, + { + "epoch": 7.0134847409510295, + "grad_norm": 1.6091256141662598, + "learning_rate": 
9.298963804116396e-05, + "loss": 0.0387098491191864, + "step": 49410 + }, + { + "epoch": 7.014904187366927, + "grad_norm": 0.13329650461673737, + "learning_rate": 9.298821859474805e-05, + "loss": 0.07836799025535583, + "step": 49420 + }, + { + "epoch": 7.016323633782824, + "grad_norm": 5.576003074645996, + "learning_rate": 9.298679914833217e-05, + "loss": 0.08060545921325683, + "step": 49430 + }, + { + "epoch": 7.017743080198723, + "grad_norm": 0.44313862919807434, + "learning_rate": 9.298537970191625e-05, + "loss": 0.0336797297000885, + "step": 49440 + }, + { + "epoch": 7.01916252661462, + "grad_norm": 7.476051330566406, + "learning_rate": 9.298396025550036e-05, + "loss": 0.06888166666030884, + "step": 49450 + }, + { + "epoch": 7.020581973030518, + "grad_norm": 0.964314877986908, + "learning_rate": 9.298254080908446e-05, + "loss": 0.08780239224433899, + "step": 49460 + }, + { + "epoch": 7.022001419446416, + "grad_norm": 5.419530391693115, + "learning_rate": 9.298112136266857e-05, + "loss": 0.04995315372943878, + "step": 49470 + }, + { + "epoch": 7.023420865862314, + "grad_norm": 1.7097113132476807, + "learning_rate": 9.297970191625267e-05, + "loss": 0.01210094690322876, + "step": 49480 + }, + { + "epoch": 7.0248403122782115, + "grad_norm": 4.466053485870361, + "learning_rate": 9.297828246983676e-05, + "loss": 0.05786159634590149, + "step": 49490 + }, + { + "epoch": 7.026259758694109, + "grad_norm": 4.688916206359863, + "learning_rate": 9.297686302342087e-05, + "loss": 0.032867801189422605, + "step": 49500 + }, + { + "epoch": 7.026259758694109, + "eval_accuracy": 0.9810516945380555, + "eval_loss": 0.05638516694307327, + "eval_runtime": 32.4152, + "eval_samples_per_second": 485.174, + "eval_steps_per_second": 15.178, + "step": 49500 + }, + { + "epoch": 7.027679205110007, + "grad_norm": 0.7310358881950378, + "learning_rate": 9.297544357700497e-05, + "loss": 0.019714725017547608, + "step": 49510 + }, + { + "epoch": 7.029098651525905, + "grad_norm": 
0.09676510095596313, + "learning_rate": 9.297402413058908e-05, + "loss": 0.034003442525863646, + "step": 49520 + }, + { + "epoch": 7.030518097941803, + "grad_norm": 0.2067088782787323, + "learning_rate": 9.297260468417318e-05, + "loss": 0.031689074635505673, + "step": 49530 + }, + { + "epoch": 7.0319375443577, + "grad_norm": 0.0486597940325737, + "learning_rate": 9.297118523775728e-05, + "loss": 0.02177567034959793, + "step": 49540 + }, + { + "epoch": 7.033356990773599, + "grad_norm": 0.3509371876716614, + "learning_rate": 9.296976579134137e-05, + "loss": 0.032621800899505615, + "step": 49550 + }, + { + "epoch": 7.034776437189496, + "grad_norm": 1.1296992301940918, + "learning_rate": 9.296834634492549e-05, + "loss": 0.081687331199646, + "step": 49560 + }, + { + "epoch": 7.0361958836053935, + "grad_norm": 8.385297775268555, + "learning_rate": 9.296692689850958e-05, + "loss": 0.02719823718070984, + "step": 49570 + }, + { + "epoch": 7.037615330021292, + "grad_norm": 0.13415709137916565, + "learning_rate": 9.29655074520937e-05, + "loss": 0.028894478082656862, + "step": 49580 + }, + { + "epoch": 7.039034776437189, + "grad_norm": 1.7251049280166626, + "learning_rate": 9.296408800567779e-05, + "loss": 0.04444833397865296, + "step": 49590 + }, + { + "epoch": 7.0404542228530875, + "grad_norm": 0.09141341596841812, + "learning_rate": 9.296266855926189e-05, + "loss": 0.12254937887191772, + "step": 49600 + }, + { + "epoch": 7.041873669268985, + "grad_norm": 1.8580803871154785, + "learning_rate": 9.2961249112846e-05, + "loss": 0.047639891505241394, + "step": 49610 + }, + { + "epoch": 7.043293115684883, + "grad_norm": 2.467607259750366, + "learning_rate": 9.29598296664301e-05, + "loss": 0.04458892643451691, + "step": 49620 + }, + { + "epoch": 7.044712562100781, + "grad_norm": 0.6312891244888306, + "learning_rate": 9.295841022001421e-05, + "loss": 0.034400665760040285, + "step": 49630 + }, + { + "epoch": 7.046132008516678, + "grad_norm": 6.18006706237793, + "learning_rate": 
9.295699077359829e-05, + "loss": 0.04445726573467255, + "step": 49640 + }, + { + "epoch": 7.047551454932576, + "grad_norm": 6.739034652709961, + "learning_rate": 9.29555713271824e-05, + "loss": 0.10003730058670043, + "step": 49650 + }, + { + "epoch": 7.048970901348474, + "grad_norm": 2.1692278385162354, + "learning_rate": 9.29541518807665e-05, + "loss": 0.044187459349632266, + "step": 49660 + }, + { + "epoch": 7.050390347764372, + "grad_norm": 0.19314740598201752, + "learning_rate": 9.295273243435061e-05, + "loss": 0.05206958651542663, + "step": 49670 + }, + { + "epoch": 7.0518097941802695, + "grad_norm": 0.11539632081985474, + "learning_rate": 9.295131298793471e-05, + "loss": 0.007926839590072631, + "step": 49680 + }, + { + "epoch": 7.053229240596168, + "grad_norm": 0.31761738657951355, + "learning_rate": 9.29498935415188e-05, + "loss": 0.024791139364242553, + "step": 49690 + }, + { + "epoch": 7.054648687012065, + "grad_norm": 4.63431978225708, + "learning_rate": 9.294847409510292e-05, + "loss": 0.04683954417705536, + "step": 49700 + }, + { + "epoch": 7.056068133427963, + "grad_norm": 4.371407985687256, + "learning_rate": 9.294705464868701e-05, + "loss": 0.06283467411994934, + "step": 49710 + }, + { + "epoch": 7.057487579843861, + "grad_norm": 2.356304168701172, + "learning_rate": 9.294563520227112e-05, + "loss": 0.01404203027486801, + "step": 49720 + }, + { + "epoch": 7.058907026259758, + "grad_norm": 6.363495349884033, + "learning_rate": 9.294421575585522e-05, + "loss": 0.04853463172912598, + "step": 49730 + }, + { + "epoch": 7.060326472675657, + "grad_norm": 3.6162188053131104, + "learning_rate": 9.294279630943933e-05, + "loss": 0.019771024584770203, + "step": 49740 + }, + { + "epoch": 7.061745919091554, + "grad_norm": 0.28996092081069946, + "learning_rate": 9.294137686302342e-05, + "loss": 0.009965144097805023, + "step": 49750 + }, + { + "epoch": 7.063165365507452, + "grad_norm": 1.2177865505218506, + "learning_rate": 9.293995741660753e-05, + "loss": 
0.025553053617477416, + "step": 49760 + }, + { + "epoch": 7.06458481192335, + "grad_norm": 6.402510643005371, + "learning_rate": 9.293853797019163e-05, + "loss": 0.07992073893547058, + "step": 49770 + }, + { + "epoch": 7.066004258339247, + "grad_norm": 1.7435178756713867, + "learning_rate": 9.293711852377574e-05, + "loss": 0.03621697425842285, + "step": 49780 + }, + { + "epoch": 7.067423704755146, + "grad_norm": 0.5364729762077332, + "learning_rate": 9.293569907735983e-05, + "loss": 0.03418395221233368, + "step": 49790 + }, + { + "epoch": 7.068843151171043, + "grad_norm": 3.1348342895507812, + "learning_rate": 9.293427963094393e-05, + "loss": 0.025732126832008363, + "step": 49800 + }, + { + "epoch": 7.070262597586941, + "grad_norm": 1.3703564405441284, + "learning_rate": 9.293286018452804e-05, + "loss": 0.03487118184566498, + "step": 49810 + }, + { + "epoch": 7.071682044002839, + "grad_norm": 0.9203046560287476, + "learning_rate": 9.293144073811214e-05, + "loss": 0.04656402170658112, + "step": 49820 + }, + { + "epoch": 7.073101490418737, + "grad_norm": 6.010906219482422, + "learning_rate": 9.293002129169625e-05, + "loss": 0.06883003711700439, + "step": 49830 + }, + { + "epoch": 7.0745209368346345, + "grad_norm": 0.20025935769081116, + "learning_rate": 9.292860184528035e-05, + "loss": 0.022409272193908692, + "step": 49840 + }, + { + "epoch": 7.075940383250532, + "grad_norm": 0.16005253791809082, + "learning_rate": 9.292718239886444e-05, + "loss": 0.028452104330062865, + "step": 49850 + }, + { + "epoch": 7.07735982966643, + "grad_norm": 2.2057294845581055, + "learning_rate": 9.292576295244854e-05, + "loss": 0.03202352523803711, + "step": 49860 + }, + { + "epoch": 7.078779276082328, + "grad_norm": 0.6192593574523926, + "learning_rate": 9.292434350603265e-05, + "loss": 0.03418879210948944, + "step": 49870 + }, + { + "epoch": 7.080198722498226, + "grad_norm": 5.115660190582275, + "learning_rate": 9.292292405961675e-05, + "loss": 0.019133344292640686, + "step": 49880 + 
}, + { + "epoch": 7.081618168914123, + "grad_norm": 0.1792406588792801, + "learning_rate": 9.292150461320086e-05, + "loss": 0.029404124617576598, + "step": 49890 + }, + { + "epoch": 7.083037615330022, + "grad_norm": 4.115320682525635, + "learning_rate": 9.292008516678496e-05, + "loss": 0.009263063967227935, + "step": 49900 + }, + { + "epoch": 7.084457061745919, + "grad_norm": 1.0660799741744995, + "learning_rate": 9.291866572036906e-05, + "loss": 0.024234510958194733, + "step": 49910 + }, + { + "epoch": 7.0858765081618165, + "grad_norm": 0.9303340911865234, + "learning_rate": 9.291724627395317e-05, + "loss": 0.04151077270507812, + "step": 49920 + }, + { + "epoch": 7.087295954577715, + "grad_norm": 3.377434253692627, + "learning_rate": 9.291582682753726e-05, + "loss": 0.0200410395860672, + "step": 49930 + }, + { + "epoch": 7.088715400993612, + "grad_norm": 12.069241523742676, + "learning_rate": 9.291440738112138e-05, + "loss": 0.0754818320274353, + "step": 49940 + }, + { + "epoch": 7.0901348474095105, + "grad_norm": 0.038232311606407166, + "learning_rate": 9.291298793470546e-05, + "loss": 0.009347131848335266, + "step": 49950 + }, + { + "epoch": 7.091554293825408, + "grad_norm": 4.304689884185791, + "learning_rate": 9.291156848828957e-05, + "loss": 0.06845600605010986, + "step": 49960 + }, + { + "epoch": 7.092973740241306, + "grad_norm": 0.4425681233406067, + "learning_rate": 9.291014904187367e-05, + "loss": 0.0835187554359436, + "step": 49970 + }, + { + "epoch": 7.094393186657204, + "grad_norm": 3.1910290718078613, + "learning_rate": 9.290872959545778e-05, + "loss": 0.0663329541683197, + "step": 49980 + }, + { + "epoch": 7.095812633073101, + "grad_norm": 13.372017860412598, + "learning_rate": 9.290731014904188e-05, + "loss": 0.06052567958831787, + "step": 49990 + }, + { + "epoch": 7.097232079488999, + "grad_norm": 15.484184265136719, + "learning_rate": 9.290589070262597e-05, + "loss": 0.0745700716972351, + "step": 50000 + }, + { + "epoch": 7.097232079488999, + 
"eval_accuracy": 0.9763464106313983, + "eval_loss": 0.08104771375656128, + "eval_runtime": 31.7815, + "eval_samples_per_second": 494.847, + "eval_steps_per_second": 15.481, + "step": 50000 + }, + { + "epoch": 7.098651525904897, + "grad_norm": 0.3437268137931824, + "learning_rate": 9.290447125621008e-05, + "loss": 0.032279747724533084, + "step": 50010 + }, + { + "epoch": 7.100070972320795, + "grad_norm": 1.0124702453613281, + "learning_rate": 9.290305180979418e-05, + "loss": 0.04395381212234497, + "step": 50020 + }, + { + "epoch": 7.1014904187366925, + "grad_norm": 0.9249765276908875, + "learning_rate": 9.290163236337829e-05, + "loss": 0.09170477390289307, + "step": 50030 + }, + { + "epoch": 7.102909865152591, + "grad_norm": 0.7250400185585022, + "learning_rate": 9.290021291696239e-05, + "loss": 0.009999457001686095, + "step": 50040 + }, + { + "epoch": 7.104329311568488, + "grad_norm": 5.4864044189453125, + "learning_rate": 9.28987934705465e-05, + "loss": 0.026081347465515138, + "step": 50050 + }, + { + "epoch": 7.105748757984386, + "grad_norm": 6.135810375213623, + "learning_rate": 9.289737402413058e-05, + "loss": 0.04017038345336914, + "step": 50060 + }, + { + "epoch": 7.107168204400284, + "grad_norm": 0.10332240909337997, + "learning_rate": 9.28959545777147e-05, + "loss": 0.028695687651634216, + "step": 50070 + }, + { + "epoch": 7.108587650816181, + "grad_norm": 0.26266011595726013, + "learning_rate": 9.289453513129879e-05, + "loss": 0.018574948608875274, + "step": 50080 + }, + { + "epoch": 7.11000709723208, + "grad_norm": 7.670289993286133, + "learning_rate": 9.28931156848829e-05, + "loss": 0.022504398226737977, + "step": 50090 + }, + { + "epoch": 7.111426543647977, + "grad_norm": 0.9531182050704956, + "learning_rate": 9.2891696238467e-05, + "loss": 0.0202168732881546, + "step": 50100 + }, + { + "epoch": 7.112845990063875, + "grad_norm": 1.2940996885299683, + "learning_rate": 9.28902767920511e-05, + "loss": 0.014882153272628785, + "step": 50110 + }, + { + 
"epoch": 7.114265436479773, + "grad_norm": 0.21062831580638885, + "learning_rate": 9.288885734563521e-05, + "loss": 0.01849692016839981, + "step": 50120 + }, + { + "epoch": 7.115684882895671, + "grad_norm": 0.2405148595571518, + "learning_rate": 9.28874378992193e-05, + "loss": 0.029791396856307984, + "step": 50130 + }, + { + "epoch": 7.1171043293115686, + "grad_norm": 0.14352326095104218, + "learning_rate": 9.288601845280342e-05, + "loss": 0.061136239767074586, + "step": 50140 + }, + { + "epoch": 7.118523775727466, + "grad_norm": 9.354962348937988, + "learning_rate": 9.288459900638752e-05, + "loss": 0.04090102016925812, + "step": 50150 + }, + { + "epoch": 7.119943222143364, + "grad_norm": 0.7026478052139282, + "learning_rate": 9.288317955997161e-05, + "loss": 0.009190419316291809, + "step": 50160 + }, + { + "epoch": 7.121362668559262, + "grad_norm": 0.5306406617164612, + "learning_rate": 9.288176011355571e-05, + "loss": 0.05512397289276123, + "step": 50170 + }, + { + "epoch": 7.12278211497516, + "grad_norm": 1.5007140636444092, + "learning_rate": 9.288034066713982e-05, + "loss": 0.059499156475067136, + "step": 50180 + }, + { + "epoch": 7.124201561391057, + "grad_norm": 2.3778538703918457, + "learning_rate": 9.287892122072392e-05, + "loss": 0.010339123010635377, + "step": 50190 + }, + { + "epoch": 7.125621007806956, + "grad_norm": 2.3378491401672363, + "learning_rate": 9.287750177430803e-05, + "loss": 0.027239561080932617, + "step": 50200 + }, + { + "epoch": 7.127040454222853, + "grad_norm": 5.091325283050537, + "learning_rate": 9.287608232789213e-05, + "loss": 0.04783933460712433, + "step": 50210 + }, + { + "epoch": 7.128459900638751, + "grad_norm": 6.505466461181641, + "learning_rate": 9.287466288147622e-05, + "loss": 0.04781743884086609, + "step": 50220 + }, + { + "epoch": 7.129879347054649, + "grad_norm": 5.033595561981201, + "learning_rate": 9.287324343506033e-05, + "loss": 0.0876664400100708, + "step": 50230 + }, + { + "epoch": 7.131298793470546, + 
"grad_norm": 0.27636995911598206, + "learning_rate": 9.287182398864443e-05, + "loss": 0.03849413990974426, + "step": 50240 + }, + { + "epoch": 7.132718239886445, + "grad_norm": 8.013169288635254, + "learning_rate": 9.287040454222854e-05, + "loss": 0.06555190682411194, + "step": 50250 + }, + { + "epoch": 7.134137686302342, + "grad_norm": 1.1230844259262085, + "learning_rate": 9.286898509581263e-05, + "loss": 0.06196191906929016, + "step": 50260 + }, + { + "epoch": 7.13555713271824, + "grad_norm": 1.3510446548461914, + "learning_rate": 9.286756564939674e-05, + "loss": 0.058738571405410764, + "step": 50270 + }, + { + "epoch": 7.136976579134138, + "grad_norm": 1.6742855310440063, + "learning_rate": 9.286614620298084e-05, + "loss": 0.06358702182769775, + "step": 50280 + }, + { + "epoch": 7.138396025550035, + "grad_norm": 5.48180627822876, + "learning_rate": 9.286472675656495e-05, + "loss": 0.051518088579177855, + "step": 50290 + }, + { + "epoch": 7.1398154719659335, + "grad_norm": 0.646102249622345, + "learning_rate": 9.286330731014906e-05, + "loss": 0.019508914649486543, + "step": 50300 + }, + { + "epoch": 7.141234918381831, + "grad_norm": 4.530608177185059, + "learning_rate": 9.286188786373314e-05, + "loss": 0.038636896014213565, + "step": 50310 + }, + { + "epoch": 7.142654364797729, + "grad_norm": 0.1688341349363327, + "learning_rate": 9.286046841731725e-05, + "loss": 0.020194944739341737, + "step": 50320 + }, + { + "epoch": 7.144073811213627, + "grad_norm": 0.30358272790908813, + "learning_rate": 9.285904897090135e-05, + "loss": 0.01204570233821869, + "step": 50330 + }, + { + "epoch": 7.145493257629525, + "grad_norm": 0.09064985811710358, + "learning_rate": 9.285762952448546e-05, + "loss": 0.04475564062595368, + "step": 50340 + }, + { + "epoch": 7.146912704045422, + "grad_norm": 3.235868453979492, + "learning_rate": 9.285621007806956e-05, + "loss": 0.016041412949562073, + "step": 50350 + }, + { + "epoch": 7.14833215046132, + "grad_norm": 0.30754321813583374, + 
"learning_rate": 9.285479063165365e-05, + "loss": 0.034186741709709166, + "step": 50360 + }, + { + "epoch": 7.149751596877218, + "grad_norm": 1.8013185262680054, + "learning_rate": 9.285337118523775e-05, + "loss": 0.037149444222450256, + "step": 50370 + }, + { + "epoch": 7.1511710432931155, + "grad_norm": 3.8225793838500977, + "learning_rate": 9.285195173882186e-05, + "loss": 0.026364266872406006, + "step": 50380 + }, + { + "epoch": 7.152590489709014, + "grad_norm": 1.6880011558532715, + "learning_rate": 9.285053229240597e-05, + "loss": 0.04398062229156494, + "step": 50390 + }, + { + "epoch": 7.154009936124911, + "grad_norm": 0.6532132029533386, + "learning_rate": 9.284911284599007e-05, + "loss": 0.04088074564933777, + "step": 50400 + }, + { + "epoch": 7.1554293825408095, + "grad_norm": 0.6275845170021057, + "learning_rate": 9.284769339957418e-05, + "loss": 0.03311673402786255, + "step": 50410 + }, + { + "epoch": 7.156848828956707, + "grad_norm": 8.379541397094727, + "learning_rate": 9.284627395315827e-05, + "loss": 0.06421371102333069, + "step": 50420 + }, + { + "epoch": 7.158268275372604, + "grad_norm": 0.3997638523578644, + "learning_rate": 9.284485450674238e-05, + "loss": 0.03623417913913727, + "step": 50430 + }, + { + "epoch": 7.159687721788503, + "grad_norm": 3.67484450340271, + "learning_rate": 9.284343506032647e-05, + "loss": 0.07285212278366089, + "step": 50440 + }, + { + "epoch": 7.1611071682044, + "grad_norm": 10.62584400177002, + "learning_rate": 9.284201561391059e-05, + "loss": 0.05084116458892822, + "step": 50450 + }, + { + "epoch": 7.162526614620298, + "grad_norm": 4.304548740386963, + "learning_rate": 9.284059616749468e-05, + "loss": 0.05473562479019165, + "step": 50460 + }, + { + "epoch": 7.163946061036196, + "grad_norm": 0.5049075484275818, + "learning_rate": 9.283917672107878e-05, + "loss": 0.03764554262161255, + "step": 50470 + }, + { + "epoch": 7.165365507452094, + "grad_norm": 7.426001071929932, + "learning_rate": 9.283775727466288e-05, + 
"loss": 0.07254430651664734, + "step": 50480 + }, + { + "epoch": 7.1667849538679915, + "grad_norm": 7.990915775299072, + "learning_rate": 9.283633782824699e-05, + "loss": 0.07232113480567932, + "step": 50490 + }, + { + "epoch": 7.168204400283889, + "grad_norm": 6.232173919677734, + "learning_rate": 9.28349183818311e-05, + "loss": 0.057746291160583496, + "step": 50500 + }, + { + "epoch": 7.168204400283889, + "eval_accuracy": 0.9734850893368093, + "eval_loss": 0.08124034851789474, + "eval_runtime": 32.7664, + "eval_samples_per_second": 479.973, + "eval_steps_per_second": 15.015, + "step": 50500 + }, + { + "epoch": 7.169623846699787, + "grad_norm": 1.4507334232330322, + "learning_rate": 9.28334989354152e-05, + "loss": 0.02919427752494812, + "step": 50510 + }, + { + "epoch": 7.171043293115685, + "grad_norm": 5.406381607055664, + "learning_rate": 9.28320794889993e-05, + "loss": 0.06213176846504211, + "step": 50520 + }, + { + "epoch": 7.172462739531583, + "grad_norm": 0.06627151370048523, + "learning_rate": 9.283066004258339e-05, + "loss": 0.020755791664123537, + "step": 50530 + }, + { + "epoch": 7.17388218594748, + "grad_norm": 6.034979820251465, + "learning_rate": 9.28292405961675e-05, + "loss": 0.04492558836936951, + "step": 50540 + }, + { + "epoch": 7.175301632363379, + "grad_norm": 5.751996994018555, + "learning_rate": 9.28278211497516e-05, + "loss": 0.03700354397296905, + "step": 50550 + }, + { + "epoch": 7.176721078779276, + "grad_norm": 4.298292636871338, + "learning_rate": 9.282640170333571e-05, + "loss": 0.0337158739566803, + "step": 50560 + }, + { + "epoch": 7.1781405251951735, + "grad_norm": 0.4003825783729553, + "learning_rate": 9.28249822569198e-05, + "loss": 0.04332021772861481, + "step": 50570 + }, + { + "epoch": 7.179559971611072, + "grad_norm": 5.302791595458984, + "learning_rate": 9.28235628105039e-05, + "loss": 0.02911527454853058, + "step": 50580 + }, + { + "epoch": 7.180979418026969, + "grad_norm": 6.431861400604248, + "learning_rate": 
9.282214336408802e-05, + "loss": 0.05217592716217041, + "step": 50590 + }, + { + "epoch": 7.182398864442868, + "grad_norm": 0.46359333395957947, + "learning_rate": 9.28208658623137e-05, + "loss": 0.10249303579330445, + "step": 50600 + }, + { + "epoch": 7.183818310858765, + "grad_norm": 0.7804433703422546, + "learning_rate": 9.28194464158978e-05, + "loss": 0.031027427315711974, + "step": 50610 + }, + { + "epoch": 7.185237757274663, + "grad_norm": 0.3068195581436157, + "learning_rate": 9.281802696948191e-05, + "loss": 0.050842708349227904, + "step": 50620 + }, + { + "epoch": 7.186657203690561, + "grad_norm": 0.4566839337348938, + "learning_rate": 9.281660752306601e-05, + "loss": 0.05524548292160034, + "step": 50630 + }, + { + "epoch": 7.188076650106458, + "grad_norm": 12.260385513305664, + "learning_rate": 9.28151880766501e-05, + "loss": 0.07461308240890503, + "step": 50640 + }, + { + "epoch": 7.189496096522356, + "grad_norm": 8.496342658996582, + "learning_rate": 9.281376863023422e-05, + "loss": 0.03306256532669068, + "step": 50650 + }, + { + "epoch": 7.190915542938254, + "grad_norm": 0.06557629257440567, + "learning_rate": 9.281234918381831e-05, + "loss": 0.023129934072494508, + "step": 50660 + }, + { + "epoch": 7.192334989354152, + "grad_norm": 0.11483972519636154, + "learning_rate": 9.281092973740242e-05, + "loss": 0.039566820859909056, + "step": 50670 + }, + { + "epoch": 7.19375443577005, + "grad_norm": 0.21403329074382782, + "learning_rate": 9.280951029098652e-05, + "loss": 0.026081389188766478, + "step": 50680 + }, + { + "epoch": 7.195173882185948, + "grad_norm": 0.513241171836853, + "learning_rate": 9.280809084457062e-05, + "loss": 0.07000666260719299, + "step": 50690 + }, + { + "epoch": 7.196593328601845, + "grad_norm": 0.19217133522033691, + "learning_rate": 9.280667139815472e-05, + "loss": 0.025768482685089113, + "step": 50700 + }, + { + "epoch": 7.198012775017743, + "grad_norm": 10.442272186279297, + "learning_rate": 9.280525195173883e-05, + "loss": 
0.049917465448379515, + "step": 50710 + }, + { + "epoch": 7.199432221433641, + "grad_norm": 4.785938262939453, + "learning_rate": 9.280383250532292e-05, + "loss": 0.05000673532485962, + "step": 50720 + }, + { + "epoch": 7.2008516678495385, + "grad_norm": 0.14389902353286743, + "learning_rate": 9.280241305890704e-05, + "loss": 0.06158308386802673, + "step": 50730 + }, + { + "epoch": 7.202271114265437, + "grad_norm": 0.6582837104797363, + "learning_rate": 9.280099361249113e-05, + "loss": 0.007644937932491302, + "step": 50740 + }, + { + "epoch": 7.203690560681334, + "grad_norm": 6.236335277557373, + "learning_rate": 9.279957416607523e-05, + "loss": 0.0622941255569458, + "step": 50750 + }, + { + "epoch": 7.2051100070972325, + "grad_norm": 4.699357509613037, + "learning_rate": 9.279815471965934e-05, + "loss": 0.10394492149353027, + "step": 50760 + }, + { + "epoch": 7.20652945351313, + "grad_norm": 2.788632392883301, + "learning_rate": 9.279673527324344e-05, + "loss": 0.05861709117889404, + "step": 50770 + }, + { + "epoch": 7.207948899929027, + "grad_norm": 7.0571513175964355, + "learning_rate": 9.279531582682755e-05, + "loss": 0.09752376079559326, + "step": 50780 + }, + { + "epoch": 7.209368346344926, + "grad_norm": 1.116761565208435, + "learning_rate": 9.279389638041165e-05, + "loss": 0.07887290120124817, + "step": 50790 + }, + { + "epoch": 7.210787792760823, + "grad_norm": 2.5623505115509033, + "learning_rate": 9.279247693399574e-05, + "loss": 0.06459965705871581, + "step": 50800 + }, + { + "epoch": 7.212207239176721, + "grad_norm": 0.9416432976722717, + "learning_rate": 9.279105748757984e-05, + "loss": 0.02110120952129364, + "step": 50810 + }, + { + "epoch": 7.213626685592619, + "grad_norm": 0.32998695969581604, + "learning_rate": 9.278963804116395e-05, + "loss": 0.054609715938568115, + "step": 50820 + }, + { + "epoch": 7.215046132008517, + "grad_norm": 0.8758980631828308, + "learning_rate": 9.278821859474805e-05, + "loss": 0.03850732147693634, + "step": 50830 + }, + 
{ + "epoch": 7.2164655784244145, + "grad_norm": 4.321909427642822, + "learning_rate": 9.278679914833216e-05, + "loss": 0.01589832454919815, + "step": 50840 + }, + { + "epoch": 7.217885024840312, + "grad_norm": 0.6561715602874756, + "learning_rate": 9.278537970191626e-05, + "loss": 0.04444275200366974, + "step": 50850 + }, + { + "epoch": 7.21930447125621, + "grad_norm": 0.4713881015777588, + "learning_rate": 9.278396025550036e-05, + "loss": 0.05532388091087341, + "step": 50860 + }, + { + "epoch": 7.220723917672108, + "grad_norm": 3.666248083114624, + "learning_rate": 9.278254080908447e-05, + "loss": 0.03535724282264709, + "step": 50870 + }, + { + "epoch": 7.222143364088006, + "grad_norm": 7.525345802307129, + "learning_rate": 9.278112136266856e-05, + "loss": 0.062016028165817264, + "step": 50880 + }, + { + "epoch": 7.223562810503903, + "grad_norm": 4.87885046005249, + "learning_rate": 9.277970191625267e-05, + "loss": 0.031429398059844973, + "step": 50890 + }, + { + "epoch": 7.224982256919802, + "grad_norm": 0.5300387144088745, + "learning_rate": 9.277828246983676e-05, + "loss": 0.025107333064079286, + "step": 50900 + }, + { + "epoch": 7.226401703335699, + "grad_norm": 2.4232094287872314, + "learning_rate": 9.277686302342087e-05, + "loss": 0.043664786219596866, + "step": 50910 + }, + { + "epoch": 7.2278211497515965, + "grad_norm": 1.0089149475097656, + "learning_rate": 9.277544357700497e-05, + "loss": 0.013369449973106384, + "step": 50920 + }, + { + "epoch": 7.229240596167495, + "grad_norm": 3.6185240745544434, + "learning_rate": 9.277402413058908e-05, + "loss": 0.04417063593864441, + "step": 50930 + }, + { + "epoch": 7.230660042583392, + "grad_norm": 1.650692343711853, + "learning_rate": 9.277260468417317e-05, + "loss": 0.02690485417842865, + "step": 50940 + }, + { + "epoch": 7.2320794889992905, + "grad_norm": 0.08742735534906387, + "learning_rate": 9.277118523775727e-05, + "loss": 0.06844819188117982, + "step": 50950 + }, + { + "epoch": 7.233498935415188, + 
"grad_norm": 0.1685476005077362, + "learning_rate": 9.276976579134138e-05, + "loss": 0.07185621857643128, + "step": 50960 + }, + { + "epoch": 7.234918381831086, + "grad_norm": 0.1583138257265091, + "learning_rate": 9.276834634492548e-05, + "loss": 0.019683878123760223, + "step": 50970 + }, + { + "epoch": 7.236337828246984, + "grad_norm": 1.6047672033309937, + "learning_rate": 9.276692689850959e-05, + "loss": 0.038810908794403076, + "step": 50980 + }, + { + "epoch": 7.237757274662881, + "grad_norm": 1.8266907930374146, + "learning_rate": 9.276550745209369e-05, + "loss": 0.04252366423606872, + "step": 50990 + }, + { + "epoch": 7.239176721078779, + "grad_norm": 3.5572457313537598, + "learning_rate": 9.276408800567779e-05, + "loss": 0.032119011878967284, + "step": 51000 + }, + { + "epoch": 7.239176721078779, + "eval_accuracy": 0.9795256565142748, + "eval_loss": 0.058488957583904266, + "eval_runtime": 33.388, + "eval_samples_per_second": 471.038, + "eval_steps_per_second": 14.736, + "step": 51000 + }, + { + "epoch": 7.240596167494677, + "grad_norm": 0.8323566317558289, + "learning_rate": 9.276266855926188e-05, + "loss": 0.03890877366065979, + "step": 51010 + }, + { + "epoch": 7.242015613910575, + "grad_norm": 5.316452980041504, + "learning_rate": 9.2761249112846e-05, + "loss": 0.045993471145629884, + "step": 51020 + }, + { + "epoch": 7.2434350603264726, + "grad_norm": 0.26393166184425354, + "learning_rate": 9.275982966643009e-05, + "loss": 0.051994693279266355, + "step": 51030 + }, + { + "epoch": 7.244854506742371, + "grad_norm": 0.12484807521104813, + "learning_rate": 9.27584102200142e-05, + "loss": 0.021381711959838866, + "step": 51040 + }, + { + "epoch": 7.246273953158268, + "grad_norm": 1.613708257675171, + "learning_rate": 9.27569907735983e-05, + "loss": 0.06344324350357056, + "step": 51050 + }, + { + "epoch": 7.247693399574166, + "grad_norm": 0.032920923084020615, + "learning_rate": 9.27555713271824e-05, + "loss": 0.06143348813056946, + "step": 51060 + }, + { + 
"epoch": 7.249112845990064, + "grad_norm": 7.846259593963623, + "learning_rate": 9.275415188076651e-05, + "loss": 0.027347713708877563, + "step": 51070 + }, + { + "epoch": 7.250532292405961, + "grad_norm": 16.31854248046875, + "learning_rate": 9.27527324343506e-05, + "loss": 0.03906906247138977, + "step": 51080 + }, + { + "epoch": 7.25195173882186, + "grad_norm": 1.4159921407699585, + "learning_rate": 9.275131298793472e-05, + "loss": 0.02941213846206665, + "step": 51090 + }, + { + "epoch": 7.253371185237757, + "grad_norm": 7.948617935180664, + "learning_rate": 9.274989354151881e-05, + "loss": 0.05467774271965027, + "step": 51100 + }, + { + "epoch": 7.2547906316536555, + "grad_norm": 0.22301502525806427, + "learning_rate": 9.274847409510291e-05, + "loss": 0.057220518589019775, + "step": 51110 + }, + { + "epoch": 7.256210078069553, + "grad_norm": 3.7436046600341797, + "learning_rate": 9.274705464868701e-05, + "loss": 0.07023456692695618, + "step": 51120 + }, + { + "epoch": 7.25762952448545, + "grad_norm": 0.8607221245765686, + "learning_rate": 9.274563520227112e-05, + "loss": 0.028274688124656677, + "step": 51130 + }, + { + "epoch": 7.259048970901349, + "grad_norm": 2.3814191818237305, + "learning_rate": 9.274421575585522e-05, + "loss": 0.02873871922492981, + "step": 51140 + }, + { + "epoch": 7.260468417317246, + "grad_norm": 0.7200617790222168, + "learning_rate": 9.274279630943933e-05, + "loss": 0.023874977231025697, + "step": 51150 + }, + { + "epoch": 7.261887863733144, + "grad_norm": 6.621379375457764, + "learning_rate": 9.274137686302343e-05, + "loss": 0.036438983678817746, + "step": 51160 + }, + { + "epoch": 7.263307310149042, + "grad_norm": 1.800775170326233, + "learning_rate": 9.273995741660752e-05, + "loss": 0.023814010620117187, + "step": 51170 + }, + { + "epoch": 7.26472675656494, + "grad_norm": 1.1323634386062622, + "learning_rate": 9.273853797019163e-05, + "loss": 0.03467157781124115, + "step": 51180 + }, + { + "epoch": 7.2661462029808375, + "grad_norm": 
2.3260347843170166, + "learning_rate": 9.273711852377573e-05, + "loss": 0.03501830995082855, + "step": 51190 + }, + { + "epoch": 7.267565649396735, + "grad_norm": 0.3461279571056366, + "learning_rate": 9.273569907735984e-05, + "loss": 0.02026255279779434, + "step": 51200 + }, + { + "epoch": 7.268985095812633, + "grad_norm": 8.559354782104492, + "learning_rate": 9.273427963094393e-05, + "loss": 0.0435549259185791, + "step": 51210 + }, + { + "epoch": 7.270404542228531, + "grad_norm": 5.178162574768066, + "learning_rate": 9.273286018452804e-05, + "loss": 0.03414537012577057, + "step": 51220 + }, + { + "epoch": 7.271823988644429, + "grad_norm": 4.830866813659668, + "learning_rate": 9.273144073811213e-05, + "loss": 0.0681527316570282, + "step": 51230 + }, + { + "epoch": 7.273243435060326, + "grad_norm": 1.7771570682525635, + "learning_rate": 9.273002129169625e-05, + "loss": 0.0468337744474411, + "step": 51240 + }, + { + "epoch": 7.274662881476225, + "grad_norm": 0.07269584387540817, + "learning_rate": 9.272860184528036e-05, + "loss": 0.02570006251335144, + "step": 51250 + }, + { + "epoch": 7.276082327892122, + "grad_norm": 5.584953308105469, + "learning_rate": 9.272718239886444e-05, + "loss": 0.06317769289016724, + "step": 51260 + }, + { + "epoch": 7.2775017743080195, + "grad_norm": 5.165135383605957, + "learning_rate": 9.272576295244855e-05, + "loss": 0.039837440848350524, + "step": 51270 + }, + { + "epoch": 7.278921220723918, + "grad_norm": 2.866654634475708, + "learning_rate": 9.272434350603265e-05, + "loss": 0.03418941795825958, + "step": 51280 + }, + { + "epoch": 7.280340667139815, + "grad_norm": 7.37003755569458, + "learning_rate": 9.272292405961676e-05, + "loss": 0.09569458365440368, + "step": 51290 + }, + { + "epoch": 7.2817601135557135, + "grad_norm": 0.13972480595111847, + "learning_rate": 9.272150461320086e-05, + "loss": 0.043325915932655334, + "step": 51300 + }, + { + "epoch": 7.283179559971611, + "grad_norm": 6.135573387145996, + "learning_rate": 
9.272008516678495e-05, + "loss": 0.04155711829662323, + "step": 51310 + }, + { + "epoch": 7.284599006387509, + "grad_norm": 0.10803493857383728, + "learning_rate": 9.271866572036905e-05, + "loss": 0.06946607828140258, + "step": 51320 + }, + { + "epoch": 7.286018452803407, + "grad_norm": 0.6979092955589294, + "learning_rate": 9.271724627395316e-05, + "loss": 0.03188393712043762, + "step": 51330 + }, + { + "epoch": 7.287437899219304, + "grad_norm": 2.248213291168213, + "learning_rate": 9.271582682753727e-05, + "loss": 0.057016730308532715, + "step": 51340 + }, + { + "epoch": 7.288857345635202, + "grad_norm": 0.8164470791816711, + "learning_rate": 9.271440738112137e-05, + "loss": 0.0411460280418396, + "step": 51350 + }, + { + "epoch": 7.2902767920511, + "grad_norm": 1.162987232208252, + "learning_rate": 9.271298793470547e-05, + "loss": 0.06204650998115539, + "step": 51360 + }, + { + "epoch": 7.291696238466998, + "grad_norm": 0.28414785861968994, + "learning_rate": 9.271156848828957e-05, + "loss": 0.07645946145057678, + "step": 51370 + }, + { + "epoch": 7.2931156848828955, + "grad_norm": 4.3621506690979, + "learning_rate": 9.271014904187368e-05, + "loss": 0.06504054665565491, + "step": 51380 + }, + { + "epoch": 7.294535131298794, + "grad_norm": 1.3629231452941895, + "learning_rate": 9.270872959545777e-05, + "loss": 0.05250757336616516, + "step": 51390 + }, + { + "epoch": 7.295954577714691, + "grad_norm": 0.1615055948495865, + "learning_rate": 9.270731014904188e-05, + "loss": 0.06251652240753174, + "step": 51400 + }, + { + "epoch": 7.297374024130589, + "grad_norm": 0.4495696723461151, + "learning_rate": 9.270589070262597e-05, + "loss": 0.02343766689300537, + "step": 51410 + }, + { + "epoch": 7.298793470546487, + "grad_norm": 0.5210222005844116, + "learning_rate": 9.270447125621008e-05, + "loss": 0.02886645793914795, + "step": 51420 + }, + { + "epoch": 7.300212916962384, + "grad_norm": 1.460491418838501, + "learning_rate": 9.270305180979419e-05, + "loss": 
0.05959440469741821, + "step": 51430 + }, + { + "epoch": 7.301632363378283, + "grad_norm": 8.31848430633545, + "learning_rate": 9.270163236337829e-05, + "loss": 0.027948886156082153, + "step": 51440 + }, + { + "epoch": 7.30305180979418, + "grad_norm": 0.22754132747650146, + "learning_rate": 9.27002129169624e-05, + "loss": 0.0412900447845459, + "step": 51450 + }, + { + "epoch": 7.304471256210078, + "grad_norm": 1.9300298690795898, + "learning_rate": 9.26987934705465e-05, + "loss": 0.07148487567901611, + "step": 51460 + }, + { + "epoch": 7.305890702625976, + "grad_norm": 3.128782272338867, + "learning_rate": 9.269737402413059e-05, + "loss": 0.035791015625, + "step": 51470 + }, + { + "epoch": 7.307310149041873, + "grad_norm": 2.424389600753784, + "learning_rate": 9.269595457771469e-05, + "loss": 0.04335363209247589, + "step": 51480 + }, + { + "epoch": 7.308729595457772, + "grad_norm": 6.377645969390869, + "learning_rate": 9.26945351312988e-05, + "loss": 0.027349942922592164, + "step": 51490 + }, + { + "epoch": 7.310149041873669, + "grad_norm": 2.207902193069458, + "learning_rate": 9.26931156848829e-05, + "loss": 0.0474819153547287, + "step": 51500 + }, + { + "epoch": 7.310149041873669, + "eval_accuracy": 0.9765371653843709, + "eval_loss": 0.07471198588609695, + "eval_runtime": 33.8232, + "eval_samples_per_second": 464.976, + "eval_steps_per_second": 14.546, + "step": 51500 + }, + { + "epoch": 7.311568488289567, + "grad_norm": 1.7475389242172241, + "learning_rate": 9.269169623846701e-05, + "loss": 0.032795050740242006, + "step": 51510 + }, + { + "epoch": 7.312987934705465, + "grad_norm": 0.86590576171875, + "learning_rate": 9.269027679205111e-05, + "loss": 0.023722925782203676, + "step": 51520 + }, + { + "epoch": 7.314407381121363, + "grad_norm": 2.6370997428894043, + "learning_rate": 9.26888573456352e-05, + "loss": 0.05528920292854309, + "step": 51530 + }, + { + "epoch": 7.31582682753726, + "grad_norm": 0.4650154411792755, + "learning_rate": 9.268743789921932e-05, + 
"loss": 0.0307805597782135, + "step": 51540 + }, + { + "epoch": 7.317246273953158, + "grad_norm": 0.9520929455757141, + "learning_rate": 9.268601845280341e-05, + "loss": 0.0699259340763092, + "step": 51550 + }, + { + "epoch": 7.318665720369056, + "grad_norm": 0.24121342599391937, + "learning_rate": 9.268459900638752e-05, + "loss": 0.024345090985298155, + "step": 51560 + }, + { + "epoch": 7.320085166784954, + "grad_norm": 3.9583966732025146, + "learning_rate": 9.268317955997161e-05, + "loss": 0.018575282394886018, + "step": 51570 + }, + { + "epoch": 7.321504613200852, + "grad_norm": 2.0327770709991455, + "learning_rate": 9.268176011355572e-05, + "loss": 0.04089633226394653, + "step": 51580 + }, + { + "epoch": 7.322924059616749, + "grad_norm": 9.078707695007324, + "learning_rate": 9.268034066713982e-05, + "loss": 0.041544276475906375, + "step": 51590 + }, + { + "epoch": 7.324343506032648, + "grad_norm": 1.0922521352767944, + "learning_rate": 9.267892122072393e-05, + "loss": 0.03794052600860596, + "step": 51600 + }, + { + "epoch": 7.325762952448545, + "grad_norm": 12.491849899291992, + "learning_rate": 9.267750177430802e-05, + "loss": 0.02722683846950531, + "step": 51610 + }, + { + "epoch": 7.3271823988644424, + "grad_norm": 0.5909501910209656, + "learning_rate": 9.267608232789212e-05, + "loss": 0.01713113784790039, + "step": 51620 + }, + { + "epoch": 7.328601845280341, + "grad_norm": 6.109976768493652, + "learning_rate": 9.267466288147623e-05, + "loss": 0.033209878206253055, + "step": 51630 + }, + { + "epoch": 7.330021291696238, + "grad_norm": 0.014347659423947334, + "learning_rate": 9.267324343506033e-05, + "loss": 0.03499612212181091, + "step": 51640 + }, + { + "epoch": 7.3314407381121365, + "grad_norm": 3.9987330436706543, + "learning_rate": 9.267182398864444e-05, + "loss": 0.06285910606384278, + "step": 51650 + }, + { + "epoch": 7.332860184528034, + "grad_norm": 1.168338418006897, + "learning_rate": 9.267040454222854e-05, + "loss": 0.05484226942062378, + "step": 
51660 + }, + { + "epoch": 7.334279630943932, + "grad_norm": 0.682669997215271, + "learning_rate": 9.266898509581264e-05, + "loss": 0.07656214237213135, + "step": 51670 + }, + { + "epoch": 7.33569907735983, + "grad_norm": 0.18298189342021942, + "learning_rate": 9.266756564939673e-05, + "loss": 0.01965651214122772, + "step": 51680 + }, + { + "epoch": 7.337118523775727, + "grad_norm": 0.39247646927833557, + "learning_rate": 9.266614620298084e-05, + "loss": 0.04974568784236908, + "step": 51690 + }, + { + "epoch": 7.338537970191625, + "grad_norm": 1.2671245336532593, + "learning_rate": 9.266472675656494e-05, + "loss": 0.03542390167713165, + "step": 51700 + }, + { + "epoch": 7.339957416607523, + "grad_norm": 11.112316131591797, + "learning_rate": 9.266330731014905e-05, + "loss": 0.04434594511985779, + "step": 51710 + }, + { + "epoch": 7.341376863023421, + "grad_norm": 1.1118406057357788, + "learning_rate": 9.266188786373315e-05, + "loss": 0.02555614709854126, + "step": 51720 + }, + { + "epoch": 7.3427963094393185, + "grad_norm": 3.3655858039855957, + "learning_rate": 9.266046841731725e-05, + "loss": 0.014670975506305695, + "step": 51730 + }, + { + "epoch": 7.344215755855217, + "grad_norm": 4.719293117523193, + "learning_rate": 9.265904897090136e-05, + "loss": 0.042307913303375244, + "step": 51740 + }, + { + "epoch": 7.345635202271114, + "grad_norm": 0.2682708203792572, + "learning_rate": 9.265762952448546e-05, + "loss": 0.018885372579097746, + "step": 51750 + }, + { + "epoch": 7.347054648687012, + "grad_norm": 4.208601474761963, + "learning_rate": 9.265621007806957e-05, + "loss": 0.06062667965888977, + "step": 51760 + }, + { + "epoch": 7.34847409510291, + "grad_norm": 2.183436155319214, + "learning_rate": 9.265479063165365e-05, + "loss": 0.054003679752349855, + "step": 51770 + }, + { + "epoch": 7.349893541518807, + "grad_norm": 2.069380760192871, + "learning_rate": 9.265337118523776e-05, + "loss": 0.020816315710544587, + "step": 51780 + }, + { + "epoch": 
7.351312987934706, + "grad_norm": 0.5530464053153992, + "learning_rate": 9.265195173882186e-05, + "loss": 0.038864347338676455, + "step": 51790 + }, + { + "epoch": 7.352732434350603, + "grad_norm": 5.707481861114502, + "learning_rate": 9.265053229240597e-05, + "loss": 0.06501655578613282, + "step": 51800 + }, + { + "epoch": 7.354151880766501, + "grad_norm": 6.6138153076171875, + "learning_rate": 9.264911284599007e-05, + "loss": 0.055298763513565066, + "step": 51810 + }, + { + "epoch": 7.355571327182399, + "grad_norm": 1.1508212089538574, + "learning_rate": 9.264769339957418e-05, + "loss": 0.01087142527103424, + "step": 51820 + }, + { + "epoch": 7.356990773598296, + "grad_norm": 6.416627883911133, + "learning_rate": 9.264627395315827e-05, + "loss": 0.046122944355010985, + "step": 51830 + }, + { + "epoch": 7.3584102200141945, + "grad_norm": 0.3396334946155548, + "learning_rate": 9.264485450674237e-05, + "loss": 0.037940219044685364, + "step": 51840 + }, + { + "epoch": 7.359829666430092, + "grad_norm": 3.5331318378448486, + "learning_rate": 9.264343506032648e-05, + "loss": 0.04645732939243317, + "step": 51850 + }, + { + "epoch": 7.36124911284599, + "grad_norm": 0.4708951711654663, + "learning_rate": 9.264201561391058e-05, + "loss": 0.06385657787322999, + "step": 51860 + }, + { + "epoch": 7.362668559261888, + "grad_norm": 0.5442249178886414, + "learning_rate": 9.264059616749469e-05, + "loss": 0.021492500603199006, + "step": 51870 + }, + { + "epoch": 7.364088005677786, + "grad_norm": 4.089636325836182, + "learning_rate": 9.263917672107878e-05, + "loss": 0.018961572647094728, + "step": 51880 + }, + { + "epoch": 7.365507452093683, + "grad_norm": 4.164590358734131, + "learning_rate": 9.263775727466289e-05, + "loss": 0.021182073652744292, + "step": 51890 + }, + { + "epoch": 7.366926898509581, + "grad_norm": 0.1637565642595291, + "learning_rate": 9.263633782824698e-05, + "loss": 0.01856728196144104, + "step": 51900 + }, + { + "epoch": 7.368346344925479, + "grad_norm": 
7.452282428741455, + "learning_rate": 9.26349183818311e-05, + "loss": 0.049343031644821164, + "step": 51910 + }, + { + "epoch": 7.3697657913413765, + "grad_norm": 3.365506172180176, + "learning_rate": 9.263349893541519e-05, + "loss": 0.042214390635490415, + "step": 51920 + }, + { + "epoch": 7.371185237757275, + "grad_norm": 0.08252433687448502, + "learning_rate": 9.263207948899929e-05, + "loss": 0.01741803586483002, + "step": 51930 + }, + { + "epoch": 7.372604684173172, + "grad_norm": 1.889232873916626, + "learning_rate": 9.26306600425834e-05, + "loss": 0.020835280418395996, + "step": 51940 + }, + { + "epoch": 7.374024130589071, + "grad_norm": 7.806169509887695, + "learning_rate": 9.26292405961675e-05, + "loss": 0.07517807483673096, + "step": 51950 + }, + { + "epoch": 7.375443577004968, + "grad_norm": 5.743335723876953, + "learning_rate": 9.262782114975161e-05, + "loss": 0.030621072649955748, + "step": 51960 + }, + { + "epoch": 7.376863023420865, + "grad_norm": 7.148108959197998, + "learning_rate": 9.26264017033357e-05, + "loss": 0.05479738712310791, + "step": 51970 + }, + { + "epoch": 7.378282469836764, + "grad_norm": 0.13797208666801453, + "learning_rate": 9.26249822569198e-05, + "loss": 0.01111309826374054, + "step": 51980 + }, + { + "epoch": 7.379701916252661, + "grad_norm": 0.9274955987930298, + "learning_rate": 9.26235628105039e-05, + "loss": 0.019465911388397216, + "step": 51990 + }, + { + "epoch": 7.3811213626685594, + "grad_norm": 1.8465386629104614, + "learning_rate": 9.262214336408801e-05, + "loss": 0.02890079915523529, + "step": 52000 + }, + { + "epoch": 7.3811213626685594, + "eval_accuracy": 0.9748203726076174, + "eval_loss": 0.07954176515340805, + "eval_runtime": 32.7811, + "eval_samples_per_second": 479.758, + "eval_steps_per_second": 15.009, + "step": 52000 + }, + { + "epoch": 7.382540809084457, + "grad_norm": 8.025490760803223, + "learning_rate": 9.262072391767211e-05, + "loss": 0.10265064239501953, + "step": 52010 + }, + { + "epoch": 
7.383960255500355, + "grad_norm": 0.5458213686943054, + "learning_rate": 9.261930447125622e-05, + "loss": 0.02167295664548874, + "step": 52020 + }, + { + "epoch": 7.385379701916253, + "grad_norm": 1.3527144193649292, + "learning_rate": 9.261788502484032e-05, + "loss": 0.016633424162864684, + "step": 52030 + }, + { + "epoch": 7.38679914833215, + "grad_norm": 4.071643829345703, + "learning_rate": 9.261646557842441e-05, + "loss": 0.05185266137123108, + "step": 52040 + }, + { + "epoch": 7.388218594748048, + "grad_norm": 2.140740156173706, + "learning_rate": 9.261504613200853e-05, + "loss": 0.008098404109477996, + "step": 52050 + }, + { + "epoch": 7.389638041163946, + "grad_norm": 0.37996429204940796, + "learning_rate": 9.261362668559262e-05, + "loss": 0.03676808774471283, + "step": 52060 + }, + { + "epoch": 7.391057487579844, + "grad_norm": 3.958872079849243, + "learning_rate": 9.261220723917673e-05, + "loss": 0.030224177241325378, + "step": 52070 + }, + { + "epoch": 7.3924769339957415, + "grad_norm": 2.3154561519622803, + "learning_rate": 9.261078779276082e-05, + "loss": 0.0440337210893631, + "step": 52080 + }, + { + "epoch": 7.39389638041164, + "grad_norm": 10.971945762634277, + "learning_rate": 9.260936834634493e-05, + "loss": 0.08902759552001953, + "step": 52090 + }, + { + "epoch": 7.395315826827537, + "grad_norm": 3.9263570308685303, + "learning_rate": 9.260794889992903e-05, + "loss": 0.07159250974655151, + "step": 52100 + }, + { + "epoch": 7.396735273243435, + "grad_norm": 7.3109049797058105, + "learning_rate": 9.260652945351314e-05, + "loss": 0.0619121789932251, + "step": 52110 + }, + { + "epoch": 7.398154719659333, + "grad_norm": 2.75917911529541, + "learning_rate": 9.260511000709723e-05, + "loss": 0.05869649052619934, + "step": 52120 + }, + { + "epoch": 7.39957416607523, + "grad_norm": 2.691560983657837, + "learning_rate": 9.260369056068133e-05, + "loss": 0.07508642673492431, + "step": 52130 + }, + { + "epoch": 7.400993612491129, + "grad_norm": 
0.7861066460609436, + "learning_rate": 9.260227111426544e-05, + "loss": 0.08113704323768615, + "step": 52140 + }, + { + "epoch": 7.402413058907026, + "grad_norm": 11.484288215637207, + "learning_rate": 9.260085166784954e-05, + "loss": 0.04761863350868225, + "step": 52150 + }, + { + "epoch": 7.403832505322924, + "grad_norm": 2.5290586948394775, + "learning_rate": 9.259943222143365e-05, + "loss": 0.053852963447570804, + "step": 52160 + }, + { + "epoch": 7.405251951738822, + "grad_norm": 7.964162826538086, + "learning_rate": 9.259801277501775e-05, + "loss": 0.03996648192405701, + "step": 52170 + }, + { + "epoch": 7.406671398154719, + "grad_norm": 0.3172842562198639, + "learning_rate": 9.259659332860186e-05, + "loss": 0.03223178386688232, + "step": 52180 + }, + { + "epoch": 7.4080908445706175, + "grad_norm": 1.804693341255188, + "learning_rate": 9.259517388218594e-05, + "loss": 0.04552800059318542, + "step": 52190 + }, + { + "epoch": 7.409510290986515, + "grad_norm": 4.020263195037842, + "learning_rate": 9.259375443577005e-05, + "loss": 0.10396513938903809, + "step": 52200 + }, + { + "epoch": 7.410929737402413, + "grad_norm": 0.20161767303943634, + "learning_rate": 9.259233498935415e-05, + "loss": 0.031718161702156064, + "step": 52210 + }, + { + "epoch": 7.412349183818311, + "grad_norm": 0.14328189194202423, + "learning_rate": 9.259091554293826e-05, + "loss": 0.028231072425842284, + "step": 52220 + }, + { + "epoch": 7.413768630234209, + "grad_norm": 3.6482315063476562, + "learning_rate": 9.258949609652236e-05, + "loss": 0.01824479103088379, + "step": 52230 + }, + { + "epoch": 7.415188076650106, + "grad_norm": 3.887988328933716, + "learning_rate": 9.258807665010646e-05, + "loss": 0.030091965198516847, + "step": 52240 + }, + { + "epoch": 7.416607523066004, + "grad_norm": 0.033909570425748825, + "learning_rate": 9.258665720369057e-05, + "loss": 0.04004532396793366, + "step": 52250 + }, + { + "epoch": 7.418026969481902, + "grad_norm": 1.5418171882629395, + "learning_rate": 
9.258523775727467e-05, + "loss": 0.009149040281772613, + "step": 52260 + }, + { + "epoch": 7.4194464158977995, + "grad_norm": 5.650069713592529, + "learning_rate": 9.258381831085878e-05, + "loss": 0.019167867302894593, + "step": 52270 + }, + { + "epoch": 7.420865862313698, + "grad_norm": 1.2293617725372314, + "learning_rate": 9.258239886444287e-05, + "loss": 0.042755690217018125, + "step": 52280 + }, + { + "epoch": 7.422285308729595, + "grad_norm": 6.387624263763428, + "learning_rate": 9.258097941802697e-05, + "loss": 0.026754480600357056, + "step": 52290 + }, + { + "epoch": 7.4237047551454936, + "grad_norm": 10.783160209655762, + "learning_rate": 9.257955997161107e-05, + "loss": 0.04979313015937805, + "step": 52300 + }, + { + "epoch": 7.425124201561391, + "grad_norm": 11.399337768554688, + "learning_rate": 9.257814052519518e-05, + "loss": 0.050091004371643065, + "step": 52310 + }, + { + "epoch": 7.426543647977288, + "grad_norm": 5.577615737915039, + "learning_rate": 9.257672107877928e-05, + "loss": 0.03373092114925384, + "step": 52320 + }, + { + "epoch": 7.427963094393187, + "grad_norm": 4.476070880889893, + "learning_rate": 9.257530163236339e-05, + "loss": 0.05749149918556214, + "step": 52330 + }, + { + "epoch": 7.429382540809084, + "grad_norm": 0.015291991643607616, + "learning_rate": 9.257388218594748e-05, + "loss": 0.03193310499191284, + "step": 52340 + }, + { + "epoch": 7.430801987224982, + "grad_norm": 10.38379192352295, + "learning_rate": 9.257246273953158e-05, + "loss": 0.06969671249389649, + "step": 52350 + }, + { + "epoch": 7.43222143364088, + "grad_norm": 0.3225395381450653, + "learning_rate": 9.25710432931157e-05, + "loss": 0.025198325514793396, + "step": 52360 + }, + { + "epoch": 7.433640880056778, + "grad_norm": 3.8596174716949463, + "learning_rate": 9.256962384669979e-05, + "loss": 0.033972108364105226, + "step": 52370 + }, + { + "epoch": 7.435060326472676, + "grad_norm": 6.728971481323242, + "learning_rate": 9.25682044002839e-05, + "loss": 
0.14436639547348024, + "step": 52380 + }, + { + "epoch": 7.436479772888574, + "grad_norm": 0.24112731218338013, + "learning_rate": 9.256678495386799e-05, + "loss": 0.03348296284675598, + "step": 52390 + }, + { + "epoch": 7.437899219304471, + "grad_norm": 4.913303852081299, + "learning_rate": 9.25653655074521e-05, + "loss": 0.03284276723861694, + "step": 52400 + }, + { + "epoch": 7.439318665720369, + "grad_norm": 0.03473828732967377, + "learning_rate": 9.25639460610362e-05, + "loss": 0.020089390873908996, + "step": 52410 + }, + { + "epoch": 7.440738112136267, + "grad_norm": 11.010733604431152, + "learning_rate": 9.25625266146203e-05, + "loss": 0.03317167162895203, + "step": 52420 + }, + { + "epoch": 7.442157558552164, + "grad_norm": 0.09397601336240768, + "learning_rate": 9.25611071682044e-05, + "loss": 0.05286313891410828, + "step": 52430 + }, + { + "epoch": 7.443577004968063, + "grad_norm": 5.776946544647217, + "learning_rate": 9.25596877217885e-05, + "loss": 0.061042767763137815, + "step": 52440 + }, + { + "epoch": 7.44499645138396, + "grad_norm": 10.642087936401367, + "learning_rate": 9.255826827537261e-05, + "loss": 0.08132562637329102, + "step": 52450 + }, + { + "epoch": 7.4464158977998585, + "grad_norm": 0.35297006368637085, + "learning_rate": 9.255684882895671e-05, + "loss": 0.016444140672683717, + "step": 52460 + }, + { + "epoch": 7.447835344215756, + "grad_norm": 7.420588493347168, + "learning_rate": 9.255542938254082e-05, + "loss": 0.07923081517219543, + "step": 52470 + }, + { + "epoch": 7.449254790631653, + "grad_norm": 0.862054705619812, + "learning_rate": 9.255400993612492e-05, + "loss": 0.008070911467075347, + "step": 52480 + }, + { + "epoch": 7.450674237047552, + "grad_norm": 1.598931074142456, + "learning_rate": 9.255259048970903e-05, + "loss": 0.0262810617685318, + "step": 52490 + }, + { + "epoch": 7.452093683463449, + "grad_norm": 0.47471916675567627, + "learning_rate": 9.255117104329311e-05, + "loss": 0.05102187395095825, + "step": 52500 + }, + { 
+ "epoch": 7.452093683463449, + "eval_accuracy": 0.983976600750302, + "eval_loss": 0.048219986259937286, + "eval_runtime": 32.6578, + "eval_samples_per_second": 481.569, + "eval_steps_per_second": 15.065, + "step": 52500 + }, + { + "epoch": 7.453513129879347, + "grad_norm": 0.05551528558135033, + "learning_rate": 9.254975159687722e-05, + "loss": 0.0365988701581955, + "step": 52510 + }, + { + "epoch": 7.454932576295245, + "grad_norm": 0.480398952960968, + "learning_rate": 9.254833215046132e-05, + "loss": 0.04306910634040832, + "step": 52520 + }, + { + "epoch": 7.456352022711143, + "grad_norm": 11.236347198486328, + "learning_rate": 9.254691270404543e-05, + "loss": 0.05073235034942627, + "step": 52530 + }, + { + "epoch": 7.4577714691270405, + "grad_norm": 2.5973517894744873, + "learning_rate": 9.254549325762954e-05, + "loss": 0.02977212965488434, + "step": 52540 + }, + { + "epoch": 7.459190915542938, + "grad_norm": 0.21879629790782928, + "learning_rate": 9.254407381121362e-05, + "loss": 0.06971742510795594, + "step": 52550 + }, + { + "epoch": 7.460610361958836, + "grad_norm": 2.3356027603149414, + "learning_rate": 9.254265436479774e-05, + "loss": 0.05115787982940674, + "step": 52560 + }, + { + "epoch": 7.462029808374734, + "grad_norm": 0.18059638142585754, + "learning_rate": 9.254123491838183e-05, + "loss": 0.04363165497779846, + "step": 52570 + }, + { + "epoch": 7.463449254790632, + "grad_norm": 0.360032320022583, + "learning_rate": 9.253981547196594e-05, + "loss": 0.032967600226402285, + "step": 52580 + }, + { + "epoch": 7.464868701206529, + "grad_norm": 2.464231491088867, + "learning_rate": 9.253839602555004e-05, + "loss": 0.011615180224180222, + "step": 52590 + }, + { + "epoch": 7.466288147622428, + "grad_norm": 0.08998509496450424, + "learning_rate": 9.253697657913414e-05, + "loss": 0.06994263529777527, + "step": 52600 + }, + { + "epoch": 7.467707594038325, + "grad_norm": 0.38058972358703613, + "learning_rate": 9.253555713271824e-05, + "loss": 
0.03467016220092774, + "step": 52610 + }, + { + "epoch": 7.4691270404542225, + "grad_norm": 1.0466068983078003, + "learning_rate": 9.253413768630235e-05, + "loss": 0.07900729179382324, + "step": 52620 + }, + { + "epoch": 7.470546486870121, + "grad_norm": 8.88718318939209, + "learning_rate": 9.253271823988644e-05, + "loss": 0.043406492471694945, + "step": 52630 + }, + { + "epoch": 7.471965933286018, + "grad_norm": 9.799579620361328, + "learning_rate": 9.253129879347056e-05, + "loss": 0.07508601546287537, + "step": 52640 + }, + { + "epoch": 7.4733853797019165, + "grad_norm": 0.019487710669636726, + "learning_rate": 9.252987934705465e-05, + "loss": 0.03812042474746704, + "step": 52650 + }, + { + "epoch": 7.474804826117814, + "grad_norm": 5.969607830047607, + "learning_rate": 9.252845990063875e-05, + "loss": 0.07696850299835205, + "step": 52660 + }, + { + "epoch": 7.476224272533712, + "grad_norm": 0.35891488194465637, + "learning_rate": 9.252704045422286e-05, + "loss": 0.026730889081954957, + "step": 52670 + }, + { + "epoch": 7.47764371894961, + "grad_norm": 1.599028468132019, + "learning_rate": 9.252562100780696e-05, + "loss": 0.020498314499855043, + "step": 52680 + }, + { + "epoch": 7.479063165365507, + "grad_norm": 0.4268326759338379, + "learning_rate": 9.252420156139107e-05, + "loss": 0.06487103700637817, + "step": 52690 + }, + { + "epoch": 7.480482611781405, + "grad_norm": 8.808951377868652, + "learning_rate": 9.252278211497515e-05, + "loss": 0.06087350845336914, + "step": 52700 + }, + { + "epoch": 7.481902058197303, + "grad_norm": 1.0655536651611328, + "learning_rate": 9.252136266855926e-05, + "loss": 0.02627456784248352, + "step": 52710 + }, + { + "epoch": 7.483321504613201, + "grad_norm": 0.672973096370697, + "learning_rate": 9.251994322214336e-05, + "loss": 0.03866781890392303, + "step": 52720 + }, + { + "epoch": 7.4847409510290985, + "grad_norm": 7.424531936645508, + "learning_rate": 9.251852377572747e-05, + "loss": 0.02564384639263153, + "step": 52730 + }, + 
{ + "epoch": 7.486160397444997, + "grad_norm": 3.822476863861084, + "learning_rate": 9.251710432931158e-05, + "loss": 0.02148028612136841, + "step": 52740 + }, + { + "epoch": 7.487579843860894, + "grad_norm": 4.0289306640625, + "learning_rate": 9.251568488289567e-05, + "loss": 0.03844795525074005, + "step": 52750 + }, + { + "epoch": 7.488999290276792, + "grad_norm": 3.0319392681121826, + "learning_rate": 9.251426543647978e-05, + "loss": 0.05241814851760864, + "step": 52760 + }, + { + "epoch": 7.49041873669269, + "grad_norm": 0.08324091881513596, + "learning_rate": 9.251284599006388e-05, + "loss": 0.02561030685901642, + "step": 52770 + }, + { + "epoch": 7.491838183108587, + "grad_norm": 0.40191054344177246, + "learning_rate": 9.251142654364799e-05, + "loss": 0.03448401093482971, + "step": 52780 + }, + { + "epoch": 7.493257629524486, + "grad_norm": 8.233901977539062, + "learning_rate": 9.251000709723208e-05, + "loss": 0.06319097876548767, + "step": 52790 + }, + { + "epoch": 7.494677075940383, + "grad_norm": 0.34823575615882874, + "learning_rate": 9.250858765081618e-05, + "loss": 0.038350042700767514, + "step": 52800 + }, + { + "epoch": 7.496096522356281, + "grad_norm": 0.2975291907787323, + "learning_rate": 9.250716820440028e-05, + "loss": 0.057574158906936644, + "step": 52810 + }, + { + "epoch": 7.497515968772179, + "grad_norm": 4.405351638793945, + "learning_rate": 9.250574875798439e-05, + "loss": 0.032033723592758176, + "step": 52820 + }, + { + "epoch": 7.498935415188076, + "grad_norm": 10.231863021850586, + "learning_rate": 9.25043293115685e-05, + "loss": 0.05307228565216064, + "step": 52830 + }, + { + "epoch": 7.500354861603975, + "grad_norm": 4.901642799377441, + "learning_rate": 9.25029098651526e-05, + "loss": 0.09208908081054687, + "step": 52840 + }, + { + "epoch": 7.501774308019872, + "grad_norm": 5.971859931945801, + "learning_rate": 9.250149041873671e-05, + "loss": 0.05865171551704407, + "step": 52850 + }, + { + "epoch": 7.50319375443577, + "grad_norm": 
0.3640846312046051, + "learning_rate": 9.250007097232079e-05, + "loss": 0.040382787585258484, + "step": 52860 + }, + { + "epoch": 7.504613200851668, + "grad_norm": 0.23144324123859406, + "learning_rate": 9.24986515259049e-05, + "loss": 0.08787302970886231, + "step": 52870 + }, + { + "epoch": 7.506032647267566, + "grad_norm": 1.2069907188415527, + "learning_rate": 9.2497232079489e-05, + "loss": 0.057518255710601804, + "step": 52880 + }, + { + "epoch": 7.5074520936834634, + "grad_norm": 0.7209001183509827, + "learning_rate": 9.249581263307311e-05, + "loss": 0.05926448106765747, + "step": 52890 + }, + { + "epoch": 7.508871540099361, + "grad_norm": 3.756991386413574, + "learning_rate": 9.249439318665721e-05, + "loss": 0.055469298362731935, + "step": 52900 + }, + { + "epoch": 7.510290986515259, + "grad_norm": 13.026274681091309, + "learning_rate": 9.24929737402413e-05, + "loss": 0.07141894102096558, + "step": 52910 + }, + { + "epoch": 7.511710432931157, + "grad_norm": 1.6600936651229858, + "learning_rate": 9.249155429382542e-05, + "loss": 0.02206961214542389, + "step": 52920 + }, + { + "epoch": 7.513129879347055, + "grad_norm": 4.592076301574707, + "learning_rate": 9.249013484740951e-05, + "loss": 0.068220454454422, + "step": 52930 + }, + { + "epoch": 7.514549325762952, + "grad_norm": 5.431219577789307, + "learning_rate": 9.248871540099363e-05, + "loss": 0.03009980320930481, + "step": 52940 + }, + { + "epoch": 7.515968772178851, + "grad_norm": 0.23516087234020233, + "learning_rate": 9.248729595457772e-05, + "loss": 0.05364044308662415, + "step": 52950 + }, + { + "epoch": 7.517388218594748, + "grad_norm": 0.16321633756160736, + "learning_rate": 9.248587650816182e-05, + "loss": 0.01695691645145416, + "step": 52960 + }, + { + "epoch": 7.518807665010646, + "grad_norm": 7.968390941619873, + "learning_rate": 9.248445706174592e-05, + "loss": 0.046441465616226196, + "step": 52970 + }, + { + "epoch": 7.520227111426544, + "grad_norm": 0.1707213670015335, + "learning_rate": 
9.248303761533003e-05, + "loss": 0.022419868409633635, + "step": 52980 + }, + { + "epoch": 7.521646557842441, + "grad_norm": 0.9471738338470459, + "learning_rate": 9.248161816891413e-05, + "loss": 0.04635497331619263, + "step": 52990 + }, + { + "epoch": 7.5230660042583395, + "grad_norm": 5.733717918395996, + "learning_rate": 9.248019872249824e-05, + "loss": 0.05494365692138672, + "step": 53000 + }, + { + "epoch": 7.5230660042583395, + "eval_accuracy": 0.979779996184905, + "eval_loss": 0.0653390884399414, + "eval_runtime": 33.676, + "eval_samples_per_second": 467.009, + "eval_steps_per_second": 14.61, + "step": 53000 + }, + { + "epoch": 7.524485450674237, + "grad_norm": 1.4633334875106812, + "learning_rate": 9.247877927608233e-05, + "loss": 0.04977775514125824, + "step": 53010 + }, + { + "epoch": 7.525904897090135, + "grad_norm": 6.960347652435303, + "learning_rate": 9.247735982966643e-05, + "loss": 0.057191604375839235, + "step": 53020 + }, + { + "epoch": 7.527324343506033, + "grad_norm": 12.815581321716309, + "learning_rate": 9.247594038325054e-05, + "loss": 0.08237308859825135, + "step": 53030 + }, + { + "epoch": 7.528743789921931, + "grad_norm": 0.03340001776814461, + "learning_rate": 9.247452093683464e-05, + "loss": 0.024731306731700896, + "step": 53040 + }, + { + "epoch": 7.530163236337828, + "grad_norm": 8.023792266845703, + "learning_rate": 9.247310149041875e-05, + "loss": 0.024743181467056275, + "step": 53050 + }, + { + "epoch": 7.531582682753726, + "grad_norm": 0.13054397702217102, + "learning_rate": 9.247168204400283e-05, + "loss": 0.07355377674102784, + "step": 53060 + }, + { + "epoch": 7.533002129169624, + "grad_norm": 2.0724384784698486, + "learning_rate": 9.247026259758695e-05, + "loss": 0.036674332618713376, + "step": 53070 + }, + { + "epoch": 7.5344215755855215, + "grad_norm": 6.162533760070801, + "learning_rate": 9.246884315117104e-05, + "loss": 0.09419107437133789, + "step": 53080 + }, + { + "epoch": 7.53584102200142, + "grad_norm": 
6.801360607147217, + "learning_rate": 9.246742370475515e-05, + "loss": 0.046077826619148256, + "step": 53090 + }, + { + "epoch": 7.537260468417317, + "grad_norm": 2.5614705085754395, + "learning_rate": 9.246600425833925e-05, + "loss": 0.029644250869750977, + "step": 53100 + }, + { + "epoch": 7.5386799148332155, + "grad_norm": 5.066242694854736, + "learning_rate": 9.246458481192335e-05, + "loss": 0.0544149100780487, + "step": 53110 + }, + { + "epoch": 7.540099361249113, + "grad_norm": 0.11999693512916565, + "learning_rate": 9.246316536550746e-05, + "loss": 0.040882185101509094, + "step": 53120 + }, + { + "epoch": 7.54151880766501, + "grad_norm": 0.4337463080883026, + "learning_rate": 9.246174591909156e-05, + "loss": 0.04220533668994904, + "step": 53130 + }, + { + "epoch": 7.542938254080909, + "grad_norm": 6.067582130432129, + "learning_rate": 9.246032647267567e-05, + "loss": 0.05769921541213989, + "step": 53140 + }, + { + "epoch": 7.544357700496806, + "grad_norm": 4.153407573699951, + "learning_rate": 9.245890702625977e-05, + "loss": 0.033078896999359134, + "step": 53150 + }, + { + "epoch": 7.545777146912704, + "grad_norm": 3.1943609714508057, + "learning_rate": 9.245748757984386e-05, + "loss": 0.04769000113010406, + "step": 53160 + }, + { + "epoch": 7.547196593328602, + "grad_norm": 1.6068007946014404, + "learning_rate": 9.245606813342796e-05, + "loss": 0.024349580705165862, + "step": 53170 + }, + { + "epoch": 7.5486160397445, + "grad_norm": 0.45679542422294617, + "learning_rate": 9.245464868701207e-05, + "loss": 0.03165770173072815, + "step": 53180 + }, + { + "epoch": 7.5500354861603975, + "grad_norm": 4.295965671539307, + "learning_rate": 9.245322924059617e-05, + "loss": 0.05426824688911438, + "step": 53190 + }, + { + "epoch": 7.551454932576295, + "grad_norm": 0.14807292819023132, + "learning_rate": 9.245180979418028e-05, + "loss": 0.050548434257507324, + "step": 53200 + }, + { + "epoch": 7.552874378992193, + "grad_norm": 0.31772199273109436, + "learning_rate": 
9.245039034776438e-05, + "loss": 0.050780308246612546, + "step": 53210 + }, + { + "epoch": 7.554293825408091, + "grad_norm": 4.849133491516113, + "learning_rate": 9.244897090134847e-05, + "loss": 0.02840524911880493, + "step": 53220 + }, + { + "epoch": 7.555713271823989, + "grad_norm": 0.765864908695221, + "learning_rate": 9.244755145493259e-05, + "loss": 0.05476508140563965, + "step": 53230 + }, + { + "epoch": 7.557132718239886, + "grad_norm": 0.023372527211904526, + "learning_rate": 9.244613200851668e-05, + "loss": 0.053467082977294925, + "step": 53240 + }, + { + "epoch": 7.558552164655785, + "grad_norm": 0.3663049638271332, + "learning_rate": 9.24447125621008e-05, + "loss": 0.018398307263851166, + "step": 53250 + }, + { + "epoch": 7.559971611071682, + "grad_norm": 0.11951316893100739, + "learning_rate": 9.244329311568489e-05, + "loss": 0.07248743176460266, + "step": 53260 + }, + { + "epoch": 7.56139105748758, + "grad_norm": 0.1535031497478485, + "learning_rate": 9.244187366926899e-05, + "loss": 0.01143306791782379, + "step": 53270 + }, + { + "epoch": 7.562810503903478, + "grad_norm": 0.5286732316017151, + "learning_rate": 9.244045422285309e-05, + "loss": 0.00730847492814064, + "step": 53280 + }, + { + "epoch": 7.564229950319375, + "grad_norm": 1.2395968437194824, + "learning_rate": 9.24390347764372e-05, + "loss": 0.02569035589694977, + "step": 53290 + }, + { + "epoch": 7.565649396735274, + "grad_norm": 4.477364540100098, + "learning_rate": 9.24376153300213e-05, + "loss": 0.06130185127258301, + "step": 53300 + }, + { + "epoch": 7.567068843151171, + "grad_norm": 0.203590527176857, + "learning_rate": 9.24361958836054e-05, + "loss": 0.05250626802444458, + "step": 53310 + }, + { + "epoch": 7.568488289567069, + "grad_norm": 5.73386812210083, + "learning_rate": 9.24347764371895e-05, + "loss": 0.019776782393455504, + "step": 53320 + }, + { + "epoch": 7.569907735982967, + "grad_norm": 0.7581711411476135, + "learning_rate": 9.24333569907736e-05, + "loss": 
0.006641269475221634, + "step": 53330 + }, + { + "epoch": 7.571327182398864, + "grad_norm": 3.9233145713806152, + "learning_rate": 9.243193754435771e-05, + "loss": 0.05464006662368774, + "step": 53340 + }, + { + "epoch": 7.5727466288147625, + "grad_norm": 0.6209933757781982, + "learning_rate": 9.243051809794181e-05, + "loss": 0.04515405893325806, + "step": 53350 + }, + { + "epoch": 7.57416607523066, + "grad_norm": 0.1766492873430252, + "learning_rate": 9.242909865152592e-05, + "loss": 0.022017842531204222, + "step": 53360 + }, + { + "epoch": 7.575585521646558, + "grad_norm": 1.5515861511230469, + "learning_rate": 9.242767920511e-05, + "loss": 0.03363422155380249, + "step": 53370 + }, + { + "epoch": 7.577004968062456, + "grad_norm": 0.28514936566352844, + "learning_rate": 9.242625975869411e-05, + "loss": 0.027630746364593506, + "step": 53380 + }, + { + "epoch": 7.578424414478354, + "grad_norm": 0.16114062070846558, + "learning_rate": 9.242484031227821e-05, + "loss": 0.0107742041349411, + "step": 53390 + }, + { + "epoch": 7.579843860894251, + "grad_norm": 5.650407791137695, + "learning_rate": 9.242342086586232e-05, + "loss": 0.05854092836380005, + "step": 53400 + }, + { + "epoch": 7.581263307310149, + "grad_norm": 1.162428855895996, + "learning_rate": 9.242200141944642e-05, + "loss": 0.07622578144073486, + "step": 53410 + }, + { + "epoch": 7.582682753726047, + "grad_norm": 0.809592604637146, + "learning_rate": 9.242058197303052e-05, + "loss": 0.03163085877895355, + "step": 53420 + }, + { + "epoch": 7.5841022001419445, + "grad_norm": 9.978492736816406, + "learning_rate": 9.241916252661463e-05, + "loss": 0.026983675360679627, + "step": 53430 + }, + { + "epoch": 7.585521646557843, + "grad_norm": 2.833834171295166, + "learning_rate": 9.241774308019872e-05, + "loss": 0.04985363781452179, + "step": 53440 + }, + { + "epoch": 7.58694109297374, + "grad_norm": 4.750146389007568, + "learning_rate": 9.241632363378284e-05, + "loss": 0.07900604009628295, + "step": 53450 + }, + { + 
"epoch": 7.5883605393896385, + "grad_norm": 5.776832103729248, + "learning_rate": 9.241490418736693e-05, + "loss": 0.08184942603111267, + "step": 53460 + }, + { + "epoch": 7.589779985805536, + "grad_norm": 0.146357461810112, + "learning_rate": 9.241348474095103e-05, + "loss": 0.04078640043735504, + "step": 53470 + }, + { + "epoch": 7.591199432221433, + "grad_norm": 2.9139442443847656, + "learning_rate": 9.241206529453513e-05, + "loss": 0.035071760416030884, + "step": 53480 + }, + { + "epoch": 7.592618878637332, + "grad_norm": 9.05538272857666, + "learning_rate": 9.241064584811924e-05, + "loss": 0.059223884344100954, + "step": 53490 + }, + { + "epoch": 7.594038325053229, + "grad_norm": 4.537329196929932, + "learning_rate": 9.240922640170334e-05, + "loss": 0.09483524560928344, + "step": 53500 + }, + { + "epoch": 7.594038325053229, + "eval_accuracy": 0.9792077319259872, + "eval_loss": 0.07600707560777664, + "eval_runtime": 33.1148, + "eval_samples_per_second": 474.923, + "eval_steps_per_second": 14.857, + "step": 53500 + }, + { + "epoch": 7.595457771469127, + "grad_norm": 16.435022354125977, + "learning_rate": 9.240780695528745e-05, + "loss": 0.059524184465408324, + "step": 53510 + }, + { + "epoch": 7.596877217885025, + "grad_norm": 6.466281414031982, + "learning_rate": 9.240638750887154e-05, + "loss": 0.06247789859771728, + "step": 53520 + }, + { + "epoch": 7.598296664300923, + "grad_norm": 2.0099196434020996, + "learning_rate": 9.240496806245564e-05, + "loss": 0.03750507235527038, + "step": 53530 + }, + { + "epoch": 7.5997161107168205, + "grad_norm": 0.4146358072757721, + "learning_rate": 9.240354861603975e-05, + "loss": 0.03174733221530914, + "step": 53540 + }, + { + "epoch": 7.601135557132718, + "grad_norm": 0.6508600115776062, + "learning_rate": 9.240212916962385e-05, + "loss": 0.015483300387859344, + "step": 53550 + }, + { + "epoch": 7.602555003548616, + "grad_norm": 0.19503478705883026, + "learning_rate": 9.240070972320796e-05, + "loss": 0.008923622220754624, + 
"step": 53560 + }, + { + "epoch": 7.603974449964514, + "grad_norm": 0.29882973432540894, + "learning_rate": 9.239929027679206e-05, + "loss": 0.02937857210636139, + "step": 53570 + }, + { + "epoch": 7.605393896380412, + "grad_norm": 5.896130084991455, + "learning_rate": 9.239787083037616e-05, + "loss": 0.05464982390403748, + "step": 53580 + }, + { + "epoch": 7.606813342796309, + "grad_norm": 0.8281605839729309, + "learning_rate": 9.239645138396025e-05, + "loss": 0.02264205664396286, + "step": 53590 + }, + { + "epoch": 7.608232789212208, + "grad_norm": 5.777096271514893, + "learning_rate": 9.239503193754436e-05, + "loss": 0.03788665533065796, + "step": 53600 + }, + { + "epoch": 7.609652235628105, + "grad_norm": 1.0653955936431885, + "learning_rate": 9.239361249112846e-05, + "loss": 0.03864677846431732, + "step": 53610 + }, + { + "epoch": 7.6110716820440025, + "grad_norm": 2.2212467193603516, + "learning_rate": 9.239219304471257e-05, + "loss": 0.06111682653427124, + "step": 53620 + }, + { + "epoch": 7.612491128459901, + "grad_norm": 8.857563018798828, + "learning_rate": 9.239077359829667e-05, + "loss": 0.0751349151134491, + "step": 53630 + }, + { + "epoch": 7.613910574875798, + "grad_norm": 0.5623897314071655, + "learning_rate": 9.238935415188077e-05, + "loss": 0.04178241789340973, + "step": 53640 + }, + { + "epoch": 7.615330021291697, + "grad_norm": 13.483894348144531, + "learning_rate": 9.238793470546488e-05, + "loss": 0.10559332370758057, + "step": 53650 + }, + { + "epoch": 7.616749467707594, + "grad_norm": 1.6335182189941406, + "learning_rate": 9.238651525904898e-05, + "loss": 0.05756605863571167, + "step": 53660 + }, + { + "epoch": 7.618168914123492, + "grad_norm": 0.2131602019071579, + "learning_rate": 9.238509581263309e-05, + "loss": 0.019205693900585175, + "step": 53670 + }, + { + "epoch": 7.61958836053939, + "grad_norm": 8.674543380737305, + "learning_rate": 9.238381831085877e-05, + "loss": 0.08388531804084778, + "step": 53680 + }, + { + "epoch": 
7.621007806955287, + "grad_norm": 0.8919023275375366, + "learning_rate": 9.238239886444288e-05, + "loss": 0.04539136588573456, + "step": 53690 + }, + { + "epoch": 7.622427253371185, + "grad_norm": 0.10621856898069382, + "learning_rate": 9.238097941802697e-05, + "loss": 0.034367746114730834, + "step": 53700 + }, + { + "epoch": 7.623846699787083, + "grad_norm": 1.795030117034912, + "learning_rate": 9.237955997161108e-05, + "loss": 0.02624286413192749, + "step": 53710 + }, + { + "epoch": 7.625266146202981, + "grad_norm": 6.546425819396973, + "learning_rate": 9.237814052519517e-05, + "loss": 0.06192071437835693, + "step": 53720 + }, + { + "epoch": 7.626685592618879, + "grad_norm": 1.9943439960479736, + "learning_rate": 9.237672107877929e-05, + "loss": 0.03261047303676605, + "step": 53730 + }, + { + "epoch": 7.628105039034777, + "grad_norm": 0.13797317445278168, + "learning_rate": 9.237530163236338e-05, + "loss": 0.09458110928535461, + "step": 53740 + }, + { + "epoch": 7.629524485450674, + "grad_norm": 0.1786557137966156, + "learning_rate": 9.237388218594748e-05, + "loss": 0.04373227059841156, + "step": 53750 + }, + { + "epoch": 7.630943931866572, + "grad_norm": 0.11414031684398651, + "learning_rate": 9.237246273953159e-05, + "loss": 0.04276902675628662, + "step": 53760 + }, + { + "epoch": 7.63236337828247, + "grad_norm": 0.09007790684700012, + "learning_rate": 9.237104329311569e-05, + "loss": 0.026761719584465028, + "step": 53770 + }, + { + "epoch": 7.633782824698367, + "grad_norm": 5.950411319732666, + "learning_rate": 9.23696238466998e-05, + "loss": 0.022057650983333586, + "step": 53780 + }, + { + "epoch": 7.635202271114266, + "grad_norm": 0.6405834555625916, + "learning_rate": 9.23682044002839e-05, + "loss": 0.03160939812660217, + "step": 53790 + }, + { + "epoch": 7.636621717530163, + "grad_norm": 8.794205665588379, + "learning_rate": 9.2366784953868e-05, + "loss": 0.04112916588783264, + "step": 53800 + }, + { + "epoch": 7.6380411639460615, + "grad_norm": 
0.031496480107307434, + "learning_rate": 9.236536550745209e-05, + "loss": 0.017792116105556487, + "step": 53810 + }, + { + "epoch": 7.639460610361959, + "grad_norm": 3.8262858390808105, + "learning_rate": 9.23639460610362e-05, + "loss": 0.01980331391096115, + "step": 53820 + }, + { + "epoch": 7.640880056777856, + "grad_norm": 0.5895381569862366, + "learning_rate": 9.23625266146203e-05, + "loss": 0.02830488085746765, + "step": 53830 + }, + { + "epoch": 7.642299503193755, + "grad_norm": 2.3932108879089355, + "learning_rate": 9.236110716820441e-05, + "loss": 0.01563961207866669, + "step": 53840 + }, + { + "epoch": 7.643718949609652, + "grad_norm": 0.7757654190063477, + "learning_rate": 9.235968772178851e-05, + "loss": 0.01142323911190033, + "step": 53850 + }, + { + "epoch": 7.64513839602555, + "grad_norm": 1.4721145629882812, + "learning_rate": 9.23582682753726e-05, + "loss": 0.017298223078250886, + "step": 53860 + }, + { + "epoch": 7.646557842441448, + "grad_norm": 14.840720176696777, + "learning_rate": 9.235684882895672e-05, + "loss": 0.09527291059494018, + "step": 53870 + }, + { + "epoch": 7.647977288857346, + "grad_norm": 0.0690033808350563, + "learning_rate": 9.235542938254081e-05, + "loss": 0.05147637128829956, + "step": 53880 + }, + { + "epoch": 7.6493967352732435, + "grad_norm": 17.268529891967773, + "learning_rate": 9.235400993612492e-05, + "loss": 0.08880094289779664, + "step": 53890 + }, + { + "epoch": 7.650816181689141, + "grad_norm": 0.3241688013076782, + "learning_rate": 9.235259048970902e-05, + "loss": 0.024964214861392976, + "step": 53900 + }, + { + "epoch": 7.652235628105039, + "grad_norm": 0.484392911195755, + "learning_rate": 9.235117104329312e-05, + "loss": 0.009301058948040009, + "step": 53910 + }, + { + "epoch": 7.653655074520937, + "grad_norm": 1.3884038925170898, + "learning_rate": 9.234975159687722e-05, + "loss": 0.05612800121307373, + "step": 53920 + }, + { + "epoch": 7.655074520936835, + "grad_norm": 6.86073637008667, + "learning_rate": 
9.234833215046133e-05, + "loss": 0.02074751555919647, + "step": 53930 + }, + { + "epoch": 7.656493967352732, + "grad_norm": 3.2943639755249023, + "learning_rate": 9.234691270404543e-05, + "loss": 0.07102017402648926, + "step": 53940 + }, + { + "epoch": 7.657913413768631, + "grad_norm": 4.378195285797119, + "learning_rate": 9.234549325762954e-05, + "loss": 0.03647010326385498, + "step": 53950 + }, + { + "epoch": 7.659332860184528, + "grad_norm": 1.0259917974472046, + "learning_rate": 9.234407381121363e-05, + "loss": 0.037917932868003844, + "step": 53960 + }, + { + "epoch": 7.6607523066004255, + "grad_norm": 0.6418030261993408, + "learning_rate": 9.234265436479773e-05, + "loss": 0.047413745522499086, + "step": 53970 + }, + { + "epoch": 7.662171753016324, + "grad_norm": 6.070641040802002, + "learning_rate": 9.234123491838184e-05, + "loss": 0.06167091727256775, + "step": 53980 + }, + { + "epoch": 7.663591199432221, + "grad_norm": 6.273818016052246, + "learning_rate": 9.233981547196594e-05, + "loss": 0.07756027579307556, + "step": 53990 + }, + { + "epoch": 7.6650106458481195, + "grad_norm": 6.54415225982666, + "learning_rate": 9.233839602555005e-05, + "loss": 0.03750898838043213, + "step": 54000 + }, + { + "epoch": 7.6650106458481195, + "eval_accuracy": 0.972849240160234, + "eval_loss": 0.09029248356819153, + "eval_runtime": 32.2102, + "eval_samples_per_second": 488.262, + "eval_steps_per_second": 15.275, + "step": 54000 + }, + { + "epoch": 7.666430092264017, + "grad_norm": 8.113903045654297, + "learning_rate": 9.233697657913413e-05, + "loss": 0.06530644297599793, + "step": 54010 + }, + { + "epoch": 7.667849538679915, + "grad_norm": 4.259474754333496, + "learning_rate": 9.233555713271824e-05, + "loss": 0.023596912622451782, + "step": 54020 + }, + { + "epoch": 7.669268985095813, + "grad_norm": 0.5020537972450256, + "learning_rate": 9.233413768630234e-05, + "loss": 0.01666277050971985, + "step": 54030 + }, + { + "epoch": 7.67068843151171, + "grad_norm": 
0.9523594975471497, + "learning_rate": 9.233271823988645e-05, + "loss": 0.016902516782283782, + "step": 54040 + }, + { + "epoch": 7.672107877927608, + "grad_norm": 4.621238708496094, + "learning_rate": 9.233129879347055e-05, + "loss": 0.05011897087097168, + "step": 54050 + }, + { + "epoch": 7.673527324343506, + "grad_norm": 0.058928538113832474, + "learning_rate": 9.232987934705465e-05, + "loss": 0.05659050941467285, + "step": 54060 + }, + { + "epoch": 7.674946770759404, + "grad_norm": 4.141076564788818, + "learning_rate": 9.232845990063876e-05, + "loss": 0.05621238946914673, + "step": 54070 + }, + { + "epoch": 7.6763662171753015, + "grad_norm": 0.6577054858207703, + "learning_rate": 9.232704045422286e-05, + "loss": 0.030709424614906312, + "step": 54080 + }, + { + "epoch": 7.6777856635912, + "grad_norm": 7.629159927368164, + "learning_rate": 9.232562100780697e-05, + "loss": 0.05306870341300964, + "step": 54090 + }, + { + "epoch": 7.679205110007097, + "grad_norm": 2.1149821281433105, + "learning_rate": 9.232420156139106e-05, + "loss": 0.05833685994148254, + "step": 54100 + }, + { + "epoch": 7.680624556422995, + "grad_norm": 0.2776797115802765, + "learning_rate": 9.232278211497516e-05, + "loss": 0.04917903542518616, + "step": 54110 + }, + { + "epoch": 7.682044002838893, + "grad_norm": 2.9681408405303955, + "learning_rate": 9.232136266855926e-05, + "loss": 0.014532405138015746, + "step": 54120 + }, + { + "epoch": 7.68346344925479, + "grad_norm": 8.385042190551758, + "learning_rate": 9.231994322214337e-05, + "loss": 0.10322569608688355, + "step": 54130 + }, + { + "epoch": 7.684882895670689, + "grad_norm": 0.19604772329330444, + "learning_rate": 9.231852377572747e-05, + "loss": 0.026059174537658693, + "step": 54140 + }, + { + "epoch": 7.686302342086586, + "grad_norm": 1.7579096555709839, + "learning_rate": 9.231710432931158e-05, + "loss": 0.05938120484352112, + "step": 54150 + }, + { + "epoch": 7.687721788502484, + "grad_norm": 0.03762279078364372, + "learning_rate": 
9.231568488289568e-05, + "loss": 0.10924702882766724, + "step": 54160 + }, + { + "epoch": 7.689141234918382, + "grad_norm": 0.2558661103248596, + "learning_rate": 9.231426543647977e-05, + "loss": 0.09950585961341858, + "step": 54170 + }, + { + "epoch": 7.690560681334279, + "grad_norm": 1.1616108417510986, + "learning_rate": 9.231284599006388e-05, + "loss": 0.0400057464838028, + "step": 54180 + }, + { + "epoch": 7.691980127750178, + "grad_norm": 4.835945129394531, + "learning_rate": 9.231142654364798e-05, + "loss": 0.07117159366607666, + "step": 54190 + }, + { + "epoch": 7.693399574166075, + "grad_norm": 5.930656909942627, + "learning_rate": 9.231000709723209e-05, + "loss": 0.09944562911987305, + "step": 54200 + }, + { + "epoch": 7.694819020581973, + "grad_norm": 8.935443878173828, + "learning_rate": 9.230858765081618e-05, + "loss": 0.06826504468917846, + "step": 54210 + }, + { + "epoch": 7.696238466997871, + "grad_norm": 9.199212074279785, + "learning_rate": 9.230716820440029e-05, + "loss": 0.04564814865589142, + "step": 54220 + }, + { + "epoch": 7.697657913413769, + "grad_norm": 0.19474904239177704, + "learning_rate": 9.230574875798438e-05, + "loss": 0.01926818788051605, + "step": 54230 + }, + { + "epoch": 7.6990773598296665, + "grad_norm": 5.388856410980225, + "learning_rate": 9.23043293115685e-05, + "loss": 0.020480193197727203, + "step": 54240 + }, + { + "epoch": 7.700496806245564, + "grad_norm": 0.6576113104820251, + "learning_rate": 9.230290986515259e-05, + "loss": 0.022709192335605623, + "step": 54250 + }, + { + "epoch": 7.701916252661462, + "grad_norm": 6.927825927734375, + "learning_rate": 9.23014904187367e-05, + "loss": 0.11104840040206909, + "step": 54260 + }, + { + "epoch": 7.70333569907736, + "grad_norm": 2.824536085128784, + "learning_rate": 9.23000709723208e-05, + "loss": 0.06214058995246887, + "step": 54270 + }, + { + "epoch": 7.704755145493258, + "grad_norm": 1.143356204032898, + "learning_rate": 9.22986515259049e-05, + "loss": 
0.030827879905700684, + "step": 54280 + }, + { + "epoch": 7.706174591909155, + "grad_norm": 0.2769818902015686, + "learning_rate": 9.229723207948901e-05, + "loss": 0.03429543673992157, + "step": 54290 + }, + { + "epoch": 7.707594038325054, + "grad_norm": 1.1894688606262207, + "learning_rate": 9.229581263307311e-05, + "loss": 0.04327844679355621, + "step": 54300 + }, + { + "epoch": 7.709013484740951, + "grad_norm": 1.5082286596298218, + "learning_rate": 9.229439318665722e-05, + "loss": 0.06623492836952209, + "step": 54310 + }, + { + "epoch": 7.7104329311568485, + "grad_norm": 2.6335270404815674, + "learning_rate": 9.22929737402413e-05, + "loss": 0.02056298851966858, + "step": 54320 + }, + { + "epoch": 7.711852377572747, + "grad_norm": 0.04475034773349762, + "learning_rate": 9.229155429382541e-05, + "loss": 0.0460908055305481, + "step": 54330 + }, + { + "epoch": 7.713271823988644, + "grad_norm": 0.06595787405967712, + "learning_rate": 9.229013484740951e-05, + "loss": 0.04475194215774536, + "step": 54340 + }, + { + "epoch": 7.7146912704045425, + "grad_norm": 10.94398307800293, + "learning_rate": 9.228871540099362e-05, + "loss": 0.07244617938995361, + "step": 54350 + }, + { + "epoch": 7.71611071682044, + "grad_norm": 3.780848503112793, + "learning_rate": 9.228729595457772e-05, + "loss": 0.0663948893547058, + "step": 54360 + }, + { + "epoch": 7.717530163236338, + "grad_norm": 4.40339994430542, + "learning_rate": 9.228587650816182e-05, + "loss": 0.05741128921508789, + "step": 54370 + }, + { + "epoch": 7.718949609652236, + "grad_norm": 0.6055914759635925, + "learning_rate": 9.228445706174593e-05, + "loss": 0.06476907134056091, + "step": 54380 + }, + { + "epoch": 7.720369056068133, + "grad_norm": 0.7589472532272339, + "learning_rate": 9.228303761533002e-05, + "loss": 0.04766846895217895, + "step": 54390 + }, + { + "epoch": 7.721788502484031, + "grad_norm": 3.3787660598754883, + "learning_rate": 9.228161816891413e-05, + "loss": 0.034449401497840884, + "step": 54400 + }, + { 
+ "epoch": 7.723207948899929, + "grad_norm": 1.4176545143127441, + "learning_rate": 9.228019872249823e-05, + "loss": 0.01606842428445816, + "step": 54410 + }, + { + "epoch": 7.724627395315827, + "grad_norm": 1.9891983270645142, + "learning_rate": 9.227877927608233e-05, + "loss": 0.04497620463371277, + "step": 54420 + }, + { + "epoch": 7.7260468417317245, + "grad_norm": 7.962987899780273, + "learning_rate": 9.227735982966643e-05, + "loss": 0.11056967973709106, + "step": 54430 + }, + { + "epoch": 7.727466288147623, + "grad_norm": 0.19446644186973572, + "learning_rate": 9.227594038325054e-05, + "loss": 0.05493360161781311, + "step": 54440 + }, + { + "epoch": 7.72888573456352, + "grad_norm": 0.07538071274757385, + "learning_rate": 9.227452093683464e-05, + "loss": 0.021178624033927916, + "step": 54450 + }, + { + "epoch": 7.730305180979418, + "grad_norm": 2.5883734226226807, + "learning_rate": 9.227310149041875e-05, + "loss": 0.07062762975692749, + "step": 54460 + }, + { + "epoch": 7.731724627395316, + "grad_norm": 9.73288631439209, + "learning_rate": 9.227168204400284e-05, + "loss": 0.03467971682548523, + "step": 54470 + }, + { + "epoch": 7.733144073811213, + "grad_norm": 4.348404407501221, + "learning_rate": 9.227026259758694e-05, + "loss": 0.07101811170578003, + "step": 54480 + }, + { + "epoch": 7.734563520227112, + "grad_norm": 7.970381259918213, + "learning_rate": 9.226884315117105e-05, + "loss": 0.09218829870223999, + "step": 54490 + }, + { + "epoch": 7.735982966643009, + "grad_norm": 0.49827030301094055, + "learning_rate": 9.226742370475515e-05, + "loss": 0.02308831512928009, + "step": 54500 + }, + { + "epoch": 7.735982966643009, + "eval_accuracy": 0.9711960323011382, + "eval_loss": 0.09986808896064758, + "eval_runtime": 32.8864, + "eval_samples_per_second": 478.222, + "eval_steps_per_second": 14.961, + "step": 54500 + }, + { + "epoch": 7.737402413058907, + "grad_norm": 4.274623870849609, + "learning_rate": 9.226600425833926e-05, + "loss": 0.04567549228668213, + 
"step": 54510 + }, + { + "epoch": 7.738821859474805, + "grad_norm": 0.24907518923282623, + "learning_rate": 9.226458481192334e-05, + "loss": 0.08113893866539001, + "step": 54520 + }, + { + "epoch": 7.740241305890702, + "grad_norm": 0.5531359314918518, + "learning_rate": 9.226316536550745e-05, + "loss": 0.048743787407875064, + "step": 54530 + }, + { + "epoch": 7.741660752306601, + "grad_norm": 0.20647698640823364, + "learning_rate": 9.226174591909155e-05, + "loss": 0.05019644498825073, + "step": 54540 + }, + { + "epoch": 7.743080198722498, + "grad_norm": 0.23882421851158142, + "learning_rate": 9.226032647267566e-05, + "loss": 0.04471515417098999, + "step": 54550 + }, + { + "epoch": 7.744499645138396, + "grad_norm": 1.7147654294967651, + "learning_rate": 9.225890702625976e-05, + "loss": 0.04694445133209228, + "step": 54560 + }, + { + "epoch": 7.745919091554294, + "grad_norm": 3.365558624267578, + "learning_rate": 9.225748757984386e-05, + "loss": 0.02688348889350891, + "step": 54570 + }, + { + "epoch": 7.747338537970192, + "grad_norm": 0.3501296937465668, + "learning_rate": 9.225606813342797e-05, + "loss": 0.03100045919418335, + "step": 54580 + }, + { + "epoch": 7.748757984386089, + "grad_norm": 8.124733924865723, + "learning_rate": 9.225464868701207e-05, + "loss": 0.04513312876224518, + "step": 54590 + }, + { + "epoch": 7.750177430801987, + "grad_norm": 0.1646394431591034, + "learning_rate": 9.225322924059618e-05, + "loss": 0.01222996562719345, + "step": 54600 + }, + { + "epoch": 7.751596877217885, + "grad_norm": 7.725193500518799, + "learning_rate": 9.225180979418027e-05, + "loss": 0.023356731235980987, + "step": 54610 + }, + { + "epoch": 7.753016323633783, + "grad_norm": 0.9481512904167175, + "learning_rate": 9.225039034776439e-05, + "loss": 0.013700984418392181, + "step": 54620 + }, + { + "epoch": 7.754435770049681, + "grad_norm": 0.10453180968761444, + "learning_rate": 9.224897090134847e-05, + "loss": 0.058964455127716066, + "step": 54630 + }, + { + "epoch": 
7.755855216465578, + "grad_norm": 0.024097450077533722, + "learning_rate": 9.224755145493258e-05, + "loss": 0.014117154479026794, + "step": 54640 + }, + { + "epoch": 7.757274662881477, + "grad_norm": 5.321264266967773, + "learning_rate": 9.224613200851668e-05, + "loss": 0.03250417113304138, + "step": 54650 + }, + { + "epoch": 7.758694109297374, + "grad_norm": 0.7793516516685486, + "learning_rate": 9.224471256210079e-05, + "loss": 0.007282558083534241, + "step": 54660 + }, + { + "epoch": 7.760113555713271, + "grad_norm": 5.44902229309082, + "learning_rate": 9.224329311568489e-05, + "loss": 0.019059914350509643, + "step": 54670 + }, + { + "epoch": 7.76153300212917, + "grad_norm": 0.17302462458610535, + "learning_rate": 9.224187366926898e-05, + "loss": 0.013754206895828246, + "step": 54680 + }, + { + "epoch": 7.762952448545067, + "grad_norm": 3.7517333030700684, + "learning_rate": 9.22404542228531e-05, + "loss": 0.07426886558532715, + "step": 54690 + }, + { + "epoch": 7.7643718949609655, + "grad_norm": 1.1429849863052368, + "learning_rate": 9.223903477643719e-05, + "loss": 0.04882776141166687, + "step": 54700 + }, + { + "epoch": 7.765791341376863, + "grad_norm": 0.07773357629776001, + "learning_rate": 9.22376153300213e-05, + "loss": 0.08146860003471375, + "step": 54710 + }, + { + "epoch": 7.767210787792761, + "grad_norm": 0.2552967667579651, + "learning_rate": 9.22361958836054e-05, + "loss": 0.034405875205993655, + "step": 54720 + }, + { + "epoch": 7.768630234208659, + "grad_norm": 1.1035807132720947, + "learning_rate": 9.22347764371895e-05, + "loss": 0.035977023839950564, + "step": 54730 + }, + { + "epoch": 7.770049680624556, + "grad_norm": 8.450946807861328, + "learning_rate": 9.22333569907736e-05, + "loss": 0.034113773703575136, + "step": 54740 + }, + { + "epoch": 7.771469127040454, + "grad_norm": 4.452673435211182, + "learning_rate": 9.22319375443577e-05, + "loss": 0.06801862716674804, + "step": 54750 + }, + { + "epoch": 7.772888573456352, + "grad_norm": 
4.749868869781494, + "learning_rate": 9.22305180979418e-05, + "loss": 0.033065930008888245, + "step": 54760 + }, + { + "epoch": 7.77430801987225, + "grad_norm": 0.5759822726249695, + "learning_rate": 9.222909865152591e-05, + "loss": 0.01797463297843933, + "step": 54770 + }, + { + "epoch": 7.7757274662881475, + "grad_norm": 2.1026320457458496, + "learning_rate": 9.222767920511001e-05, + "loss": 0.061784428358078, + "step": 54780 + }, + { + "epoch": 7.777146912704046, + "grad_norm": 0.6788957118988037, + "learning_rate": 9.222625975869411e-05, + "loss": 0.008597303926944733, + "step": 54790 + }, + { + "epoch": 7.778566359119943, + "grad_norm": 4.156731128692627, + "learning_rate": 9.222484031227822e-05, + "loss": 0.03035602569580078, + "step": 54800 + }, + { + "epoch": 7.779985805535841, + "grad_norm": 0.14249205589294434, + "learning_rate": 9.222342086586232e-05, + "loss": 0.012423336505889893, + "step": 54810 + }, + { + "epoch": 7.781405251951739, + "grad_norm": 0.955420970916748, + "learning_rate": 9.222200141944643e-05, + "loss": 0.06543527245521545, + "step": 54820 + }, + { + "epoch": 7.782824698367636, + "grad_norm": 0.056475285440683365, + "learning_rate": 9.222058197303051e-05, + "loss": 0.02058243304491043, + "step": 54830 + }, + { + "epoch": 7.784244144783535, + "grad_norm": 0.41992461681365967, + "learning_rate": 9.221916252661462e-05, + "loss": 0.0476148247718811, + "step": 54840 + }, + { + "epoch": 7.785663591199432, + "grad_norm": 8.196581840515137, + "learning_rate": 9.221774308019872e-05, + "loss": 0.036193230748176576, + "step": 54850 + }, + { + "epoch": 7.78708303761533, + "grad_norm": 0.038563072681427, + "learning_rate": 9.221632363378283e-05, + "loss": 0.02522934377193451, + "step": 54860 + }, + { + "epoch": 7.788502484031228, + "grad_norm": 6.226294040679932, + "learning_rate": 9.221490418736693e-05, + "loss": 0.05935906767845154, + "step": 54870 + }, + { + "epoch": 7.789921930447125, + "grad_norm": 4.046054363250732, + "learning_rate": 
9.221348474095103e-05, + "loss": 0.06731109023094177, + "step": 54880 + }, + { + "epoch": 7.7913413768630235, + "grad_norm": 8.530564308166504, + "learning_rate": 9.221206529453514e-05, + "loss": 0.07162481546401978, + "step": 54890 + }, + { + "epoch": 7.792760823278921, + "grad_norm": 0.30639684200286865, + "learning_rate": 9.221064584811923e-05, + "loss": 0.04374118745326996, + "step": 54900 + }, + { + "epoch": 7.794180269694819, + "grad_norm": 3.7728283405303955, + "learning_rate": 9.220922640170334e-05, + "loss": 0.04018616378307342, + "step": 54910 + }, + { + "epoch": 7.795599716110717, + "grad_norm": 5.375783443450928, + "learning_rate": 9.220780695528744e-05, + "loss": 0.07194701433181763, + "step": 54920 + }, + { + "epoch": 7.797019162526615, + "grad_norm": 0.9897046089172363, + "learning_rate": 9.220638750887154e-05, + "loss": 0.012161526829004288, + "step": 54930 + }, + { + "epoch": 7.798438608942512, + "grad_norm": 8.78571891784668, + "learning_rate": 9.220496806245564e-05, + "loss": 0.04038854837417603, + "step": 54940 + }, + { + "epoch": 7.79985805535841, + "grad_norm": 6.383355617523193, + "learning_rate": 9.220354861603975e-05, + "loss": 0.043821310997009276, + "step": 54950 + }, + { + "epoch": 7.801277501774308, + "grad_norm": 0.5270785689353943, + "learning_rate": 9.220212916962385e-05, + "loss": 0.039912080764770506, + "step": 54960 + }, + { + "epoch": 7.8026969481902055, + "grad_norm": 1.264320969581604, + "learning_rate": 9.220070972320796e-05, + "loss": 0.03971914649009704, + "step": 54970 + }, + { + "epoch": 7.804116394606104, + "grad_norm": 5.6460862159729, + "learning_rate": 9.219929027679207e-05, + "loss": 0.04472689926624298, + "step": 54980 + }, + { + "epoch": 7.805535841022001, + "grad_norm": 4.065241813659668, + "learning_rate": 9.219787083037615e-05, + "loss": 0.08542245626449585, + "step": 54990 + }, + { + "epoch": 7.8069552874379, + "grad_norm": 1.2237682342529297, + "learning_rate": 9.219645138396026e-05, + "loss": 
0.040656208992004395, + "step": 55000 + }, + { + "epoch": 7.8069552874379, + "eval_accuracy": 0.9778088637375214, + "eval_loss": 0.07491476088762283, + "eval_runtime": 33.0238, + "eval_samples_per_second": 476.233, + "eval_steps_per_second": 14.898, + "step": 55000 + }, + { + "epoch": 7.808374733853797, + "grad_norm": 5.090839862823486, + "learning_rate": 9.219503193754436e-05, + "loss": 0.038296476006507874, + "step": 55010 + }, + { + "epoch": 7.809794180269694, + "grad_norm": 1.009082317352295, + "learning_rate": 9.219361249112847e-05, + "loss": 0.026539346575736998, + "step": 55020 + }, + { + "epoch": 7.811213626685593, + "grad_norm": 2.415933132171631, + "learning_rate": 9.219219304471257e-05, + "loss": 0.05707488656044006, + "step": 55030 + }, + { + "epoch": 7.81263307310149, + "grad_norm": 10.987373352050781, + "learning_rate": 9.219077359829666e-05, + "loss": 0.041009390354156496, + "step": 55040 + }, + { + "epoch": 7.814052519517388, + "grad_norm": 4.925211429595947, + "learning_rate": 9.218935415188076e-05, + "loss": 0.03250816464424133, + "step": 55050 + }, + { + "epoch": 7.815471965933286, + "grad_norm": 0.5724888443946838, + "learning_rate": 9.218793470546487e-05, + "loss": 0.024578140676021577, + "step": 55060 + }, + { + "epoch": 7.816891412349184, + "grad_norm": 5.185868263244629, + "learning_rate": 9.218651525904898e-05, + "loss": 0.11081494092941284, + "step": 55070 + }, + { + "epoch": 7.818310858765082, + "grad_norm": 0.41721123456954956, + "learning_rate": 9.218509581263308e-05, + "loss": 0.02687859833240509, + "step": 55080 + }, + { + "epoch": 7.819730305180979, + "grad_norm": 0.6981793642044067, + "learning_rate": 9.218367636621718e-05, + "loss": 0.043942618370056155, + "step": 55090 + }, + { + "epoch": 7.821149751596877, + "grad_norm": 0.15657079219818115, + "learning_rate": 9.218225691980128e-05, + "loss": 0.022310236096382143, + "step": 55100 + }, + { + "epoch": 7.822569198012775, + "grad_norm": 3.7001686096191406, + "learning_rate": 
9.218083747338539e-05, + "loss": 0.03169908821582794, + "step": 55110 + }, + { + "epoch": 7.823988644428673, + "grad_norm": 0.7200214266777039, + "learning_rate": 9.217941802696948e-05, + "loss": 0.03638424575328827, + "step": 55120 + }, + { + "epoch": 7.8254080908445705, + "grad_norm": 2.6627345085144043, + "learning_rate": 9.21779985805536e-05, + "loss": 0.055549895763397215, + "step": 55130 + }, + { + "epoch": 7.826827537260469, + "grad_norm": 0.5793723464012146, + "learning_rate": 9.217657913413768e-05, + "loss": 0.039405593276023866, + "step": 55140 + }, + { + "epoch": 7.828246983676366, + "grad_norm": 0.7050085067749023, + "learning_rate": 9.217515968772179e-05, + "loss": 0.017613281309604645, + "step": 55150 + }, + { + "epoch": 7.829666430092264, + "grad_norm": 0.11719467490911484, + "learning_rate": 9.21737402413059e-05, + "loss": 0.0698439598083496, + "step": 55160 + }, + { + "epoch": 7.831085876508162, + "grad_norm": 10.498146057128906, + "learning_rate": 9.217232079489e-05, + "loss": 0.046284270286560056, + "step": 55170 + }, + { + "epoch": 7.832505322924059, + "grad_norm": 4.107329368591309, + "learning_rate": 9.217090134847411e-05, + "loss": 0.07234654426574708, + "step": 55180 + }, + { + "epoch": 7.833924769339958, + "grad_norm": 7.7555742263793945, + "learning_rate": 9.21694819020582e-05, + "loss": 0.04864185750484466, + "step": 55190 + }, + { + "epoch": 7.835344215755855, + "grad_norm": 0.09962215274572372, + "learning_rate": 9.21680624556423e-05, + "loss": 0.06997425556182861, + "step": 55200 + }, + { + "epoch": 7.836763662171753, + "grad_norm": 0.10970594733953476, + "learning_rate": 9.21666430092264e-05, + "loss": 0.08814730048179627, + "step": 55210 + }, + { + "epoch": 7.838183108587651, + "grad_norm": 1.0084171295166016, + "learning_rate": 9.216522356281051e-05, + "loss": 0.05179111957550049, + "step": 55220 + }, + { + "epoch": 7.839602555003548, + "grad_norm": 0.7253168821334839, + "learning_rate": 9.216380411639461e-05, + "loss": 
0.0762027621269226, + "step": 55230 + }, + { + "epoch": 7.8410220014194465, + "grad_norm": 0.6522200107574463, + "learning_rate": 9.216238466997871e-05, + "loss": 0.09144155979156494, + "step": 55240 + }, + { + "epoch": 7.842441447835344, + "grad_norm": 0.2461749017238617, + "learning_rate": 9.216096522356282e-05, + "loss": 0.05044819116592407, + "step": 55250 + }, + { + "epoch": 7.843860894251242, + "grad_norm": 0.39150509238243103, + "learning_rate": 9.215954577714692e-05, + "loss": 0.05807398557662964, + "step": 55260 + }, + { + "epoch": 7.84528034066714, + "grad_norm": 0.10233612358570099, + "learning_rate": 9.215812633073103e-05, + "loss": 0.05883774161338806, + "step": 55270 + }, + { + "epoch": 7.846699787083038, + "grad_norm": 0.454196035861969, + "learning_rate": 9.215670688431512e-05, + "loss": 0.05965339541435242, + "step": 55280 + }, + { + "epoch": 7.848119233498935, + "grad_norm": 0.20604932308197021, + "learning_rate": 9.215528743789923e-05, + "loss": 0.03166616261005402, + "step": 55290 + }, + { + "epoch": 7.849538679914833, + "grad_norm": 1.6209155321121216, + "learning_rate": 9.215386799148332e-05, + "loss": 0.009179739654064179, + "step": 55300 + }, + { + "epoch": 7.850958126330731, + "grad_norm": 0.06912713497877121, + "learning_rate": 9.215244854506743e-05, + "loss": 0.014850091934204102, + "step": 55310 + }, + { + "epoch": 7.8523775727466285, + "grad_norm": 4.286614418029785, + "learning_rate": 9.215102909865153e-05, + "loss": 0.018069779872894286, + "step": 55320 + }, + { + "epoch": 7.853797019162527, + "grad_norm": 0.020820684731006622, + "learning_rate": 9.214960965223564e-05, + "loss": 0.017675217986106873, + "step": 55330 + }, + { + "epoch": 7.855216465578424, + "grad_norm": 0.03339977562427521, + "learning_rate": 9.214819020581974e-05, + "loss": 0.03491029143333435, + "step": 55340 + }, + { + "epoch": 7.8566359119943225, + "grad_norm": 1.0540772676467896, + "learning_rate": 9.214677075940383e-05, + "loss": 0.05180479884147644, + "step": 
55350 + }, + { + "epoch": 7.85805535841022, + "grad_norm": 6.752560615539551, + "learning_rate": 9.214535131298794e-05, + "loss": 0.0732165277004242, + "step": 55360 + }, + { + "epoch": 7.859474804826117, + "grad_norm": 6.2393341064453125, + "learning_rate": 9.214393186657204e-05, + "loss": 0.039051464200019835, + "step": 55370 + }, + { + "epoch": 7.860894251242016, + "grad_norm": 0.16125303506851196, + "learning_rate": 9.214251242015615e-05, + "loss": 0.06089925169944763, + "step": 55380 + }, + { + "epoch": 7.862313697657913, + "grad_norm": 0.10348144173622131, + "learning_rate": 9.214109297374025e-05, + "loss": 0.035461637377738955, + "step": 55390 + }, + { + "epoch": 7.863733144073811, + "grad_norm": 0.2502758800983429, + "learning_rate": 9.213967352732435e-05, + "loss": 0.030822911858558656, + "step": 55400 + }, + { + "epoch": 7.865152590489709, + "grad_norm": 0.2445206493139267, + "learning_rate": 9.213825408090844e-05, + "loss": 0.1120613694190979, + "step": 55410 + }, + { + "epoch": 7.866572036905607, + "grad_norm": 2.0001306533813477, + "learning_rate": 9.213683463449255e-05, + "loss": 0.027965742349624633, + "step": 55420 + }, + { + "epoch": 7.8679914833215046, + "grad_norm": 8.780298233032227, + "learning_rate": 9.213541518807665e-05, + "loss": 0.06586897969245911, + "step": 55430 + }, + { + "epoch": 7.869410929737402, + "grad_norm": 0.05981897935271263, + "learning_rate": 9.213399574166076e-05, + "loss": 0.03677443265914917, + "step": 55440 + }, + { + "epoch": 7.8708303761533, + "grad_norm": 6.485029220581055, + "learning_rate": 9.213257629524486e-05, + "loss": 0.05677640438079834, + "step": 55450 + }, + { + "epoch": 7.872249822569198, + "grad_norm": 0.5603395700454712, + "learning_rate": 9.213115684882896e-05, + "loss": 0.02789378762245178, + "step": 55460 + }, + { + "epoch": 7.873669268985096, + "grad_norm": 0.6483574509620667, + "learning_rate": 9.212973740241307e-05, + "loss": 0.03915688693523407, + "step": 55470 + }, + { + "epoch": 
7.875088715400993, + "grad_norm": 9.088214874267578, + "learning_rate": 9.212831795599717e-05, + "loss": 0.03713131546974182, + "step": 55480 + }, + { + "epoch": 7.876508161816892, + "grad_norm": 6.221549034118652, + "learning_rate": 9.212689850958128e-05, + "loss": 0.02524724304676056, + "step": 55490 + }, + { + "epoch": 7.877927608232789, + "grad_norm": 0.056300897151231766, + "learning_rate": 9.212547906316536e-05, + "loss": 0.039009875059127806, + "step": 55500 + }, + { + "epoch": 7.877927608232789, + "eval_accuracy": 0.9793349017613022, + "eval_loss": 0.067763552069664, + "eval_runtime": 32.7453, + "eval_samples_per_second": 480.282, + "eval_steps_per_second": 15.025, + "step": 55500 + }, + { + "epoch": 7.879347054648687, + "grad_norm": 12.098105430603027, + "learning_rate": 9.212405961674947e-05, + "loss": 0.0938036561012268, + "step": 55510 + }, + { + "epoch": 7.880766501064585, + "grad_norm": 4.351677894592285, + "learning_rate": 9.212264017033357e-05, + "loss": 0.1319635510444641, + "step": 55520 + }, + { + "epoch": 7.882185947480482, + "grad_norm": 13.525969505310059, + "learning_rate": 9.212122072391768e-05, + "loss": 0.10193095207214356, + "step": 55530 + }, + { + "epoch": 7.883605393896381, + "grad_norm": 3.3304812908172607, + "learning_rate": 9.211980127750178e-05, + "loss": 0.0709221601486206, + "step": 55540 + }, + { + "epoch": 7.885024840312278, + "grad_norm": 1.3782596588134766, + "learning_rate": 9.211838183108588e-05, + "loss": 0.0606769323348999, + "step": 55550 + }, + { + "epoch": 7.886444286728176, + "grad_norm": 0.3674951195716858, + "learning_rate": 9.211696238466999e-05, + "loss": 0.044303598999977115, + "step": 55560 + }, + { + "epoch": 7.887863733144074, + "grad_norm": 1.6610469818115234, + "learning_rate": 9.211554293825408e-05, + "loss": 0.04471073746681213, + "step": 55570 + }, + { + "epoch": 7.889283179559971, + "grad_norm": 0.8602482676506042, + "learning_rate": 9.21141234918382e-05, + "loss": 0.030886751413345338, + "step": 55580 + 
}, + { + "epoch": 7.8907026259758695, + "grad_norm": 2.7088918685913086, + "learning_rate": 9.211270404542229e-05, + "loss": 0.06906713843345642, + "step": 55590 + }, + { + "epoch": 7.892122072391767, + "grad_norm": 0.3230050504207611, + "learning_rate": 9.211128459900639e-05, + "loss": 0.053826934099197386, + "step": 55600 + }, + { + "epoch": 7.893541518807665, + "grad_norm": 0.27713441848754883, + "learning_rate": 9.210986515259049e-05, + "loss": 0.04268667995929718, + "step": 55610 + }, + { + "epoch": 7.894960965223563, + "grad_norm": 12.684019088745117, + "learning_rate": 9.21084457061746e-05, + "loss": 0.03835551738739014, + "step": 55620 + }, + { + "epoch": 7.896380411639461, + "grad_norm": 0.29225078225135803, + "learning_rate": 9.21070262597587e-05, + "loss": 0.08084606528282165, + "step": 55630 + }, + { + "epoch": 7.897799858055358, + "grad_norm": 0.03829476609826088, + "learning_rate": 9.21056068133428e-05, + "loss": 0.06641941666603088, + "step": 55640 + }, + { + "epoch": 7.899219304471256, + "grad_norm": 0.5478501915931702, + "learning_rate": 9.21041873669269e-05, + "loss": 0.06162059307098389, + "step": 55650 + }, + { + "epoch": 7.900638750887154, + "grad_norm": 2.589578628540039, + "learning_rate": 9.2102767920511e-05, + "loss": 0.025594592094421387, + "step": 55660 + }, + { + "epoch": 7.9020581973030515, + "grad_norm": 7.136966228485107, + "learning_rate": 9.210134847409511e-05, + "loss": 0.0472207635641098, + "step": 55670 + }, + { + "epoch": 7.90347764371895, + "grad_norm": 5.2671966552734375, + "learning_rate": 9.209992902767921e-05, + "loss": 0.06289007067680359, + "step": 55680 + }, + { + "epoch": 7.904897090134847, + "grad_norm": 1.7476320266723633, + "learning_rate": 9.209850958126332e-05, + "loss": 0.02860119938850403, + "step": 55690 + }, + { + "epoch": 7.9063165365507455, + "grad_norm": 1.2390192747116089, + "learning_rate": 9.209709013484742e-05, + "loss": 0.07715204358100891, + "step": 55700 + }, + { + "epoch": 7.907735982966643, + 
"grad_norm": 7.412806510925293, + "learning_rate": 9.209567068843151e-05, + "loss": 0.05441153049468994, + "step": 55710 + }, + { + "epoch": 7.90915542938254, + "grad_norm": 0.0410551056265831, + "learning_rate": 9.209425124201561e-05, + "loss": 0.03540098369121551, + "step": 55720 + }, + { + "epoch": 7.910574875798439, + "grad_norm": 0.848318874835968, + "learning_rate": 9.209283179559972e-05, + "loss": 0.04798963665962219, + "step": 55730 + }, + { + "epoch": 7.911994322214336, + "grad_norm": 0.9137446284294128, + "learning_rate": 9.209141234918382e-05, + "loss": 0.046340417861938474, + "step": 55740 + }, + { + "epoch": 7.913413768630234, + "grad_norm": 0.49675452709198, + "learning_rate": 9.208999290276793e-05, + "loss": 0.03758726119995117, + "step": 55750 + }, + { + "epoch": 7.914833215046132, + "grad_norm": 8.145282745361328, + "learning_rate": 9.208857345635203e-05, + "loss": 0.01985916793346405, + "step": 55760 + }, + { + "epoch": 7.91625266146203, + "grad_norm": 0.5327089428901672, + "learning_rate": 9.208715400993613e-05, + "loss": 0.0250131219625473, + "step": 55770 + }, + { + "epoch": 7.9176721078779275, + "grad_norm": 6.69525146484375, + "learning_rate": 9.208573456352024e-05, + "loss": 0.06276538968086243, + "step": 55780 + }, + { + "epoch": 7.919091554293825, + "grad_norm": 2.453524589538574, + "learning_rate": 9.208431511710433e-05, + "loss": 0.05585165619850159, + "step": 55790 + }, + { + "epoch": 7.920511000709723, + "grad_norm": 1.1722917556762695, + "learning_rate": 9.208289567068844e-05, + "loss": 0.03681559562683105, + "step": 55800 + }, + { + "epoch": 7.921930447125621, + "grad_norm": 2.173949718475342, + "learning_rate": 9.208147622427253e-05, + "loss": 0.0207523837685585, + "step": 55810 + }, + { + "epoch": 7.923349893541519, + "grad_norm": 1.8896828889846802, + "learning_rate": 9.208005677785664e-05, + "loss": 0.012036536633968354, + "step": 55820 + }, + { + "epoch": 7.924769339957416, + "grad_norm": 12.876811981201172, + "learning_rate": 
9.207863733144074e-05, + "loss": 0.06892849206924438, + "step": 55830 + }, + { + "epoch": 7.926188786373315, + "grad_norm": 8.169452667236328, + "learning_rate": 9.207721788502485e-05, + "loss": 0.019931772351264955, + "step": 55840 + }, + { + "epoch": 7.927608232789212, + "grad_norm": 9.522194862365723, + "learning_rate": 9.207594038325053e-05, + "loss": 0.07275225520133972, + "step": 55850 + }, + { + "epoch": 7.9290276792051095, + "grad_norm": 4.947075366973877, + "learning_rate": 9.207452093683464e-05, + "loss": 0.01999004781246185, + "step": 55860 + }, + { + "epoch": 7.930447125621008, + "grad_norm": 0.13232362270355225, + "learning_rate": 9.207310149041874e-05, + "loss": 0.03356763422489166, + "step": 55870 + }, + { + "epoch": 7.931866572036905, + "grad_norm": 7.417023658752441, + "learning_rate": 9.207168204400284e-05, + "loss": 0.023995618522167205, + "step": 55880 + }, + { + "epoch": 7.933286018452804, + "grad_norm": 0.10036950558423996, + "learning_rate": 9.207026259758694e-05, + "loss": 0.05645661950111389, + "step": 55890 + }, + { + "epoch": 7.934705464868701, + "grad_norm": 3.993074417114258, + "learning_rate": 9.206884315117105e-05, + "loss": 0.05878961682319641, + "step": 55900 + }, + { + "epoch": 7.936124911284599, + "grad_norm": 6.879668235778809, + "learning_rate": 9.206742370475516e-05, + "loss": 0.04947432279586792, + "step": 55910 + }, + { + "epoch": 7.937544357700497, + "grad_norm": 1.353629469871521, + "learning_rate": 9.206600425833926e-05, + "loss": 0.06450677514076233, + "step": 55920 + }, + { + "epoch": 7.938963804116394, + "grad_norm": 1.9450629949569702, + "learning_rate": 9.206458481192335e-05, + "loss": 0.08942593336105346, + "step": 55930 + }, + { + "epoch": 7.940383250532292, + "grad_norm": 13.481873512268066, + "learning_rate": 9.206316536550745e-05, + "loss": 0.04448253512382507, + "step": 55940 + }, + { + "epoch": 7.94180269694819, + "grad_norm": 5.543252468109131, + "learning_rate": 9.206174591909156e-05, + "loss": 
0.1036454200744629, + "step": 55950 + }, + { + "epoch": 7.943222143364088, + "grad_norm": 4.105419158935547, + "learning_rate": 9.206032647267566e-05, + "loss": 0.06337498426437378, + "step": 55960 + }, + { + "epoch": 7.944641589779986, + "grad_norm": 0.6640766263008118, + "learning_rate": 9.205890702625977e-05, + "loss": 0.03773975372314453, + "step": 55970 + }, + { + "epoch": 7.946061036195884, + "grad_norm": 2.163017988204956, + "learning_rate": 9.205748757984387e-05, + "loss": 0.09545602798461914, + "step": 55980 + }, + { + "epoch": 7.947480482611781, + "grad_norm": 6.420900821685791, + "learning_rate": 9.205606813342796e-05, + "loss": 0.05853666663169861, + "step": 55990 + }, + { + "epoch": 7.948899929027679, + "grad_norm": 6.487877368927002, + "learning_rate": 9.205464868701207e-05, + "loss": 0.057965916395187375, + "step": 56000 + }, + { + "epoch": 7.948899929027679, + "eval_accuracy": 0.9719590513130285, + "eval_loss": 0.09048442542552948, + "eval_runtime": 34.3429, + "eval_samples_per_second": 457.94, + "eval_steps_per_second": 14.326, + "step": 56000 + }, + { + "epoch": 7.950319375443577, + "grad_norm": 0.12126820534467697, + "learning_rate": 9.205322924059617e-05, + "loss": 0.04208188056945801, + "step": 56010 + }, + { + "epoch": 7.9517388218594744, + "grad_norm": 1.2649909257888794, + "learning_rate": 9.205180979418028e-05, + "loss": 0.041204127669334414, + "step": 56020 + }, + { + "epoch": 7.953158268275373, + "grad_norm": 0.5406652092933655, + "learning_rate": 9.205039034776438e-05, + "loss": 0.013593432307243348, + "step": 56030 + }, + { + "epoch": 7.95457771469127, + "grad_norm": 0.8398789763450623, + "learning_rate": 9.204897090134848e-05, + "loss": 0.024010343849658965, + "step": 56040 + }, + { + "epoch": 7.9559971611071685, + "grad_norm": 8.085817337036133, + "learning_rate": 9.204755145493258e-05, + "loss": 0.039969196915626524, + "step": 56050 + }, + { + "epoch": 7.957416607523066, + "grad_norm": 5.8064398765563965, + "learning_rate": 
9.204613200851669e-05, + "loss": 0.05945647358894348, + "step": 56060 + }, + { + "epoch": 7.958836053938963, + "grad_norm": 5.265396595001221, + "learning_rate": 9.204471256210078e-05, + "loss": 0.049289605021476744, + "step": 56070 + }, + { + "epoch": 7.960255500354862, + "grad_norm": 0.12839733064174652, + "learning_rate": 9.20432931156849e-05, + "loss": 0.05018383264541626, + "step": 56080 + }, + { + "epoch": 7.961674946770759, + "grad_norm": 2.0961010456085205, + "learning_rate": 9.204187366926899e-05, + "loss": 0.06823894381523132, + "step": 56090 + }, + { + "epoch": 7.963094393186657, + "grad_norm": 4.042204856872559, + "learning_rate": 9.204045422285309e-05, + "loss": 0.04348786175251007, + "step": 56100 + }, + { + "epoch": 7.964513839602555, + "grad_norm": 5.022154331207275, + "learning_rate": 9.20390347764372e-05, + "loss": 0.05021085143089295, + "step": 56110 + }, + { + "epoch": 7.965933286018453, + "grad_norm": 3.282322406768799, + "learning_rate": 9.20376153300213e-05, + "loss": 0.0450539231300354, + "step": 56120 + }, + { + "epoch": 7.9673527324343505, + "grad_norm": 7.373341083526611, + "learning_rate": 9.203619588360541e-05, + "loss": 0.044395309686660764, + "step": 56130 + }, + { + "epoch": 7.968772178850248, + "grad_norm": 5.55653715133667, + "learning_rate": 9.203477643718949e-05, + "loss": 0.034415480494499204, + "step": 56140 + }, + { + "epoch": 7.970191625266146, + "grad_norm": 4.097558975219727, + "learning_rate": 9.20333569907736e-05, + "loss": 0.08280274868011475, + "step": 56150 + }, + { + "epoch": 7.971611071682044, + "grad_norm": 0.11635271459817886, + "learning_rate": 9.20319375443577e-05, + "loss": 0.010257638990879059, + "step": 56160 + }, + { + "epoch": 7.973030518097942, + "grad_norm": 0.73923659324646, + "learning_rate": 9.203051809794181e-05, + "loss": 0.02687770426273346, + "step": 56170 + }, + { + "epoch": 7.974449964513839, + "grad_norm": 0.6702300310134888, + "learning_rate": 9.202909865152591e-05, + "loss": 
0.053887850046157836, + "step": 56180 + }, + { + "epoch": 7.975869410929738, + "grad_norm": 2.243748664855957, + "learning_rate": 9.202767920511e-05, + "loss": 0.03436025381088257, + "step": 56190 + }, + { + "epoch": 7.977288857345635, + "grad_norm": 0.31168264150619507, + "learning_rate": 9.202625975869412e-05, + "loss": 0.08324195742607117, + "step": 56200 + }, + { + "epoch": 7.9787083037615325, + "grad_norm": 0.38794392347335815, + "learning_rate": 9.202484031227821e-05, + "loss": 0.026874610781669618, + "step": 56210 + }, + { + "epoch": 7.980127750177431, + "grad_norm": 1.8175753355026245, + "learning_rate": 9.202342086586233e-05, + "loss": 0.020836614072322845, + "step": 56220 + }, + { + "epoch": 7.981547196593328, + "grad_norm": 0.1702578067779541, + "learning_rate": 9.202200141944642e-05, + "loss": 0.09062458276748657, + "step": 56230 + }, + { + "epoch": 7.9829666430092265, + "grad_norm": 4.012740612030029, + "learning_rate": 9.202058197303052e-05, + "loss": 0.054927331209182736, + "step": 56240 + }, + { + "epoch": 7.984386089425124, + "grad_norm": 1.2998656034469604, + "learning_rate": 9.201916252661462e-05, + "loss": 0.01595239043235779, + "step": 56250 + }, + { + "epoch": 7.985805535841022, + "grad_norm": 0.5248915553092957, + "learning_rate": 9.201774308019873e-05, + "loss": 0.054577767848968506, + "step": 56260 + }, + { + "epoch": 7.98722498225692, + "grad_norm": 8.47024154663086, + "learning_rate": 9.201632363378283e-05, + "loss": 0.07780020236968994, + "step": 56270 + }, + { + "epoch": 7.988644428672818, + "grad_norm": 0.10993131995201111, + "learning_rate": 9.201490418736694e-05, + "loss": 0.006740601360797882, + "step": 56280 + }, + { + "epoch": 7.990063875088715, + "grad_norm": 2.291124105453491, + "learning_rate": 9.201348474095103e-05, + "loss": 0.0112013079226017, + "step": 56290 + }, + { + "epoch": 7.991483321504613, + "grad_norm": 1.062869906425476, + "learning_rate": 9.201206529453513e-05, + "loss": 0.013367721438407898, + "step": 56300 + }, 
+ { + "epoch": 7.992902767920511, + "grad_norm": 0.40215814113616943, + "learning_rate": 9.201064584811924e-05, + "loss": 0.01611262857913971, + "step": 56310 + }, + { + "epoch": 7.9943222143364085, + "grad_norm": 2.8733534812927246, + "learning_rate": 9.200922640170334e-05, + "loss": 0.010476227104663848, + "step": 56320 + }, + { + "epoch": 7.995741660752307, + "grad_norm": 4.937908172607422, + "learning_rate": 9.200780695528745e-05, + "loss": 0.018619158864021303, + "step": 56330 + }, + { + "epoch": 7.997161107168204, + "grad_norm": 6.0632829666137695, + "learning_rate": 9.200638750887155e-05, + "loss": 0.024484434723854066, + "step": 56340 + }, + { + "epoch": 7.998580553584103, + "grad_norm": 0.44283390045166016, + "learning_rate": 9.200496806245565e-05, + "loss": 0.005785078555345535, + "step": 56350 + }, + { + "epoch": 8.0, + "grad_norm": 10.04196548461914, + "learning_rate": 9.200354861603974e-05, + "loss": 0.03202285170555115, + "step": 56360 + }, + { + "epoch": 8.001419446415898, + "grad_norm": 3.1271493434906006, + "learning_rate": 9.200212916962385e-05, + "loss": 0.00439850240945816, + "step": 56370 + }, + { + "epoch": 8.002838892831795, + "grad_norm": 6.92768669128418, + "learning_rate": 9.200070972320795e-05, + "loss": 0.04164240658283234, + "step": 56380 + }, + { + "epoch": 8.004258339247693, + "grad_norm": 1.7308955192565918, + "learning_rate": 9.199929027679206e-05, + "loss": 0.018328216671943665, + "step": 56390 + }, + { + "epoch": 8.005677785663591, + "grad_norm": 3.8155698776245117, + "learning_rate": 9.199787083037616e-05, + "loss": 0.05507233142852783, + "step": 56400 + }, + { + "epoch": 8.00709723207949, + "grad_norm": 1.0746243000030518, + "learning_rate": 9.199645138396026e-05, + "loss": 0.044785207509994505, + "step": 56410 + }, + { + "epoch": 8.008516678495386, + "grad_norm": 3.7036960124969482, + "learning_rate": 9.199503193754437e-05, + "loss": 0.08312212824821472, + "step": 56420 + }, + { + "epoch": 8.009936124911285, + "grad_norm": 
0.4116200804710388, + "learning_rate": 9.199361249112847e-05, + "loss": 0.06781967878341674, + "step": 56430 + }, + { + "epoch": 8.011355571327183, + "grad_norm": 1.766020655632019, + "learning_rate": 9.199219304471258e-05, + "loss": 0.05059321522712708, + "step": 56440 + }, + { + "epoch": 8.01277501774308, + "grad_norm": 8.156139373779297, + "learning_rate": 9.199077359829666e-05, + "loss": 0.042923194169998166, + "step": 56450 + }, + { + "epoch": 8.014194464158978, + "grad_norm": 7.318849086761475, + "learning_rate": 9.198935415188077e-05, + "loss": 0.022391645610332488, + "step": 56460 + }, + { + "epoch": 8.015613910574876, + "grad_norm": 7.450283527374268, + "learning_rate": 9.198793470546487e-05, + "loss": 0.0307634174823761, + "step": 56470 + }, + { + "epoch": 8.017033356990774, + "grad_norm": 0.19863024353981018, + "learning_rate": 9.198651525904898e-05, + "loss": 0.02928740680217743, + "step": 56480 + }, + { + "epoch": 8.01845280340667, + "grad_norm": 3.1025230884552, + "learning_rate": 9.198509581263308e-05, + "loss": 0.031206589937210084, + "step": 56490 + }, + { + "epoch": 8.01987224982257, + "grad_norm": 0.772976279258728, + "learning_rate": 9.198367636621717e-05, + "loss": 0.022878825664520264, + "step": 56500 + }, + { + "epoch": 8.01987224982257, + "eval_accuracy": 0.9783811279964393, + "eval_loss": 0.06529545783996582, + "eval_runtime": 32.6501, + "eval_samples_per_second": 481.682, + "eval_steps_per_second": 15.069, + "step": 56500 + }, + { + "epoch": 8.021291696238467, + "grad_norm": 0.32695847749710083, + "learning_rate": 9.198225691980129e-05, + "loss": 0.02905186414718628, + "step": 56510 + }, + { + "epoch": 8.022711142654364, + "grad_norm": 7.177985668182373, + "learning_rate": 9.198083747338538e-05, + "loss": 0.047767966985702515, + "step": 56520 + }, + { + "epoch": 8.024130589070262, + "grad_norm": 3.370894432067871, + "learning_rate": 9.19794180269695e-05, + "loss": 0.03640216886997223, + "step": 56530 + }, + { + "epoch": 8.02555003548616, + 
"grad_norm": 6.081117153167725, + "learning_rate": 9.197799858055359e-05, + "loss": 0.04813358187675476, + "step": 56540 + }, + { + "epoch": 8.026969481902059, + "grad_norm": 0.587363600730896, + "learning_rate": 9.197657913413769e-05, + "loss": 0.04699629247188568, + "step": 56550 + }, + { + "epoch": 8.028388928317955, + "grad_norm": 2.6055822372436523, + "learning_rate": 9.197515968772179e-05, + "loss": 0.012260462343692779, + "step": 56560 + }, + { + "epoch": 8.029808374733854, + "grad_norm": 1.192359447479248, + "learning_rate": 9.19737402413059e-05, + "loss": 0.08580980896949768, + "step": 56570 + }, + { + "epoch": 8.031227821149752, + "grad_norm": 2.338804006576538, + "learning_rate": 9.197232079489e-05, + "loss": 0.04305451214313507, + "step": 56580 + }, + { + "epoch": 8.032647267565649, + "grad_norm": 1.1648589372634888, + "learning_rate": 9.19709013484741e-05, + "loss": 0.027252352237701415, + "step": 56590 + }, + { + "epoch": 8.034066713981547, + "grad_norm": 0.39368653297424316, + "learning_rate": 9.19694819020582e-05, + "loss": 0.027997153997421264, + "step": 56600 + }, + { + "epoch": 8.035486160397445, + "grad_norm": 12.663429260253906, + "learning_rate": 9.19680624556423e-05, + "loss": 0.09810233116149902, + "step": 56610 + }, + { + "epoch": 8.036905606813344, + "grad_norm": 0.03624679520726204, + "learning_rate": 9.196664300922641e-05, + "loss": 0.05567052960395813, + "step": 56620 + }, + { + "epoch": 8.03832505322924, + "grad_norm": 2.5650811195373535, + "learning_rate": 9.196522356281051e-05, + "loss": 0.044278931617736814, + "step": 56630 + }, + { + "epoch": 8.039744499645138, + "grad_norm": 4.6383161544799805, + "learning_rate": 9.196380411639462e-05, + "loss": 0.05508902668952942, + "step": 56640 + }, + { + "epoch": 8.041163946061037, + "grad_norm": 0.8782206177711487, + "learning_rate": 9.19623846699787e-05, + "loss": 0.0375711590051651, + "step": 56650 + }, + { + "epoch": 8.042583392476933, + "grad_norm": 0.17253684997558594, + 
"learning_rate": 9.196096522356281e-05, + "loss": 0.02836502194404602, + "step": 56660 + }, + { + "epoch": 8.044002838892832, + "grad_norm": 0.18491551280021667, + "learning_rate": 9.195954577714691e-05, + "loss": 0.007738977670669556, + "step": 56670 + }, + { + "epoch": 8.04542228530873, + "grad_norm": 4.824859619140625, + "learning_rate": 9.195812633073102e-05, + "loss": 0.06535216569900512, + "step": 56680 + }, + { + "epoch": 8.046841731724628, + "grad_norm": 0.2667035758495331, + "learning_rate": 9.195670688431512e-05, + "loss": 0.051838308572769165, + "step": 56690 + }, + { + "epoch": 8.048261178140525, + "grad_norm": 1.4201091527938843, + "learning_rate": 9.195528743789923e-05, + "loss": 0.010924571752548217, + "step": 56700 + }, + { + "epoch": 8.049680624556423, + "grad_norm": 5.541697025299072, + "learning_rate": 9.195386799148333e-05, + "loss": 0.03459354043006897, + "step": 56710 + }, + { + "epoch": 8.051100070972321, + "grad_norm": 6.982559680938721, + "learning_rate": 9.195244854506742e-05, + "loss": 0.05080728530883789, + "step": 56720 + }, + { + "epoch": 8.052519517388218, + "grad_norm": 0.2359095960855484, + "learning_rate": 9.195102909865154e-05, + "loss": 0.004116867855191231, + "step": 56730 + }, + { + "epoch": 8.053938963804116, + "grad_norm": 4.762861251831055, + "learning_rate": 9.194960965223563e-05, + "loss": 0.030681657791137695, + "step": 56740 + }, + { + "epoch": 8.055358410220014, + "grad_norm": 0.20742924511432648, + "learning_rate": 9.194819020581974e-05, + "loss": 0.028750818967819215, + "step": 56750 + }, + { + "epoch": 8.056777856635913, + "grad_norm": 0.384204238653183, + "learning_rate": 9.194677075940383e-05, + "loss": 0.05377202033996582, + "step": 56760 + }, + { + "epoch": 8.05819730305181, + "grad_norm": 0.2828398644924164, + "learning_rate": 9.194535131298794e-05, + "loss": 0.053729516267776486, + "step": 56770 + }, + { + "epoch": 8.059616749467708, + "grad_norm": 2.284846305847168, + "learning_rate": 9.194393186657204e-05, + 
"loss": 0.029895415902137755, + "step": 56780 + }, + { + "epoch": 8.061036195883606, + "grad_norm": 2.943188190460205, + "learning_rate": 9.194251242015615e-05, + "loss": 0.04509938657283783, + "step": 56790 + }, + { + "epoch": 8.062455642299502, + "grad_norm": 0.010644367896020412, + "learning_rate": 9.194109297374024e-05, + "loss": 0.019495250284671785, + "step": 56800 + }, + { + "epoch": 8.0638750887154, + "grad_norm": 6.129455089569092, + "learning_rate": 9.193967352732434e-05, + "loss": 0.043457993865013124, + "step": 56810 + }, + { + "epoch": 8.065294535131299, + "grad_norm": 0.03838249295949936, + "learning_rate": 9.193825408090845e-05, + "loss": 0.02848859131336212, + "step": 56820 + }, + { + "epoch": 8.066713981547197, + "grad_norm": 1.1387866735458374, + "learning_rate": 9.193683463449255e-05, + "loss": 0.018293626606464386, + "step": 56830 + }, + { + "epoch": 8.068133427963094, + "grad_norm": 4.466863632202148, + "learning_rate": 9.193541518807666e-05, + "loss": 0.0418965220451355, + "step": 56840 + }, + { + "epoch": 8.069552874378992, + "grad_norm": 0.7588008642196655, + "learning_rate": 9.193399574166076e-05, + "loss": 0.03291020691394806, + "step": 56850 + }, + { + "epoch": 8.07097232079489, + "grad_norm": 11.674447059631348, + "learning_rate": 9.193257629524486e-05, + "loss": 0.035113191604614256, + "step": 56860 + }, + { + "epoch": 8.072391767210787, + "grad_norm": 4.501001358032227, + "learning_rate": 9.193115684882895e-05, + "loss": 0.0164683535695076, + "step": 56870 + }, + { + "epoch": 8.073811213626685, + "grad_norm": 0.5031645894050598, + "learning_rate": 9.192973740241306e-05, + "loss": 0.049401518702507016, + "step": 56880 + }, + { + "epoch": 8.075230660042584, + "grad_norm": 0.4161587655544281, + "learning_rate": 9.192831795599716e-05, + "loss": 0.041995969414710996, + "step": 56890 + }, + { + "epoch": 8.076650106458482, + "grad_norm": 1.415582299232483, + "learning_rate": 9.192689850958127e-05, + "loss": 0.007475908100605011, + "step": 
56900 + }, + { + "epoch": 8.078069552874378, + "grad_norm": 13.384356498718262, + "learning_rate": 9.192547906316537e-05, + "loss": 0.0641768991947174, + "step": 56910 + }, + { + "epoch": 8.079488999290277, + "grad_norm": 2.316368818283081, + "learning_rate": 9.192405961674947e-05, + "loss": 0.019245807826519013, + "step": 56920 + }, + { + "epoch": 8.080908445706175, + "grad_norm": 12.386263847351074, + "learning_rate": 9.192264017033358e-05, + "loss": 0.023176319897174835, + "step": 56930 + }, + { + "epoch": 8.082327892122072, + "grad_norm": 1.6595792770385742, + "learning_rate": 9.192122072391768e-05, + "loss": 0.056136542558670045, + "step": 56940 + }, + { + "epoch": 8.08374733853797, + "grad_norm": 8.082763671875, + "learning_rate": 9.191980127750179e-05, + "loss": 0.08039953112602234, + "step": 56950 + }, + { + "epoch": 8.085166784953868, + "grad_norm": 2.5632402896881104, + "learning_rate": 9.191838183108587e-05, + "loss": 0.03842626512050629, + "step": 56960 + }, + { + "epoch": 8.086586231369767, + "grad_norm": 1.1970226764678955, + "learning_rate": 9.191696238466998e-05, + "loss": 0.018404172360897066, + "step": 56970 + }, + { + "epoch": 8.088005677785663, + "grad_norm": 3.487342119216919, + "learning_rate": 9.191554293825408e-05, + "loss": 0.013470767438411713, + "step": 56980 + }, + { + "epoch": 8.089425124201561, + "grad_norm": 0.19747193157672882, + "learning_rate": 9.191412349183819e-05, + "loss": 0.03778769075870514, + "step": 56990 + }, + { + "epoch": 8.09084457061746, + "grad_norm": 0.46198809146881104, + "learning_rate": 9.191270404542229e-05, + "loss": 0.029198932647705077, + "step": 57000 + }, + { + "epoch": 8.09084457061746, + "eval_accuracy": 0.983849430914987, + "eval_loss": 0.050795987248420715, + "eval_runtime": 33.4179, + "eval_samples_per_second": 470.616, + "eval_steps_per_second": 14.723, + "step": 57000 + }, + { + "epoch": 8.092264017033356, + "grad_norm": 1.7470345497131348, + "learning_rate": 9.191128459900638e-05, + "loss": 
0.028783124685287476, + "step": 57010 + }, + { + "epoch": 8.093683463449254, + "grad_norm": 10.447305679321289, + "learning_rate": 9.19098651525905e-05, + "loss": 0.03660332858562469, + "step": 57020 + }, + { + "epoch": 8.095102909865153, + "grad_norm": 2.9325790405273438, + "learning_rate": 9.190844570617459e-05, + "loss": 0.006524309515953064, + "step": 57030 + }, + { + "epoch": 8.096522356281051, + "grad_norm": 0.6765064597129822, + "learning_rate": 9.19070262597587e-05, + "loss": 0.04815886616706848, + "step": 57040 + }, + { + "epoch": 8.097941802696948, + "grad_norm": 0.4403517544269562, + "learning_rate": 9.19056068133428e-05, + "loss": 0.009912458062171937, + "step": 57050 + }, + { + "epoch": 8.099361249112846, + "grad_norm": 0.5875139236450195, + "learning_rate": 9.190418736692691e-05, + "loss": 0.09076798558235169, + "step": 57060 + }, + { + "epoch": 8.100780695528744, + "grad_norm": 0.27696019411087036, + "learning_rate": 9.1902767920511e-05, + "loss": 0.022283504903316497, + "step": 57070 + }, + { + "epoch": 8.10220014194464, + "grad_norm": 0.488571435213089, + "learning_rate": 9.19013484740951e-05, + "loss": 0.01099751442670822, + "step": 57080 + }, + { + "epoch": 8.103619588360539, + "grad_norm": 0.2157941311597824, + "learning_rate": 9.18999290276792e-05, + "loss": 0.018986338376998903, + "step": 57090 + }, + { + "epoch": 8.105039034776437, + "grad_norm": 0.009966165758669376, + "learning_rate": 9.189850958126331e-05, + "loss": 0.03489084541797638, + "step": 57100 + }, + { + "epoch": 8.106458481192336, + "grad_norm": 0.18608416616916656, + "learning_rate": 9.189709013484741e-05, + "loss": 0.055800986289978025, + "step": 57110 + }, + { + "epoch": 8.107877927608232, + "grad_norm": 0.023776527494192123, + "learning_rate": 9.189567068843151e-05, + "loss": 0.050034058094024655, + "step": 57120 + }, + { + "epoch": 8.10929737402413, + "grad_norm": 1.362025260925293, + "learning_rate": 9.189425124201562e-05, + "loss": 0.018491455912590028, + "step": 57130 + 
}, + { + "epoch": 8.110716820440029, + "grad_norm": 15.343968391418457, + "learning_rate": 9.189283179559972e-05, + "loss": 0.10932686328887939, + "step": 57140 + }, + { + "epoch": 8.112136266855925, + "grad_norm": 0.018440308049321175, + "learning_rate": 9.189141234918383e-05, + "loss": 0.046657025814056396, + "step": 57150 + }, + { + "epoch": 8.113555713271824, + "grad_norm": 0.047869276255369186, + "learning_rate": 9.188999290276793e-05, + "loss": 0.03132939338684082, + "step": 57160 + }, + { + "epoch": 8.114975159687722, + "grad_norm": 0.28167399764060974, + "learning_rate": 9.188857345635202e-05, + "loss": 0.03517512679100036, + "step": 57170 + }, + { + "epoch": 8.11639460610362, + "grad_norm": 1.3338134288787842, + "learning_rate": 9.188715400993612e-05, + "loss": 0.057652842998504636, + "step": 57180 + }, + { + "epoch": 8.117814052519517, + "grad_norm": 0.7916436791419983, + "learning_rate": 9.188573456352023e-05, + "loss": 0.01187950223684311, + "step": 57190 + }, + { + "epoch": 8.119233498935415, + "grad_norm": 10.301247596740723, + "learning_rate": 9.188431511710433e-05, + "loss": 0.027448675036430357, + "step": 57200 + }, + { + "epoch": 8.120652945351313, + "grad_norm": 0.07494665682315826, + "learning_rate": 9.188289567068844e-05, + "loss": 0.0361156702041626, + "step": 57210 + }, + { + "epoch": 8.12207239176721, + "grad_norm": 0.010714360512793064, + "learning_rate": 9.188147622427254e-05, + "loss": 0.08930673599243164, + "step": 57220 + }, + { + "epoch": 8.123491838183108, + "grad_norm": 1.554410696029663, + "learning_rate": 9.188005677785663e-05, + "loss": 0.04430621564388275, + "step": 57230 + }, + { + "epoch": 8.124911284599007, + "grad_norm": 1.4681209325790405, + "learning_rate": 9.187863733144075e-05, + "loss": 0.02614140808582306, + "step": 57240 + }, + { + "epoch": 8.126330731014905, + "grad_norm": 0.13754108548164368, + "learning_rate": 9.187721788502484e-05, + "loss": 0.017473718523979186, + "step": 57250 + }, + { + "epoch": 
8.127750177430801, + "grad_norm": 7.498713493347168, + "learning_rate": 9.187579843860895e-05, + "loss": 0.06623184084892272, + "step": 57260 + }, + { + "epoch": 8.1291696238467, + "grad_norm": 4.109618186950684, + "learning_rate": 9.187437899219304e-05, + "loss": 0.016468723118305207, + "step": 57270 + }, + { + "epoch": 8.130589070262598, + "grad_norm": 0.10974710434675217, + "learning_rate": 9.187295954577715e-05, + "loss": 0.03356311023235321, + "step": 57280 + }, + { + "epoch": 8.132008516678495, + "grad_norm": 0.2429163157939911, + "learning_rate": 9.187154009936125e-05, + "loss": 0.011898426711559296, + "step": 57290 + }, + { + "epoch": 8.133427963094393, + "grad_norm": 1.5502437353134155, + "learning_rate": 9.187012065294536e-05, + "loss": 0.03775279223918915, + "step": 57300 + }, + { + "epoch": 8.134847409510291, + "grad_norm": 1.5370547771453857, + "learning_rate": 9.186870120652947e-05, + "loss": 0.07396373748779297, + "step": 57310 + }, + { + "epoch": 8.13626685592619, + "grad_norm": 0.12712019681930542, + "learning_rate": 9.186728176011355e-05, + "loss": 0.032404530048370364, + "step": 57320 + }, + { + "epoch": 8.137686302342086, + "grad_norm": 3.8964903354644775, + "learning_rate": 9.186586231369766e-05, + "loss": 0.01291709989309311, + "step": 57330 + }, + { + "epoch": 8.139105748757984, + "grad_norm": 2.898106575012207, + "learning_rate": 9.186444286728176e-05, + "loss": 0.027319177985191345, + "step": 57340 + }, + { + "epoch": 8.140525195173883, + "grad_norm": 6.190265655517578, + "learning_rate": 9.186302342086587e-05, + "loss": 0.022334299981594086, + "step": 57350 + }, + { + "epoch": 8.14194464158978, + "grad_norm": 6.5095038414001465, + "learning_rate": 9.186160397444997e-05, + "loss": 0.02422604411840439, + "step": 57360 + }, + { + "epoch": 8.143364088005677, + "grad_norm": 0.18886369466781616, + "learning_rate": 9.186018452803407e-05, + "loss": 0.08141739964485169, + "step": 57370 + }, + { + "epoch": 8.144783534421576, + "grad_norm": 
0.38172465562820435, + "learning_rate": 9.185876508161816e-05, + "loss": 0.027005189657211305, + "step": 57380 + }, + { + "epoch": 8.146202980837474, + "grad_norm": 3.8629918098449707, + "learning_rate": 9.185734563520227e-05, + "loss": 0.015311364829540253, + "step": 57390 + }, + { + "epoch": 8.14762242725337, + "grad_norm": 0.2354840785264969, + "learning_rate": 9.185592618878639e-05, + "loss": 0.06443232297897339, + "step": 57400 + }, + { + "epoch": 8.149041873669269, + "grad_norm": 8.87781047821045, + "learning_rate": 9.185450674237048e-05, + "loss": 0.02635810077190399, + "step": 57410 + }, + { + "epoch": 8.150461320085167, + "grad_norm": 0.2784138023853302, + "learning_rate": 9.18530872959546e-05, + "loss": 0.038064810633659366, + "step": 57420 + }, + { + "epoch": 8.151880766501064, + "grad_norm": 1.3603695631027222, + "learning_rate": 9.185166784953868e-05, + "loss": 0.009778007864952087, + "step": 57430 + }, + { + "epoch": 8.153300212916962, + "grad_norm": 1.439544916152954, + "learning_rate": 9.185024840312279e-05, + "loss": 0.03558608889579773, + "step": 57440 + }, + { + "epoch": 8.15471965933286, + "grad_norm": 0.5217808485031128, + "learning_rate": 9.184882895670689e-05, + "loss": 0.017676195502281188, + "step": 57450 + }, + { + "epoch": 8.156139105748759, + "grad_norm": 0.015781676396727562, + "learning_rate": 9.1847409510291e-05, + "loss": 0.013397561013698578, + "step": 57460 + }, + { + "epoch": 8.157558552164655, + "grad_norm": 0.17529694736003876, + "learning_rate": 9.18459900638751e-05, + "loss": 0.02492748200893402, + "step": 57470 + }, + { + "epoch": 8.158977998580554, + "grad_norm": 1.654995083808899, + "learning_rate": 9.184457061745919e-05, + "loss": 0.01247488558292389, + "step": 57480 + }, + { + "epoch": 8.160397444996452, + "grad_norm": 2.895176410675049, + "learning_rate": 9.18431511710433e-05, + "loss": 0.04395711421966553, + "step": 57490 + }, + { + "epoch": 8.161816891412348, + "grad_norm": 0.12417486310005188, + "learning_rate": 
9.18417317246274e-05, + "loss": 0.018846186995506286, + "step": 57500 + }, + { + "epoch": 8.161816891412348, + "eval_accuracy": 0.9755833916195078, + "eval_loss": 0.08238392323255539, + "eval_runtime": 32.0699, + "eval_samples_per_second": 490.398, + "eval_steps_per_second": 15.342, + "step": 57500 + }, + { + "epoch": 8.163236337828247, + "grad_norm": 4.528477668762207, + "learning_rate": 9.184031227821151e-05, + "loss": 0.0625986099243164, + "step": 57510 + }, + { + "epoch": 8.164655784244145, + "grad_norm": 1.188218355178833, + "learning_rate": 9.183889283179561e-05, + "loss": 0.024391371011734008, + "step": 57520 + }, + { + "epoch": 8.166075230660043, + "grad_norm": 0.12762023508548737, + "learning_rate": 9.18374733853797e-05, + "loss": 0.058549624681472776, + "step": 57530 + }, + { + "epoch": 8.16749467707594, + "grad_norm": 7.117990016937256, + "learning_rate": 9.18360539389638e-05, + "loss": 0.05282506942749023, + "step": 57540 + }, + { + "epoch": 8.168914123491838, + "grad_norm": 1.1791610717773438, + "learning_rate": 9.183463449254791e-05, + "loss": 0.016684700548648835, + "step": 57550 + }, + { + "epoch": 8.170333569907736, + "grad_norm": 0.29182952642440796, + "learning_rate": 9.183321504613201e-05, + "loss": 0.03156082630157471, + "step": 57560 + }, + { + "epoch": 8.171753016323633, + "grad_norm": 0.5603981018066406, + "learning_rate": 9.183179559971612e-05, + "loss": 0.045778483152389526, + "step": 57570 + }, + { + "epoch": 8.173172462739531, + "grad_norm": 10.272153854370117, + "learning_rate": 9.183037615330022e-05, + "loss": 0.0619217574596405, + "step": 57580 + }, + { + "epoch": 8.17459190915543, + "grad_norm": 0.1799510270357132, + "learning_rate": 9.182895670688432e-05, + "loss": 0.03966604471206665, + "step": 57590 + }, + { + "epoch": 8.176011355571328, + "grad_norm": 0.05715738981962204, + "learning_rate": 9.182753726046843e-05, + "loss": 0.02208338528871536, + "step": 57600 + }, + { + "epoch": 8.177430801987224, + "grad_norm": 
1.5309598445892334, + "learning_rate": 9.182611781405252e-05, + "loss": 0.00791141539812088, + "step": 57610 + }, + { + "epoch": 8.178850248403123, + "grad_norm": 4.317909240722656, + "learning_rate": 9.182469836763664e-05, + "loss": 0.01654169410467148, + "step": 57620 + }, + { + "epoch": 8.180269694819021, + "grad_norm": 3.71759033203125, + "learning_rate": 9.182327892122072e-05, + "loss": 0.0275895893573761, + "step": 57630 + }, + { + "epoch": 8.181689141234918, + "grad_norm": 6.036670684814453, + "learning_rate": 9.182185947480483e-05, + "loss": 0.05108698606491089, + "step": 57640 + }, + { + "epoch": 8.183108587650816, + "grad_norm": 3.292255163192749, + "learning_rate": 9.182044002838893e-05, + "loss": 0.023200985789299012, + "step": 57650 + }, + { + "epoch": 8.184528034066714, + "grad_norm": 1.7355892658233643, + "learning_rate": 9.181902058197304e-05, + "loss": 0.046505868434906006, + "step": 57660 + }, + { + "epoch": 8.185947480482612, + "grad_norm": 2.1724367141723633, + "learning_rate": 9.181760113555714e-05, + "loss": 0.03132735192775726, + "step": 57670 + }, + { + "epoch": 8.187366926898509, + "grad_norm": 8.026420593261719, + "learning_rate": 9.181618168914123e-05, + "loss": 0.04148833453655243, + "step": 57680 + }, + { + "epoch": 8.188786373314407, + "grad_norm": 4.966795921325684, + "learning_rate": 9.181476224272534e-05, + "loss": 0.05792571902275086, + "step": 57690 + }, + { + "epoch": 8.190205819730306, + "grad_norm": 0.702034056186676, + "learning_rate": 9.181334279630944e-05, + "loss": 0.021608872711658476, + "step": 57700 + }, + { + "epoch": 8.191625266146202, + "grad_norm": 0.0565681979060173, + "learning_rate": 9.181192334989355e-05, + "loss": 0.009969682991504669, + "step": 57710 + }, + { + "epoch": 8.1930447125621, + "grad_norm": 0.2924593985080719, + "learning_rate": 9.181050390347765e-05, + "loss": 0.04140026867389679, + "step": 57720 + }, + { + "epoch": 8.194464158977999, + "grad_norm": 7.954497337341309, + "learning_rate": 
9.180908445706175e-05, + "loss": 0.06450521945953369, + "step": 57730 + }, + { + "epoch": 8.195883605393897, + "grad_norm": 0.15417218208312988, + "learning_rate": 9.180766501064584e-05, + "loss": 0.02865954041481018, + "step": 57740 + }, + { + "epoch": 8.197303051809794, + "grad_norm": 1.5217573642730713, + "learning_rate": 9.180624556422996e-05, + "loss": 0.05589728355407715, + "step": 57750 + }, + { + "epoch": 8.198722498225692, + "grad_norm": 11.092070579528809, + "learning_rate": 9.180482611781405e-05, + "loss": 0.06653887033462524, + "step": 57760 + }, + { + "epoch": 8.20014194464159, + "grad_norm": 1.9384713172912598, + "learning_rate": 9.180340667139816e-05, + "loss": 0.015503853559494019, + "step": 57770 + }, + { + "epoch": 8.201561391057487, + "grad_norm": 3.561843156814575, + "learning_rate": 9.180198722498226e-05, + "loss": 0.06690990924835205, + "step": 57780 + }, + { + "epoch": 8.202980837473385, + "grad_norm": 0.6527613997459412, + "learning_rate": 9.180056777856636e-05, + "loss": 0.02870522141456604, + "step": 57790 + }, + { + "epoch": 8.204400283889283, + "grad_norm": 0.692225992679596, + "learning_rate": 9.179914833215047e-05, + "loss": 0.047651296854019164, + "step": 57800 + }, + { + "epoch": 8.205819730305182, + "grad_norm": 0.11748456209897995, + "learning_rate": 9.179772888573457e-05, + "loss": 0.026576727628707886, + "step": 57810 + }, + { + "epoch": 8.207239176721078, + "grad_norm": 5.99944543838501, + "learning_rate": 9.179630943931868e-05, + "loss": 0.061805450916290285, + "step": 57820 + }, + { + "epoch": 8.208658623136976, + "grad_norm": 3.832113265991211, + "learning_rate": 9.179488999290278e-05, + "loss": 0.01249971017241478, + "step": 57830 + }, + { + "epoch": 8.210078069552875, + "grad_norm": 0.19495727121829987, + "learning_rate": 9.179347054648687e-05, + "loss": 0.05017414689064026, + "step": 57840 + }, + { + "epoch": 8.211497515968771, + "grad_norm": 0.5911961197853088, + "learning_rate": 9.179205110007097e-05, + "loss": 
0.025897520780563354, + "step": 57850 + }, + { + "epoch": 8.21291696238467, + "grad_norm": 5.365500450134277, + "learning_rate": 9.179063165365508e-05, + "loss": 0.037281885743141174, + "step": 57860 + }, + { + "epoch": 8.214336408800568, + "grad_norm": 1.0354958772659302, + "learning_rate": 9.178921220723918e-05, + "loss": 0.028408104181289674, + "step": 57870 + }, + { + "epoch": 8.215755855216466, + "grad_norm": 6.38060188293457, + "learning_rate": 9.178779276082329e-05, + "loss": 0.019732609391212463, + "step": 57880 + }, + { + "epoch": 8.217175301632363, + "grad_norm": 0.3738914728164673, + "learning_rate": 9.178637331440739e-05, + "loss": 0.020280544459819794, + "step": 57890 + }, + { + "epoch": 8.218594748048261, + "grad_norm": 7.738147258758545, + "learning_rate": 9.178495386799148e-05, + "loss": 0.06968256831169128, + "step": 57900 + }, + { + "epoch": 8.22001419446416, + "grad_norm": 3.8040566444396973, + "learning_rate": 9.17835344215756e-05, + "loss": 0.05860614776611328, + "step": 57910 + }, + { + "epoch": 8.221433640880056, + "grad_norm": 0.19749833643436432, + "learning_rate": 9.178211497515969e-05, + "loss": 0.05739356875419617, + "step": 57920 + }, + { + "epoch": 8.222853087295954, + "grad_norm": 0.8916294574737549, + "learning_rate": 9.17806955287438e-05, + "loss": 0.027829304337501526, + "step": 57930 + }, + { + "epoch": 8.224272533711853, + "grad_norm": 2.294523239135742, + "learning_rate": 9.177927608232789e-05, + "loss": 0.024026399850845336, + "step": 57940 + }, + { + "epoch": 8.22569198012775, + "grad_norm": 1.959633469581604, + "learning_rate": 9.1777856635912e-05, + "loss": 0.057163572311401366, + "step": 57950 + }, + { + "epoch": 8.227111426543647, + "grad_norm": 5.235497951507568, + "learning_rate": 9.17764371894961e-05, + "loss": 0.11002181768417359, + "step": 57960 + }, + { + "epoch": 8.228530872959546, + "grad_norm": 5.246006965637207, + "learning_rate": 9.17750177430802e-05, + "loss": 0.036422187089920045, + "step": 57970 + }, + { + 
"epoch": 8.229950319375444, + "grad_norm": 3.0154199600219727, + "learning_rate": 9.17735982966643e-05, + "loss": 0.016849853098392487, + "step": 57980 + }, + { + "epoch": 8.231369765791342, + "grad_norm": 3.4750466346740723, + "learning_rate": 9.17721788502484e-05, + "loss": 0.03442394733428955, + "step": 57990 + }, + { + "epoch": 8.232789212207239, + "grad_norm": 8.968158721923828, + "learning_rate": 9.177075940383251e-05, + "loss": 0.043063384294509885, + "step": 58000 + }, + { + "epoch": 8.232789212207239, + "eval_accuracy": 0.9691613149360971, + "eval_loss": 0.10660364478826523, + "eval_runtime": 32.1198, + "eval_samples_per_second": 489.635, + "eval_steps_per_second": 15.318, + "step": 58000 + }, + { + "epoch": 8.234208658623137, + "grad_norm": 0.9778158068656921, + "learning_rate": 9.176933995741661e-05, + "loss": 0.030573081970214844, + "step": 58010 + }, + { + "epoch": 8.235628105039035, + "grad_norm": 0.6507449150085449, + "learning_rate": 9.176792051100072e-05, + "loss": 0.014274489879608155, + "step": 58020 + }, + { + "epoch": 8.237047551454932, + "grad_norm": 3.5025880336761475, + "learning_rate": 9.176650106458482e-05, + "loss": 0.020151573419570922, + "step": 58030 + }, + { + "epoch": 8.23846699787083, + "grad_norm": 1.5362058877944946, + "learning_rate": 9.176508161816892e-05, + "loss": 0.06194206476211548, + "step": 58040 + }, + { + "epoch": 8.239886444286729, + "grad_norm": 2.623915672302246, + "learning_rate": 9.176366217175301e-05, + "loss": 0.023121093213558198, + "step": 58050 + }, + { + "epoch": 8.241305890702627, + "grad_norm": 0.05486688017845154, + "learning_rate": 9.176224272533712e-05, + "loss": 0.02971988618373871, + "step": 58060 + }, + { + "epoch": 8.242725337118523, + "grad_norm": 0.1611616164445877, + "learning_rate": 9.176082327892122e-05, + "loss": 0.04156226217746735, + "step": 58070 + }, + { + "epoch": 8.244144783534422, + "grad_norm": 3.102126359939575, + "learning_rate": 9.175940383250533e-05, + "loss": 0.018605512380599976, + 
"step": 58080 + }, + { + "epoch": 8.24556422995032, + "grad_norm": 1.3434499502182007, + "learning_rate": 9.175812633073102e-05, + "loss": 0.04484846293926239, + "step": 58090 + }, + { + "epoch": 8.246983676366217, + "grad_norm": 0.9203706383705139, + "learning_rate": 9.175670688431513e-05, + "loss": 0.020824790000915527, + "step": 58100 + }, + { + "epoch": 8.248403122782115, + "grad_norm": 0.8287354111671448, + "learning_rate": 9.175528743789923e-05, + "loss": 0.03844572603702545, + "step": 58110 + }, + { + "epoch": 8.249822569198013, + "grad_norm": 11.324418067932129, + "learning_rate": 9.175386799148332e-05, + "loss": 0.04563019275665283, + "step": 58120 + }, + { + "epoch": 8.251242015613911, + "grad_norm": 0.2999362051486969, + "learning_rate": 9.175244854506742e-05, + "loss": 0.039099177718162535, + "step": 58130 + }, + { + "epoch": 8.252661462029808, + "grad_norm": 1.6591378450393677, + "learning_rate": 9.175102909865153e-05, + "loss": 0.03310187757015228, + "step": 58140 + }, + { + "epoch": 8.254080908445706, + "grad_norm": 6.494851112365723, + "learning_rate": 9.174960965223564e-05, + "loss": 0.061228638887405394, + "step": 58150 + }, + { + "epoch": 8.255500354861605, + "grad_norm": 1.309449315071106, + "learning_rate": 9.174819020581974e-05, + "loss": 0.015565997362136841, + "step": 58160 + }, + { + "epoch": 8.256919801277501, + "grad_norm": 0.35763129591941833, + "learning_rate": 9.174677075940384e-05, + "loss": 0.07266973853111267, + "step": 58170 + }, + { + "epoch": 8.2583392476934, + "grad_norm": 2.6130881309509277, + "learning_rate": 9.174535131298793e-05, + "loss": 0.03264107704162598, + "step": 58180 + }, + { + "epoch": 8.259758694109298, + "grad_norm": 0.6050202250480652, + "learning_rate": 9.174393186657204e-05, + "loss": 0.015243317186832427, + "step": 58190 + }, + { + "epoch": 8.261178140525196, + "grad_norm": 1.1150106191635132, + "learning_rate": 9.174251242015614e-05, + "loss": 0.05223976969718933, + "step": 58200 + }, + { + "epoch": 
8.262597586941093, + "grad_norm": 6.839016437530518, + "learning_rate": 9.174109297374025e-05, + "loss": 0.0411196768283844, + "step": 58210 + }, + { + "epoch": 8.264017033356991, + "grad_norm": 6.911043167114258, + "learning_rate": 9.173967352732434e-05, + "loss": 0.04330936968326569, + "step": 58220 + }, + { + "epoch": 8.26543647977289, + "grad_norm": 1.0361573696136475, + "learning_rate": 9.173825408090845e-05, + "loss": 0.025824397802352905, + "step": 58230 + }, + { + "epoch": 8.266855926188786, + "grad_norm": 12.14588451385498, + "learning_rate": 9.173683463449256e-05, + "loss": 0.02549113631248474, + "step": 58240 + }, + { + "epoch": 8.268275372604684, + "grad_norm": 0.05422484129667282, + "learning_rate": 9.173541518807666e-05, + "loss": 0.032814472913742065, + "step": 58250 + }, + { + "epoch": 8.269694819020582, + "grad_norm": 3.4622371196746826, + "learning_rate": 9.173399574166077e-05, + "loss": 0.029658371210098268, + "step": 58260 + }, + { + "epoch": 8.27111426543648, + "grad_norm": 0.06342365592718124, + "learning_rate": 9.173257629524485e-05, + "loss": 0.02727014124393463, + "step": 58270 + }, + { + "epoch": 8.272533711852377, + "grad_norm": 7.434948921203613, + "learning_rate": 9.173115684882896e-05, + "loss": 0.04313863217830658, + "step": 58280 + }, + { + "epoch": 8.273953158268275, + "grad_norm": 2.478610038757324, + "learning_rate": 9.172973740241306e-05, + "loss": 0.021670131385326384, + "step": 58290 + }, + { + "epoch": 8.275372604684174, + "grad_norm": 6.441994667053223, + "learning_rate": 9.172831795599717e-05, + "loss": 0.04854116439819336, + "step": 58300 + }, + { + "epoch": 8.27679205110007, + "grad_norm": 4.104756832122803, + "learning_rate": 9.172689850958127e-05, + "loss": 0.03957696259021759, + "step": 58310 + }, + { + "epoch": 8.278211497515969, + "grad_norm": 0.4648134112358093, + "learning_rate": 9.172547906316536e-05, + "loss": 0.03666155338287354, + "step": 58320 + }, + { + "epoch": 8.279630943931867, + "grad_norm": 
5.658158779144287, + "learning_rate": 9.172405961674948e-05, + "loss": 0.05015560984611511, + "step": 58330 + }, + { + "epoch": 8.281050390347765, + "grad_norm": 0.17183201014995575, + "learning_rate": 9.172264017033357e-05, + "loss": 0.05419689416885376, + "step": 58340 + }, + { + "epoch": 8.282469836763662, + "grad_norm": 0.5585965514183044, + "learning_rate": 9.172122072391768e-05, + "loss": 0.019291809201240538, + "step": 58350 + }, + { + "epoch": 8.28388928317956, + "grad_norm": 0.5460109114646912, + "learning_rate": 9.171980127750178e-05, + "loss": 0.024260058999061584, + "step": 58360 + }, + { + "epoch": 8.285308729595458, + "grad_norm": 4.628096103668213, + "learning_rate": 9.171838183108588e-05, + "loss": 0.06682268977165222, + "step": 58370 + }, + { + "epoch": 8.286728176011355, + "grad_norm": 0.07002594321966171, + "learning_rate": 9.171696238466998e-05, + "loss": 0.06908130049705505, + "step": 58380 + }, + { + "epoch": 8.288147622427253, + "grad_norm": 5.662642478942871, + "learning_rate": 9.171554293825409e-05, + "loss": 0.0431951105594635, + "step": 58390 + }, + { + "epoch": 8.289567068843152, + "grad_norm": 3.9657466411590576, + "learning_rate": 9.171412349183818e-05, + "loss": 0.023533882200717927, + "step": 58400 + }, + { + "epoch": 8.29098651525905, + "grad_norm": 0.025615880265831947, + "learning_rate": 9.17127040454223e-05, + "loss": 0.01455162763595581, + "step": 58410 + }, + { + "epoch": 8.292405961674946, + "grad_norm": 5.503678321838379, + "learning_rate": 9.171128459900639e-05, + "loss": 0.0331308126449585, + "step": 58420 + }, + { + "epoch": 8.293825408090845, + "grad_norm": 4.88674259185791, + "learning_rate": 9.170986515259049e-05, + "loss": 0.028948378562927247, + "step": 58430 + }, + { + "epoch": 8.295244854506743, + "grad_norm": 0.2964005470275879, + "learning_rate": 9.17084457061746e-05, + "loss": 0.04497859477996826, + "step": 58440 + }, + { + "epoch": 8.29666430092264, + "grad_norm": 1.4039242267608643, + "learning_rate": 
9.17070262597587e-05, + "loss": 0.03003017008304596, + "step": 58450 + }, + { + "epoch": 8.298083747338538, + "grad_norm": 0.22310487926006317, + "learning_rate": 9.170560681334281e-05, + "loss": 0.034260991215705874, + "step": 58460 + }, + { + "epoch": 8.299503193754436, + "grad_norm": 3.774014472961426, + "learning_rate": 9.170418736692691e-05, + "loss": 0.05678040981292724, + "step": 58470 + }, + { + "epoch": 8.300922640170334, + "grad_norm": 4.474982261657715, + "learning_rate": 9.1702767920511e-05, + "loss": 0.05777202248573303, + "step": 58480 + }, + { + "epoch": 8.302342086586231, + "grad_norm": 0.0562448650598526, + "learning_rate": 9.17013484740951e-05, + "loss": 0.04792693853378296, + "step": 58490 + }, + { + "epoch": 8.30376153300213, + "grad_norm": 1.0092743635177612, + "learning_rate": 9.169992902767921e-05, + "loss": 0.033463281393051145, + "step": 58500 + }, + { + "epoch": 8.30376153300213, + "eval_accuracy": 0.9818782984676034, + "eval_loss": 0.056806765496730804, + "eval_runtime": 34.6905, + "eval_samples_per_second": 453.351, + "eval_steps_per_second": 14.183, + "step": 58500 + }, + { + "epoch": 8.305180979418028, + "grad_norm": 0.3739173412322998, + "learning_rate": 9.169850958126331e-05, + "loss": 0.04968686699867249, + "step": 58510 + }, + { + "epoch": 8.306600425833924, + "grad_norm": 4.6436767578125, + "learning_rate": 9.169709013484742e-05, + "loss": 0.05652905702590942, + "step": 58520 + }, + { + "epoch": 8.308019872249822, + "grad_norm": 1.215824842453003, + "learning_rate": 9.169567068843152e-05, + "loss": 0.08560553789138795, + "step": 58530 + }, + { + "epoch": 8.30943931866572, + "grad_norm": 2.3930976390838623, + "learning_rate": 9.169425124201562e-05, + "loss": 0.047614786028862, + "step": 58540 + }, + { + "epoch": 8.310858765081619, + "grad_norm": 0.21128606796264648, + "learning_rate": 9.169283179559973e-05, + "loss": 0.041520559787750246, + "step": 58550 + }, + { + "epoch": 8.312278211497516, + "grad_norm": 0.11912447959184647, + 
"learning_rate": 9.169141234918382e-05, + "loss": 0.04494628310203552, + "step": 58560 + }, + { + "epoch": 8.313697657913414, + "grad_norm": 3.504589557647705, + "learning_rate": 9.168999290276793e-05, + "loss": 0.05546298623085022, + "step": 58570 + }, + { + "epoch": 8.315117104329312, + "grad_norm": 0.06115560233592987, + "learning_rate": 9.168857345635202e-05, + "loss": 0.020033690333366393, + "step": 58580 + }, + { + "epoch": 8.316536550745209, + "grad_norm": 0.08107715845108032, + "learning_rate": 9.168715400993613e-05, + "loss": 0.025489938259124757, + "step": 58590 + }, + { + "epoch": 8.317955997161107, + "grad_norm": 2.1215507984161377, + "learning_rate": 9.168573456352023e-05, + "loss": 0.05170263051986694, + "step": 58600 + }, + { + "epoch": 8.319375443577005, + "grad_norm": 3.4608006477355957, + "learning_rate": 9.168431511710434e-05, + "loss": 0.029605063796043395, + "step": 58610 + }, + { + "epoch": 8.320794889992904, + "grad_norm": 0.22780439257621765, + "learning_rate": 9.168289567068844e-05, + "loss": 0.039026209712028505, + "step": 58620 + }, + { + "epoch": 8.3222143364088, + "grad_norm": 6.255256175994873, + "learning_rate": 9.168147622427253e-05, + "loss": 0.06340646147727966, + "step": 58630 + }, + { + "epoch": 8.323633782824698, + "grad_norm": 0.20648564398288727, + "learning_rate": 9.168005677785664e-05, + "loss": 0.04623824059963226, + "step": 58640 + }, + { + "epoch": 8.325053229240597, + "grad_norm": 2.573063373565674, + "learning_rate": 9.167863733144074e-05, + "loss": 0.019829289615154268, + "step": 58650 + }, + { + "epoch": 8.326472675656493, + "grad_norm": 1.750908374786377, + "learning_rate": 9.167721788502485e-05, + "loss": 0.050472980737686156, + "step": 58660 + }, + { + "epoch": 8.327892122072392, + "grad_norm": 0.2912693917751312, + "learning_rate": 9.167579843860895e-05, + "loss": 0.02320457398891449, + "step": 58670 + }, + { + "epoch": 8.32931156848829, + "grad_norm": 1.3368074893951416, + "learning_rate": 9.167437899219305e-05, 
+ "loss": 0.03441727757453918, + "step": 58680 + }, + { + "epoch": 8.330731014904188, + "grad_norm": 0.020405098795890808, + "learning_rate": 9.167295954577714e-05, + "loss": 0.030085077881813048, + "step": 58690 + }, + { + "epoch": 8.332150461320085, + "grad_norm": 2.079420804977417, + "learning_rate": 9.167154009936125e-05, + "loss": 0.03475523889064789, + "step": 58700 + }, + { + "epoch": 8.333569907735983, + "grad_norm": 6.957132816314697, + "learning_rate": 9.167012065294535e-05, + "loss": 0.04805854558944702, + "step": 58710 + }, + { + "epoch": 8.334989354151881, + "grad_norm": 0.8798670768737793, + "learning_rate": 9.166870120652946e-05, + "loss": 0.03421752452850342, + "step": 58720 + }, + { + "epoch": 8.336408800567778, + "grad_norm": 4.264864921569824, + "learning_rate": 9.166728176011356e-05, + "loss": 0.04698627889156341, + "step": 58730 + }, + { + "epoch": 8.337828246983676, + "grad_norm": 8.51839828491211, + "learning_rate": 9.166586231369766e-05, + "loss": 0.024400044977664948, + "step": 58740 + }, + { + "epoch": 8.339247693399575, + "grad_norm": 2.2272045612335205, + "learning_rate": 9.166444286728177e-05, + "loss": 0.0337568461894989, + "step": 58750 + }, + { + "epoch": 8.340667139815473, + "grad_norm": 0.3619248569011688, + "learning_rate": 9.166302342086587e-05, + "loss": 0.06187044978141785, + "step": 58760 + }, + { + "epoch": 8.34208658623137, + "grad_norm": 0.6603171229362488, + "learning_rate": 9.166160397444998e-05, + "loss": 0.02449636459350586, + "step": 58770 + }, + { + "epoch": 8.343506032647268, + "grad_norm": 2.9686007499694824, + "learning_rate": 9.166018452803407e-05, + "loss": 0.03370268642902374, + "step": 58780 + }, + { + "epoch": 8.344925479063166, + "grad_norm": 0.051271889358758926, + "learning_rate": 9.165876508161817e-05, + "loss": 0.01729312539100647, + "step": 58790 + }, + { + "epoch": 8.346344925479062, + "grad_norm": 0.945408046245575, + "learning_rate": 9.165734563520227e-05, + "loss": 0.01863696575164795, + "step": 
58800 + }, + { + "epoch": 8.34776437189496, + "grad_norm": 8.780373573303223, + "learning_rate": 9.165592618878638e-05, + "loss": 0.04798442721366882, + "step": 58810 + }, + { + "epoch": 8.349183818310859, + "grad_norm": 2.2021658420562744, + "learning_rate": 9.165450674237048e-05, + "loss": 0.04746388792991638, + "step": 58820 + }, + { + "epoch": 8.350603264726757, + "grad_norm": 2.684445381164551, + "learning_rate": 9.165308729595459e-05, + "loss": 0.045510712265968326, + "step": 58830 + }, + { + "epoch": 8.352022711142654, + "grad_norm": 0.4354016184806824, + "learning_rate": 9.165166784953869e-05, + "loss": 0.061435526609420775, + "step": 58840 + }, + { + "epoch": 8.353442157558552, + "grad_norm": 9.262724876403809, + "learning_rate": 9.165024840312278e-05, + "loss": 0.09312753081321716, + "step": 58850 + }, + { + "epoch": 8.35486160397445, + "grad_norm": 0.48700007796287537, + "learning_rate": 9.16488289567069e-05, + "loss": 0.049555063247680664, + "step": 58860 + }, + { + "epoch": 8.356281050390347, + "grad_norm": 0.06040867790579796, + "learning_rate": 9.164740951029099e-05, + "loss": 0.046083787083625795, + "step": 58870 + }, + { + "epoch": 8.357700496806245, + "grad_norm": 5.327953338623047, + "learning_rate": 9.16459900638751e-05, + "loss": 0.04298398494720459, + "step": 58880 + }, + { + "epoch": 8.359119943222144, + "grad_norm": 6.591610431671143, + "learning_rate": 9.164457061745919e-05, + "loss": 0.10864330530166626, + "step": 58890 + }, + { + "epoch": 8.360539389638042, + "grad_norm": 17.488178253173828, + "learning_rate": 9.16431511710433e-05, + "loss": 0.07181705236434936, + "step": 58900 + }, + { + "epoch": 8.361958836053939, + "grad_norm": 8.191031455993652, + "learning_rate": 9.16417317246274e-05, + "loss": 0.10081918239593506, + "step": 58910 + }, + { + "epoch": 8.363378282469837, + "grad_norm": 0.9254047274589539, + "learning_rate": 9.16403122782115e-05, + "loss": 0.03317170143127442, + "step": 58920 + }, + { + "epoch": 8.364797728885735, + 
"grad_norm": 15.292092323303223, + "learning_rate": 9.16388928317956e-05, + "loss": 0.11680049896240234, + "step": 58930 + }, + { + "epoch": 8.366217175301632, + "grad_norm": 3.000251531600952, + "learning_rate": 9.16374733853797e-05, + "loss": 0.050798237323760986, + "step": 58940 + }, + { + "epoch": 8.36763662171753, + "grad_norm": 3.4350600242614746, + "learning_rate": 9.163605393896381e-05, + "loss": 0.019013065099716186, + "step": 58950 + }, + { + "epoch": 8.369056068133428, + "grad_norm": 0.20841482281684875, + "learning_rate": 9.163463449254791e-05, + "loss": 0.06327688694000244, + "step": 58960 + }, + { + "epoch": 8.370475514549327, + "grad_norm": 10.657003402709961, + "learning_rate": 9.163321504613202e-05, + "loss": 0.05808635950088501, + "step": 58970 + }, + { + "epoch": 8.371894960965223, + "grad_norm": 2.521904706954956, + "learning_rate": 9.163179559971612e-05, + "loss": 0.03354381322860718, + "step": 58980 + }, + { + "epoch": 8.373314407381121, + "grad_norm": 9.845879554748535, + "learning_rate": 9.163037615330021e-05, + "loss": 0.05215628147125244, + "step": 58990 + }, + { + "epoch": 8.37473385379702, + "grad_norm": 6.998453617095947, + "learning_rate": 9.162895670688431e-05, + "loss": 0.04951879382133484, + "step": 59000 + }, + { + "epoch": 8.37473385379702, + "eval_accuracy": 0.977045844725631, + "eval_loss": 0.07441914826631546, + "eval_runtime": 33.0904, + "eval_samples_per_second": 475.274, + "eval_steps_per_second": 14.868, + "step": 59000 + }, + { + "epoch": 8.376153300212916, + "grad_norm": 4.48925256729126, + "learning_rate": 9.162753726046842e-05, + "loss": 0.03418838381767273, + "step": 59010 + }, + { + "epoch": 8.377572746628815, + "grad_norm": 0.2785710096359253, + "learning_rate": 9.162611781405252e-05, + "loss": 0.026520654559135437, + "step": 59020 + }, + { + "epoch": 8.378992193044713, + "grad_norm": 0.04143285006284714, + "learning_rate": 9.162469836763663e-05, + "loss": 0.03496352732181549, + "step": 59030 + }, + { + "epoch": 
8.380411639460611, + "grad_norm": 0.034225188195705414, + "learning_rate": 9.162327892122073e-05, + "loss": 0.020401456952095033, + "step": 59040 + }, + { + "epoch": 8.381831085876508, + "grad_norm": 0.8951627612113953, + "learning_rate": 9.162185947480483e-05, + "loss": 0.025759845972061157, + "step": 59050 + }, + { + "epoch": 8.383250532292406, + "grad_norm": 4.353384971618652, + "learning_rate": 9.162044002838894e-05, + "loss": 0.04796704351902008, + "step": 59060 + }, + { + "epoch": 8.384669978708304, + "grad_norm": 6.735307216644287, + "learning_rate": 9.161902058197303e-05, + "loss": 0.03449790477752686, + "step": 59070 + }, + { + "epoch": 8.3860894251242, + "grad_norm": 0.14655238389968872, + "learning_rate": 9.161760113555714e-05, + "loss": 0.010301701724529266, + "step": 59080 + }, + { + "epoch": 8.3875088715401, + "grad_norm": 10.556654930114746, + "learning_rate": 9.161618168914123e-05, + "loss": 0.05006436705589294, + "step": 59090 + }, + { + "epoch": 8.388928317955997, + "grad_norm": 0.158490389585495, + "learning_rate": 9.161476224272534e-05, + "loss": 0.04050736129283905, + "step": 59100 + }, + { + "epoch": 8.390347764371896, + "grad_norm": 0.09657344222068787, + "learning_rate": 9.161334279630944e-05, + "loss": 0.017902058362960816, + "step": 59110 + }, + { + "epoch": 8.391767210787792, + "grad_norm": 0.02483726106584072, + "learning_rate": 9.161192334989355e-05, + "loss": 0.017482933402061463, + "step": 59120 + }, + { + "epoch": 8.39318665720369, + "grad_norm": 7.812098026275635, + "learning_rate": 9.161050390347765e-05, + "loss": 0.018153285980224608, + "step": 59130 + }, + { + "epoch": 8.394606103619589, + "grad_norm": 0.03699856996536255, + "learning_rate": 9.160908445706176e-05, + "loss": 0.019261515140533446, + "step": 59140 + }, + { + "epoch": 8.396025550035485, + "grad_norm": 8.018878936767578, + "learning_rate": 9.160766501064585e-05, + "loss": 0.05388169288635254, + "step": 59150 + }, + { + "epoch": 8.397444996451384, + "grad_norm": 
0.09866965562105179, + "learning_rate": 9.160624556422995e-05, + "loss": 0.027665621042251586, + "step": 59160 + }, + { + "epoch": 8.398864442867282, + "grad_norm": 18.310009002685547, + "learning_rate": 9.160482611781406e-05, + "loss": 0.0914535403251648, + "step": 59170 + }, + { + "epoch": 8.40028388928318, + "grad_norm": 0.0723535567522049, + "learning_rate": 9.160340667139816e-05, + "loss": 0.029044130444526674, + "step": 59180 + }, + { + "epoch": 8.401703335699077, + "grad_norm": 0.02246226742863655, + "learning_rate": 9.160198722498227e-05, + "loss": 0.016526098549365997, + "step": 59190 + }, + { + "epoch": 8.403122782114975, + "grad_norm": 0.16880199313163757, + "learning_rate": 9.160056777856635e-05, + "loss": 0.014294581115245819, + "step": 59200 + }, + { + "epoch": 8.404542228530874, + "grad_norm": 4.8066864013671875, + "learning_rate": 9.159914833215047e-05, + "loss": 0.017830350995063783, + "step": 59210 + }, + { + "epoch": 8.40596167494677, + "grad_norm": 6.521173000335693, + "learning_rate": 9.159772888573456e-05, + "loss": 0.028379026055335998, + "step": 59220 + }, + { + "epoch": 8.407381121362668, + "grad_norm": 2.4245357513427734, + "learning_rate": 9.159630943931867e-05, + "loss": 0.023089191317558287, + "step": 59230 + }, + { + "epoch": 8.408800567778567, + "grad_norm": 5.615114688873291, + "learning_rate": 9.159488999290277e-05, + "loss": 0.0288492351770401, + "step": 59240 + }, + { + "epoch": 8.410220014194465, + "grad_norm": 6.010306358337402, + "learning_rate": 9.159347054648687e-05, + "loss": 0.043870294094085695, + "step": 59250 + }, + { + "epoch": 8.411639460610361, + "grad_norm": 0.6510958671569824, + "learning_rate": 9.159205110007098e-05, + "loss": 0.03329501152038574, + "step": 59260 + }, + { + "epoch": 8.41305890702626, + "grad_norm": 0.7864037752151489, + "learning_rate": 9.159063165365508e-05, + "loss": 0.04816370904445648, + "step": 59270 + }, + { + "epoch": 8.414478353442158, + "grad_norm": 6.080152988433838, + "learning_rate": 
9.158921220723919e-05, + "loss": 0.04071834087371826, + "step": 59280 + }, + { + "epoch": 8.415897799858055, + "grad_norm": 0.08487723022699356, + "learning_rate": 9.158779276082328e-05, + "loss": 0.021995453536510466, + "step": 59290 + }, + { + "epoch": 8.417317246273953, + "grad_norm": 1.2968878746032715, + "learning_rate": 9.158637331440738e-05, + "loss": 0.03177845478057861, + "step": 59300 + }, + { + "epoch": 8.418736692689851, + "grad_norm": 4.66085147857666, + "learning_rate": 9.158495386799148e-05, + "loss": 0.02630176246166229, + "step": 59310 + }, + { + "epoch": 8.42015613910575, + "grad_norm": 11.192215919494629, + "learning_rate": 9.158353442157559e-05, + "loss": 0.05713456869125366, + "step": 59320 + }, + { + "epoch": 8.421575585521646, + "grad_norm": 6.2787628173828125, + "learning_rate": 9.158211497515969e-05, + "loss": 0.056651723384857175, + "step": 59330 + }, + { + "epoch": 8.422995031937544, + "grad_norm": 1.7867172956466675, + "learning_rate": 9.15806955287438e-05, + "loss": 0.062083113193511966, + "step": 59340 + }, + { + "epoch": 8.424414478353443, + "grad_norm": 6.2273783683776855, + "learning_rate": 9.15792760823279e-05, + "loss": 0.1109097957611084, + "step": 59350 + }, + { + "epoch": 8.42583392476934, + "grad_norm": 0.06000044196844101, + "learning_rate": 9.1577856635912e-05, + "loss": 0.032784104347229004, + "step": 59360 + }, + { + "epoch": 8.427253371185238, + "grad_norm": 5.235750198364258, + "learning_rate": 9.15764371894961e-05, + "loss": 0.027906310558319092, + "step": 59370 + }, + { + "epoch": 8.428672817601136, + "grad_norm": 1.2695759534835815, + "learning_rate": 9.15750177430802e-05, + "loss": 0.01285722553730011, + "step": 59380 + }, + { + "epoch": 8.430092264017034, + "grad_norm": 2.5559439659118652, + "learning_rate": 9.157359829666431e-05, + "loss": 0.01039145290851593, + "step": 59390 + }, + { + "epoch": 8.43151171043293, + "grad_norm": 8.719844818115234, + "learning_rate": 9.15721788502484e-05, + "loss": 
0.06004953384399414, + "step": 59400 + }, + { + "epoch": 8.432931156848829, + "grad_norm": 0.029175806790590286, + "learning_rate": 9.157075940383251e-05, + "loss": 0.031366673111915586, + "step": 59410 + }, + { + "epoch": 8.434350603264727, + "grad_norm": 0.4012078046798706, + "learning_rate": 9.15693399574166e-05, + "loss": 0.029990941286087036, + "step": 59420 + }, + { + "epoch": 8.435770049680624, + "grad_norm": 3.6554975509643555, + "learning_rate": 9.156792051100072e-05, + "loss": 0.011965757608413697, + "step": 59430 + }, + { + "epoch": 8.437189496096522, + "grad_norm": 1.1532933712005615, + "learning_rate": 9.156650106458481e-05, + "loss": 0.03454259634017944, + "step": 59440 + }, + { + "epoch": 8.43860894251242, + "grad_norm": 5.955157279968262, + "learning_rate": 9.156508161816891e-05, + "loss": 0.011077064275741576, + "step": 59450 + }, + { + "epoch": 8.440028388928319, + "grad_norm": 0.1530897617340088, + "learning_rate": 9.156366217175302e-05, + "loss": 0.026709139347076416, + "step": 59460 + }, + { + "epoch": 8.441447835344215, + "grad_norm": 1.0983542203903198, + "learning_rate": 9.156224272533712e-05, + "loss": 0.023664931952953338, + "step": 59470 + }, + { + "epoch": 8.442867281760114, + "grad_norm": 0.36292317509651184, + "learning_rate": 9.156082327892123e-05, + "loss": 0.048028239607810976, + "step": 59480 + }, + { + "epoch": 8.444286728176012, + "grad_norm": 0.14995090663433075, + "learning_rate": 9.155940383250533e-05, + "loss": 0.015602460503578186, + "step": 59490 + }, + { + "epoch": 8.445706174591908, + "grad_norm": 0.9158965945243835, + "learning_rate": 9.155798438608944e-05, + "loss": 0.026582181453704834, + "step": 59500 + }, + { + "epoch": 8.445706174591908, + "eval_accuracy": 0.9734850893368093, + "eval_loss": 0.08671265840530396, + "eval_runtime": 33.331, + "eval_samples_per_second": 471.843, + "eval_steps_per_second": 14.761, + "step": 59500 + }, + { + "epoch": 8.447125621007807, + "grad_norm": 0.5541223287582397, + "learning_rate": 
9.155656493967352e-05, + "loss": 0.040118956565856935, + "step": 59510 + }, + { + "epoch": 8.448545067423705, + "grad_norm": 1.745306372642517, + "learning_rate": 9.155514549325763e-05, + "loss": 0.024646060168743135, + "step": 59520 + }, + { + "epoch": 8.449964513839603, + "grad_norm": 0.9366075992584229, + "learning_rate": 9.155372604684173e-05, + "loss": 0.09292943477630615, + "step": 59530 + }, + { + "epoch": 8.4513839602555, + "grad_norm": 0.06262222677469254, + "learning_rate": 9.155230660042584e-05, + "loss": 0.04963191449642181, + "step": 59540 + }, + { + "epoch": 8.452803406671398, + "grad_norm": 0.08809320628643036, + "learning_rate": 9.155088715400995e-05, + "loss": 0.017690953612327576, + "step": 59550 + }, + { + "epoch": 8.454222853087296, + "grad_norm": 1.2314077615737915, + "learning_rate": 9.154946770759404e-05, + "loss": 0.0151987686753273, + "step": 59560 + }, + { + "epoch": 8.455642299503193, + "grad_norm": 0.8722369074821472, + "learning_rate": 9.154804826117815e-05, + "loss": 0.0057766992598772045, + "step": 59570 + }, + { + "epoch": 8.457061745919091, + "grad_norm": 0.21585650742053986, + "learning_rate": 9.154662881476224e-05, + "loss": 0.031658861041069034, + "step": 59580 + }, + { + "epoch": 8.45848119233499, + "grad_norm": 7.521589756011963, + "learning_rate": 9.154520936834636e-05, + "loss": 0.03252851366996765, + "step": 59590 + }, + { + "epoch": 8.459900638750888, + "grad_norm": 10.103790283203125, + "learning_rate": 9.154378992193045e-05, + "loss": 0.029417049884796143, + "step": 59600 + }, + { + "epoch": 8.461320085166784, + "grad_norm": 11.28209400177002, + "learning_rate": 9.154237047551455e-05, + "loss": 0.0774505078792572, + "step": 59610 + }, + { + "epoch": 8.462739531582683, + "grad_norm": 13.403277397155762, + "learning_rate": 9.154095102909865e-05, + "loss": 0.10143941640853882, + "step": 59620 + }, + { + "epoch": 8.464158977998581, + "grad_norm": 0.5809075832366943, + "learning_rate": 9.153953158268276e-05, + "loss": 
0.07998875379562378, + "step": 59630 + }, + { + "epoch": 8.465578424414478, + "grad_norm": 4.999480724334717, + "learning_rate": 9.153811213626687e-05, + "loss": 0.03397766649723053, + "step": 59640 + }, + { + "epoch": 8.466997870830376, + "grad_norm": 0.23321618139743805, + "learning_rate": 9.153669268985097e-05, + "loss": 0.034213504195213316, + "step": 59650 + }, + { + "epoch": 8.468417317246274, + "grad_norm": 5.279557228088379, + "learning_rate": 9.153527324343506e-05, + "loss": 0.016608065366744994, + "step": 59660 + }, + { + "epoch": 8.469836763662173, + "grad_norm": 3.1837151050567627, + "learning_rate": 9.153385379701916e-05, + "loss": 0.011586660146713256, + "step": 59670 + }, + { + "epoch": 8.471256210078069, + "grad_norm": 0.05866161733865738, + "learning_rate": 9.153243435060327e-05, + "loss": 0.03473564088344574, + "step": 59680 + }, + { + "epoch": 8.472675656493967, + "grad_norm": 1.3842390775680542, + "learning_rate": 9.153101490418737e-05, + "loss": 0.009509885311126709, + "step": 59690 + }, + { + "epoch": 8.474095102909866, + "grad_norm": 0.1759142428636551, + "learning_rate": 9.152959545777148e-05, + "loss": 0.009905293583869934, + "step": 59700 + }, + { + "epoch": 8.475514549325762, + "grad_norm": 0.848075270652771, + "learning_rate": 9.152817601135556e-05, + "loss": 0.0336533784866333, + "step": 59710 + }, + { + "epoch": 8.47693399574166, + "grad_norm": 0.3343120813369751, + "learning_rate": 9.152675656493968e-05, + "loss": 0.017269288003444672, + "step": 59720 + }, + { + "epoch": 8.478353442157559, + "grad_norm": 6.539244651794434, + "learning_rate": 9.152533711852379e-05, + "loss": 0.04893214702606201, + "step": 59730 + }, + { + "epoch": 8.479772888573457, + "grad_norm": 7.863105773925781, + "learning_rate": 9.152391767210788e-05, + "loss": 0.03542499840259552, + "step": 59740 + }, + { + "epoch": 8.481192334989354, + "grad_norm": 1.095321774482727, + "learning_rate": 9.1522498225692e-05, + "loss": 0.019741693139076234, + "step": 59750 + }, + 
{ + "epoch": 8.482611781405252, + "grad_norm": 0.1375374048948288, + "learning_rate": 9.152107877927608e-05, + "loss": 0.026455044746398926, + "step": 59760 + }, + { + "epoch": 8.48403122782115, + "grad_norm": 5.922633647918701, + "learning_rate": 9.151965933286019e-05, + "loss": 0.06508615016937255, + "step": 59770 + }, + { + "epoch": 8.485450674237047, + "grad_norm": 0.24430686235427856, + "learning_rate": 9.151823988644429e-05, + "loss": 0.06395163536071777, + "step": 59780 + }, + { + "epoch": 8.486870120652945, + "grad_norm": 3.22963285446167, + "learning_rate": 9.15168204400284e-05, + "loss": 0.046134963631629944, + "step": 59790 + }, + { + "epoch": 8.488289567068843, + "grad_norm": 0.17032699286937714, + "learning_rate": 9.15154009936125e-05, + "loss": 0.011191642284393311, + "step": 59800 + }, + { + "epoch": 8.489709013484742, + "grad_norm": 2.9801077842712402, + "learning_rate": 9.151398154719659e-05, + "loss": 0.03180201649665833, + "step": 59810 + }, + { + "epoch": 8.491128459900638, + "grad_norm": 4.111161708831787, + "learning_rate": 9.15125621007807e-05, + "loss": 0.0533446192741394, + "step": 59820 + }, + { + "epoch": 8.492547906316537, + "grad_norm": 4.968252182006836, + "learning_rate": 9.15111426543648e-05, + "loss": 0.04567363262176514, + "step": 59830 + }, + { + "epoch": 8.493967352732435, + "grad_norm": 7.942319393157959, + "learning_rate": 9.150972320794891e-05, + "loss": 0.0842665195465088, + "step": 59840 + }, + { + "epoch": 8.495386799148331, + "grad_norm": 7.745817184448242, + "learning_rate": 9.150830376153301e-05, + "loss": 0.05556324124336243, + "step": 59850 + }, + { + "epoch": 8.49680624556423, + "grad_norm": 1.72450590133667, + "learning_rate": 9.150688431511712e-05, + "loss": 0.028340262174606324, + "step": 59860 + }, + { + "epoch": 8.498225691980128, + "grad_norm": 5.714693546295166, + "learning_rate": 9.15054648687012e-05, + "loss": 0.06224752068519592, + "step": 59870 + }, + { + "epoch": 8.499645138396026, + "grad_norm": 
7.615067958831787, + "learning_rate": 9.150404542228531e-05, + "loss": 0.08627032041549683, + "step": 59880 + }, + { + "epoch": 8.501064584811923, + "grad_norm": 0.1859230101108551, + "learning_rate": 9.150262597586941e-05, + "loss": 0.01632542759180069, + "step": 59890 + }, + { + "epoch": 8.502484031227821, + "grad_norm": 1.340531826019287, + "learning_rate": 9.150120652945352e-05, + "loss": 0.028228405117988586, + "step": 59900 + }, + { + "epoch": 8.50390347764372, + "grad_norm": 8.235560417175293, + "learning_rate": 9.149978708303762e-05, + "loss": 0.04519372284412384, + "step": 59910 + }, + { + "epoch": 8.505322924059616, + "grad_norm": 9.629444122314453, + "learning_rate": 9.149836763662172e-05, + "loss": 0.02357942909002304, + "step": 59920 + }, + { + "epoch": 8.506742370475514, + "grad_norm": 0.11500538885593414, + "learning_rate": 9.149694819020583e-05, + "loss": 0.004986101761460304, + "step": 59930 + }, + { + "epoch": 8.508161816891413, + "grad_norm": 0.19352419674396515, + "learning_rate": 9.149552874378993e-05, + "loss": 0.028413400053977966, + "step": 59940 + }, + { + "epoch": 8.509581263307311, + "grad_norm": 2.6463675498962402, + "learning_rate": 9.149410929737404e-05, + "loss": 0.05246865153312683, + "step": 59950 + }, + { + "epoch": 8.511000709723207, + "grad_norm": 0.2576519548892975, + "learning_rate": 9.149268985095813e-05, + "loss": 0.023058263957500456, + "step": 59960 + }, + { + "epoch": 8.512420156139106, + "grad_norm": 0.25054165720939636, + "learning_rate": 9.149127040454223e-05, + "loss": 0.045983174443244935, + "step": 59970 + }, + { + "epoch": 8.513839602555004, + "grad_norm": 3.5932352542877197, + "learning_rate": 9.148985095812633e-05, + "loss": 0.05506667494773865, + "step": 59980 + }, + { + "epoch": 8.5152590489709, + "grad_norm": 0.24045352637767792, + "learning_rate": 9.148843151171044e-05, + "loss": 0.03867987096309662, + "step": 59990 + }, + { + "epoch": 8.516678495386799, + "grad_norm": 8.400899887084961, + "learning_rate": 
9.148701206529454e-05, + "loss": 0.08673510551452637, + "step": 60000 + }, + { + "epoch": 8.516678495386799, + "eval_accuracy": 0.9781267883258091, + "eval_loss": 0.07569287717342377, + "eval_runtime": 32.1624, + "eval_samples_per_second": 488.987, + "eval_steps_per_second": 15.297, + "step": 60000 + }, + { + "epoch": 8.518097941802697, + "grad_norm": 4.49797248840332, + "learning_rate": 9.148559261887865e-05, + "loss": 0.04673793017864227, + "step": 60010 + }, + { + "epoch": 8.519517388218595, + "grad_norm": 0.5943466424942017, + "learning_rate": 9.148417317246275e-05, + "loss": 0.019504909217357636, + "step": 60020 + }, + { + "epoch": 8.520936834634492, + "grad_norm": 0.06766325235366821, + "learning_rate": 9.148275372604684e-05, + "loss": 0.009393461048603058, + "step": 60030 + }, + { + "epoch": 8.52235628105039, + "grad_norm": 0.09707861393690109, + "learning_rate": 9.148133427963095e-05, + "loss": 0.044020998477935794, + "step": 60040 + }, + { + "epoch": 8.523775727466289, + "grad_norm": 10.096122741699219, + "learning_rate": 9.147991483321505e-05, + "loss": 0.06480343341827392, + "step": 60050 + }, + { + "epoch": 8.525195173882185, + "grad_norm": 0.09844963252544403, + "learning_rate": 9.147849538679916e-05, + "loss": 0.03513259589672089, + "step": 60060 + }, + { + "epoch": 8.526614620298083, + "grad_norm": 0.8048601150512695, + "learning_rate": 9.147707594038325e-05, + "loss": 0.07613663077354431, + "step": 60070 + }, + { + "epoch": 8.528034066713982, + "grad_norm": 0.13436748087406158, + "learning_rate": 9.147565649396736e-05, + "loss": 0.02750980854034424, + "step": 60080 + }, + { + "epoch": 8.52945351312988, + "grad_norm": 1.0650267601013184, + "learning_rate": 9.147423704755145e-05, + "loss": 0.04318079948425293, + "step": 60090 + }, + { + "epoch": 8.530872959545777, + "grad_norm": 0.5778047442436218, + "learning_rate": 9.147281760113557e-05, + "loss": 0.04353642165660858, + "step": 60100 + }, + { + "epoch": 8.532292405961675, + "grad_norm": 
0.41429048776626587, + "learning_rate": 9.147139815471966e-05, + "loss": 0.02352541536092758, + "step": 60110 + }, + { + "epoch": 8.533711852377573, + "grad_norm": 1.7214840650558472, + "learning_rate": 9.146997870830376e-05, + "loss": 0.02559060454368591, + "step": 60120 + }, + { + "epoch": 8.53513129879347, + "grad_norm": 0.04450737684965134, + "learning_rate": 9.146870120652946e-05, + "loss": 0.030207446217536925, + "step": 60130 + }, + { + "epoch": 8.536550745209368, + "grad_norm": 7.381211280822754, + "learning_rate": 9.146728176011357e-05, + "loss": 0.06880269050598145, + "step": 60140 + }, + { + "epoch": 8.537970191625266, + "grad_norm": 0.11311613768339157, + "learning_rate": 9.146586231369765e-05, + "loss": 0.017041406035423277, + "step": 60150 + }, + { + "epoch": 8.539389638041165, + "grad_norm": 0.23204351961612701, + "learning_rate": 9.146444286728176e-05, + "loss": 0.03590482473373413, + "step": 60160 + }, + { + "epoch": 8.540809084457061, + "grad_norm": 0.2040676772594452, + "learning_rate": 9.146302342086586e-05, + "loss": 0.025861364603042603, + "step": 60170 + }, + { + "epoch": 8.54222853087296, + "grad_norm": 0.7327109575271606, + "learning_rate": 9.146160397444997e-05, + "loss": 0.06635326743125916, + "step": 60180 + }, + { + "epoch": 8.543647977288858, + "grad_norm": 0.03353278711438179, + "learning_rate": 9.146018452803407e-05, + "loss": 0.03246139287948609, + "step": 60190 + }, + { + "epoch": 8.545067423704754, + "grad_norm": 3.9755334854125977, + "learning_rate": 9.145876508161817e-05, + "loss": 0.055750757455825806, + "step": 60200 + }, + { + "epoch": 8.546486870120653, + "grad_norm": 0.18508094549179077, + "learning_rate": 9.145734563520228e-05, + "loss": 0.042376190423965454, + "step": 60210 + }, + { + "epoch": 8.547906316536551, + "grad_norm": 0.05352622643113136, + "learning_rate": 9.145592618878638e-05, + "loss": 0.021348334848880768, + "step": 60220 + }, + { + "epoch": 8.54932576295245, + "grad_norm": 1.2180757522583008, + 
"learning_rate": 9.145450674237049e-05, + "loss": 0.021335867047309876, + "step": 60230 + }, + { + "epoch": 8.550745209368346, + "grad_norm": 3.486859083175659, + "learning_rate": 9.145308729595458e-05, + "loss": 0.03150171637535095, + "step": 60240 + }, + { + "epoch": 8.552164655784244, + "grad_norm": 9.03134822845459, + "learning_rate": 9.145166784953868e-05, + "loss": 0.05840170979499817, + "step": 60250 + }, + { + "epoch": 8.553584102200142, + "grad_norm": 2.2280516624450684, + "learning_rate": 9.145024840312278e-05, + "loss": 0.007564665377140045, + "step": 60260 + }, + { + "epoch": 8.555003548616039, + "grad_norm": 0.3790028691291809, + "learning_rate": 9.144882895670689e-05, + "loss": 0.026224061846733093, + "step": 60270 + }, + { + "epoch": 8.556422995031937, + "grad_norm": 3.474783182144165, + "learning_rate": 9.144740951029099e-05, + "loss": 0.015592548251152038, + "step": 60280 + }, + { + "epoch": 8.557842441447836, + "grad_norm": 7.350448131561279, + "learning_rate": 9.14459900638751e-05, + "loss": 0.05067678689956665, + "step": 60290 + }, + { + "epoch": 8.559261887863734, + "grad_norm": 7.781404972076416, + "learning_rate": 9.14445706174592e-05, + "loss": 0.06740889549255372, + "step": 60300 + }, + { + "epoch": 8.56068133427963, + "grad_norm": 0.024709496647119522, + "learning_rate": 9.144315117104329e-05, + "loss": 0.03684686422348023, + "step": 60310 + }, + { + "epoch": 8.562100780695529, + "grad_norm": 0.09711408615112305, + "learning_rate": 9.14417317246274e-05, + "loss": 0.016073787212371828, + "step": 60320 + }, + { + "epoch": 8.563520227111427, + "grad_norm": 1.8171788454055786, + "learning_rate": 9.14403122782115e-05, + "loss": 0.0676068663597107, + "step": 60330 + }, + { + "epoch": 8.564939673527324, + "grad_norm": 7.71488094329834, + "learning_rate": 9.143889283179561e-05, + "loss": 0.08123016357421875, + "step": 60340 + }, + { + "epoch": 8.566359119943222, + "grad_norm": 7.40566873550415, + "learning_rate": 9.14374733853797e-05, + "loss": 
0.03387263715267182, + "step": 60350 + }, + { + "epoch": 8.56777856635912, + "grad_norm": 0.3361523449420929, + "learning_rate": 9.14360539389638e-05, + "loss": 0.0076973557472229, + "step": 60360 + }, + { + "epoch": 8.569198012775018, + "grad_norm": 0.7388091683387756, + "learning_rate": 9.14346344925479e-05, + "loss": 0.023807825148105623, + "step": 60370 + }, + { + "epoch": 8.570617459190915, + "grad_norm": 9.611340522766113, + "learning_rate": 9.143321504613201e-05, + "loss": 0.016511398553848266, + "step": 60380 + }, + { + "epoch": 8.572036905606813, + "grad_norm": 0.3703271746635437, + "learning_rate": 9.143179559971613e-05, + "loss": 0.03306230902671814, + "step": 60390 + }, + { + "epoch": 8.573456352022712, + "grad_norm": 0.2659400999546051, + "learning_rate": 9.143037615330021e-05, + "loss": 0.02672841548919678, + "step": 60400 + }, + { + "epoch": 8.574875798438608, + "grad_norm": 0.2967086136341095, + "learning_rate": 9.142895670688432e-05, + "loss": 0.027674263715744017, + "step": 60410 + }, + { + "epoch": 8.576295244854506, + "grad_norm": 0.7257947325706482, + "learning_rate": 9.142753726046842e-05, + "loss": 0.04073569178581238, + "step": 60420 + }, + { + "epoch": 8.577714691270405, + "grad_norm": 0.048524580895900726, + "learning_rate": 9.142611781405253e-05, + "loss": 0.056384187936782834, + "step": 60430 + }, + { + "epoch": 8.579134137686303, + "grad_norm": 7.515527725219727, + "learning_rate": 9.142469836763663e-05, + "loss": 0.024915623664855956, + "step": 60440 + }, + { + "epoch": 8.5805535841022, + "grad_norm": 3.3118956089019775, + "learning_rate": 9.142327892122072e-05, + "loss": 0.03472829461097717, + "step": 60450 + }, + { + "epoch": 8.581973030518098, + "grad_norm": 4.994110584259033, + "learning_rate": 9.142185947480482e-05, + "loss": 0.04696405827999115, + "step": 60460 + }, + { + "epoch": 8.583392476933996, + "grad_norm": 6.274696350097656, + "learning_rate": 9.142044002838893e-05, + "loss": 0.05722887516021728, + "step": 60470 + }, + { 
+ "epoch": 8.584811923349893, + "grad_norm": 2.1877236366271973, + "learning_rate": 9.141902058197304e-05, + "loss": 0.07386709451675415, + "step": 60480 + }, + { + "epoch": 8.586231369765791, + "grad_norm": 3.222991704940796, + "learning_rate": 9.141760113555714e-05, + "loss": 0.06314558982849121, + "step": 60490 + }, + { + "epoch": 8.58765081618169, + "grad_norm": 0.050618402659893036, + "learning_rate": 9.141618168914125e-05, + "loss": 0.04251847565174103, + "step": 60500 + }, + { + "epoch": 8.58765081618169, + "eval_accuracy": 0.9722133909836587, + "eval_loss": 0.09201868623495102, + "eval_runtime": 33.2517, + "eval_samples_per_second": 472.969, + "eval_steps_per_second": 14.796, + "step": 60500 + }, + { + "epoch": 8.589070262597588, + "grad_norm": 6.501929759979248, + "learning_rate": 9.141476224272533e-05, + "loss": 0.025920677185058593, + "step": 60510 + }, + { + "epoch": 8.590489709013484, + "grad_norm": 1.283429741859436, + "learning_rate": 9.141334279630945e-05, + "loss": 0.04298240840435028, + "step": 60520 + }, + { + "epoch": 8.591909155429382, + "grad_norm": 0.33241111040115356, + "learning_rate": 9.141192334989354e-05, + "loss": 0.07493058443069459, + "step": 60530 + }, + { + "epoch": 8.59332860184528, + "grad_norm": 5.392263412475586, + "learning_rate": 9.141050390347765e-05, + "loss": 0.057287472486495974, + "step": 60540 + }, + { + "epoch": 8.594748048261177, + "grad_norm": 2.195725679397583, + "learning_rate": 9.140908445706175e-05, + "loss": 0.05433647036552429, + "step": 60550 + }, + { + "epoch": 8.596167494677076, + "grad_norm": 4.504883766174316, + "learning_rate": 9.140766501064585e-05, + "loss": 0.0373142272233963, + "step": 60560 + }, + { + "epoch": 8.597586941092974, + "grad_norm": 5.910608768463135, + "learning_rate": 9.140624556422996e-05, + "loss": 0.06574462652206421, + "step": 60570 + }, + { + "epoch": 8.599006387508872, + "grad_norm": 0.08602695912122726, + "learning_rate": 9.140482611781406e-05, + "loss": 0.1206291913986206, + 
"step": 60580 + }, + { + "epoch": 8.600425833924769, + "grad_norm": 4.0139875411987305, + "learning_rate": 9.140340667139817e-05, + "loss": 0.046517929434776305, + "step": 60590 + }, + { + "epoch": 8.601845280340667, + "grad_norm": 1.3188396692276, + "learning_rate": 9.140198722498227e-05, + "loss": 0.02969059944152832, + "step": 60600 + }, + { + "epoch": 8.603264726756565, + "grad_norm": 0.9926785826683044, + "learning_rate": 9.140056777856636e-05, + "loss": 0.03127261996269226, + "step": 60610 + }, + { + "epoch": 8.604684173172462, + "grad_norm": 2.0638065338134766, + "learning_rate": 9.139914833215046e-05, + "loss": 0.07006618976593018, + "step": 60620 + }, + { + "epoch": 8.60610361958836, + "grad_norm": 0.8679757118225098, + "learning_rate": 9.139772888573457e-05, + "loss": 0.022166214883327484, + "step": 60630 + }, + { + "epoch": 8.607523066004259, + "grad_norm": 0.7310284376144409, + "learning_rate": 9.139630943931867e-05, + "loss": 0.027963387966156005, + "step": 60640 + }, + { + "epoch": 8.608942512420157, + "grad_norm": 0.1053808331489563, + "learning_rate": 9.139488999290278e-05, + "loss": 0.0790288507938385, + "step": 60650 + }, + { + "epoch": 8.610361958836053, + "grad_norm": 3.352461338043213, + "learning_rate": 9.139347054648688e-05, + "loss": 0.05328698754310608, + "step": 60660 + }, + { + "epoch": 8.611781405251952, + "grad_norm": 0.1270398646593094, + "learning_rate": 9.139205110007097e-05, + "loss": 0.027239418029785155, + "step": 60670 + }, + { + "epoch": 8.61320085166785, + "grad_norm": 0.03895813971757889, + "learning_rate": 9.139063165365509e-05, + "loss": 0.02223038524389267, + "step": 60680 + }, + { + "epoch": 8.614620298083747, + "grad_norm": 0.4873507618904114, + "learning_rate": 9.138921220723918e-05, + "loss": 0.063723224401474, + "step": 60690 + }, + { + "epoch": 8.616039744499645, + "grad_norm": 0.9177292585372925, + "learning_rate": 9.13877927608233e-05, + "loss": 0.006845385581254959, + "step": 60700 + }, + { + "epoch": 
8.617459190915543, + "grad_norm": 0.0890699028968811, + "learning_rate": 9.138637331440738e-05, + "loss": 0.04476172029972077, + "step": 60710 + }, + { + "epoch": 8.618878637331441, + "grad_norm": 0.1363246887922287, + "learning_rate": 9.138495386799149e-05, + "loss": 0.06530548930168152, + "step": 60720 + }, + { + "epoch": 8.620298083747338, + "grad_norm": 4.468372821807861, + "learning_rate": 9.138353442157559e-05, + "loss": 0.05170977711677551, + "step": 60730 + }, + { + "epoch": 8.621717530163236, + "grad_norm": 12.12729549407959, + "learning_rate": 9.13821149751597e-05, + "loss": 0.06713943481445313, + "step": 60740 + }, + { + "epoch": 8.623136976579135, + "grad_norm": 0.13509313762187958, + "learning_rate": 9.13806955287438e-05, + "loss": 0.0637391209602356, + "step": 60750 + }, + { + "epoch": 8.624556422995031, + "grad_norm": 0.43068927526474, + "learning_rate": 9.137927608232789e-05, + "loss": 0.03880758285522461, + "step": 60760 + }, + { + "epoch": 8.62597586941093, + "grad_norm": 2.7275197505950928, + "learning_rate": 9.1377856635912e-05, + "loss": 0.02224656194448471, + "step": 60770 + }, + { + "epoch": 8.627395315826828, + "grad_norm": 1.675925850868225, + "learning_rate": 9.13764371894961e-05, + "loss": 0.01180570274591446, + "step": 60780 + }, + { + "epoch": 8.628814762242726, + "grad_norm": 0.25846660137176514, + "learning_rate": 9.137501774308021e-05, + "loss": 0.02627829313278198, + "step": 60790 + }, + { + "epoch": 8.630234208658623, + "grad_norm": 7.447933673858643, + "learning_rate": 9.137359829666431e-05, + "loss": 0.0631419837474823, + "step": 60800 + }, + { + "epoch": 8.63165365507452, + "grad_norm": 0.4778624176979065, + "learning_rate": 9.13721788502484e-05, + "loss": 0.05850786566734314, + "step": 60810 + }, + { + "epoch": 8.63307310149042, + "grad_norm": 0.9950453042984009, + "learning_rate": 9.13707594038325e-05, + "loss": 0.02292805016040802, + "step": 60820 + }, + { + "epoch": 8.634492547906316, + "grad_norm": 0.47801464796066284, + 
"learning_rate": 9.136933995741661e-05, + "loss": 0.01733073443174362, + "step": 60830 + }, + { + "epoch": 8.635911994322214, + "grad_norm": 11.81622314453125, + "learning_rate": 9.136792051100071e-05, + "loss": 0.03141593337059021, + "step": 60840 + }, + { + "epoch": 8.637331440738112, + "grad_norm": 0.011311270296573639, + "learning_rate": 9.136650106458482e-05, + "loss": 0.008127608895301819, + "step": 60850 + }, + { + "epoch": 8.63875088715401, + "grad_norm": 8.19806957244873, + "learning_rate": 9.136508161816892e-05, + "loss": 0.03081195056438446, + "step": 60860 + }, + { + "epoch": 8.640170333569907, + "grad_norm": 0.19872935116291046, + "learning_rate": 9.136366217175302e-05, + "loss": 0.03248392045497894, + "step": 60870 + }, + { + "epoch": 8.641589779985805, + "grad_norm": 4.823294162750244, + "learning_rate": 9.136224272533713e-05, + "loss": 0.029790616035461424, + "step": 60880 + }, + { + "epoch": 8.643009226401704, + "grad_norm": 1.5712690353393555, + "learning_rate": 9.136082327892122e-05, + "loss": 0.03219999372959137, + "step": 60890 + }, + { + "epoch": 8.6444286728176, + "grad_norm": 2.975715160369873, + "learning_rate": 9.135940383250534e-05, + "loss": 0.05302545428276062, + "step": 60900 + }, + { + "epoch": 8.645848119233499, + "grad_norm": 1.2663664817810059, + "learning_rate": 9.135798438608943e-05, + "loss": 0.035529720783233645, + "step": 60910 + }, + { + "epoch": 8.647267565649397, + "grad_norm": 2.0028679370880127, + "learning_rate": 9.135656493967353e-05, + "loss": 0.040467509627342226, + "step": 60920 + }, + { + "epoch": 8.648687012065295, + "grad_norm": 0.7642605304718018, + "learning_rate": 9.135514549325763e-05, + "loss": 0.0312233567237854, + "step": 60930 + }, + { + "epoch": 8.650106458481192, + "grad_norm": 2.8044888973236084, + "learning_rate": 9.135372604684174e-05, + "loss": 0.045302554965019226, + "step": 60940 + }, + { + "epoch": 8.65152590489709, + "grad_norm": 11.276861190795898, + "learning_rate": 9.135230660042584e-05, + 
"loss": 0.02674412727355957, + "step": 60950 + }, + { + "epoch": 8.652945351312988, + "grad_norm": 5.281027317047119, + "learning_rate": 9.135088715400995e-05, + "loss": 0.044106674194335935, + "step": 60960 + }, + { + "epoch": 8.654364797728885, + "grad_norm": 7.230155944824219, + "learning_rate": 9.134946770759404e-05, + "loss": 0.025055408477783203, + "step": 60970 + }, + { + "epoch": 8.655784244144783, + "grad_norm": 0.44767463207244873, + "learning_rate": 9.134804826117814e-05, + "loss": 0.023888878524303436, + "step": 60980 + }, + { + "epoch": 8.657203690560682, + "grad_norm": 0.1281173974275589, + "learning_rate": 9.134662881476225e-05, + "loss": 0.02549477815628052, + "step": 60990 + }, + { + "epoch": 8.65862313697658, + "grad_norm": 0.732596755027771, + "learning_rate": 9.134520936834635e-05, + "loss": 0.05355830192565918, + "step": 61000 + }, + { + "epoch": 8.65862313697658, + "eval_accuracy": 0.9806066001144529, + "eval_loss": 0.06500012427568436, + "eval_runtime": 34.837, + "eval_samples_per_second": 451.445, + "eval_steps_per_second": 14.123, + "step": 61000 + }, + { + "epoch": 8.660042583392476, + "grad_norm": 0.9538720846176147, + "learning_rate": 9.134378992193046e-05, + "loss": 0.08146008253097534, + "step": 61010 + }, + { + "epoch": 8.661462029808375, + "grad_norm": 0.6794492602348328, + "learning_rate": 9.134237047551454e-05, + "loss": 0.013472935557365418, + "step": 61020 + }, + { + "epoch": 8.662881476224273, + "grad_norm": 0.09928761422634125, + "learning_rate": 9.134095102909866e-05, + "loss": 0.023411236703395844, + "step": 61030 + }, + { + "epoch": 8.66430092264017, + "grad_norm": 0.618294358253479, + "learning_rate": 9.133953158268275e-05, + "loss": 0.033431851863861085, + "step": 61040 + }, + { + "epoch": 8.665720369056068, + "grad_norm": 0.7553914189338684, + "learning_rate": 9.133811213626686e-05, + "loss": 0.06925633549690247, + "step": 61050 + }, + { + "epoch": 8.667139815471966, + "grad_norm": 0.39856305718421936, + "learning_rate": 
9.133669268985096e-05, + "loss": 0.072854083776474, + "step": 61060 + }, + { + "epoch": 8.668559261887864, + "grad_norm": 4.453004837036133, + "learning_rate": 9.133527324343506e-05, + "loss": 0.02382792830467224, + "step": 61070 + }, + { + "epoch": 8.669978708303761, + "grad_norm": 0.14222095906734467, + "learning_rate": 9.133385379701917e-05, + "loss": 0.044448480010032654, + "step": 61080 + }, + { + "epoch": 8.67139815471966, + "grad_norm": 0.8726344704627991, + "learning_rate": 9.133243435060327e-05, + "loss": 0.04786675274372101, + "step": 61090 + }, + { + "epoch": 8.672817601135558, + "grad_norm": 0.4254017174243927, + "learning_rate": 9.133101490418738e-05, + "loss": 0.042910799384117126, + "step": 61100 + }, + { + "epoch": 8.674237047551454, + "grad_norm": 0.20297612249851227, + "learning_rate": 9.132959545777148e-05, + "loss": 0.0331714004278183, + "step": 61110 + }, + { + "epoch": 8.675656493967352, + "grad_norm": 0.44168731570243835, + "learning_rate": 9.132817601135557e-05, + "loss": 0.06532721519470215, + "step": 61120 + }, + { + "epoch": 8.67707594038325, + "grad_norm": 0.5480031967163086, + "learning_rate": 9.132675656493967e-05, + "loss": 0.03251819014549255, + "step": 61130 + }, + { + "epoch": 8.678495386799149, + "grad_norm": 0.6780239939689636, + "learning_rate": 9.132533711852378e-05, + "loss": 0.029584136605262757, + "step": 61140 + }, + { + "epoch": 8.679914833215046, + "grad_norm": 7.8614630699157715, + "learning_rate": 9.132391767210788e-05, + "loss": 0.032931667566299436, + "step": 61150 + }, + { + "epoch": 8.681334279630944, + "grad_norm": 0.2410171777009964, + "learning_rate": 9.132249822569199e-05, + "loss": 0.0210477352142334, + "step": 61160 + }, + { + "epoch": 8.682753726046842, + "grad_norm": 0.20945946872234344, + "learning_rate": 9.132107877927609e-05, + "loss": 0.06597599387168884, + "step": 61170 + }, + { + "epoch": 8.684173172462739, + "grad_norm": 0.11647208780050278, + "learning_rate": 9.131965933286018e-05, + "loss": 
0.020939578115940095, + "step": 61180 + }, + { + "epoch": 8.685592618878637, + "grad_norm": 0.26495155692100525, + "learning_rate": 9.13182398864443e-05, + "loss": 0.036896157264709475, + "step": 61190 + }, + { + "epoch": 8.687012065294535, + "grad_norm": 0.42610204219818115, + "learning_rate": 9.131682044002839e-05, + "loss": 0.03127295076847077, + "step": 61200 + }, + { + "epoch": 8.688431511710434, + "grad_norm": 0.9225974678993225, + "learning_rate": 9.13154009936125e-05, + "loss": 0.03571774959564209, + "step": 61210 + }, + { + "epoch": 8.68985095812633, + "grad_norm": 6.106173992156982, + "learning_rate": 9.13139815471966e-05, + "loss": 0.05818299055099487, + "step": 61220 + }, + { + "epoch": 8.691270404542228, + "grad_norm": 0.33078286051750183, + "learning_rate": 9.13125621007807e-05, + "loss": 0.05615794658660889, + "step": 61230 + }, + { + "epoch": 8.692689850958127, + "grad_norm": 2.8934178352355957, + "learning_rate": 9.13111426543648e-05, + "loss": 0.05344209671020508, + "step": 61240 + }, + { + "epoch": 8.694109297374023, + "grad_norm": 0.8880506157875061, + "learning_rate": 9.13097232079489e-05, + "loss": 0.025896552205085754, + "step": 61250 + }, + { + "epoch": 8.695528743789922, + "grad_norm": 0.15858466923236847, + "learning_rate": 9.1308303761533e-05, + "loss": 0.05899338126182556, + "step": 61260 + }, + { + "epoch": 8.69694819020582, + "grad_norm": 0.5504029989242554, + "learning_rate": 9.130688431511711e-05, + "loss": 0.00966411828994751, + "step": 61270 + }, + { + "epoch": 8.698367636621718, + "grad_norm": 12.35024642944336, + "learning_rate": 9.130546486870121e-05, + "loss": 0.024048765003681184, + "step": 61280 + }, + { + "epoch": 8.699787083037615, + "grad_norm": 0.01407754234969616, + "learning_rate": 9.130404542228531e-05, + "loss": 0.012420719116926193, + "step": 61290 + }, + { + "epoch": 8.701206529453513, + "grad_norm": 5.070805072784424, + "learning_rate": 9.130262597586942e-05, + "loss": 0.05343193411827087, + "step": 61300 + }, + { 
+ "epoch": 8.702625975869411, + "grad_norm": 0.9735597372055054, + "learning_rate": 9.130120652945352e-05, + "loss": 0.05009015202522278, + "step": 61310 + }, + { + "epoch": 8.704045422285308, + "grad_norm": 0.05359075218439102, + "learning_rate": 9.129978708303763e-05, + "loss": 0.041198867559432986, + "step": 61320 + }, + { + "epoch": 8.705464868701206, + "grad_norm": 0.19079919159412384, + "learning_rate": 9.129836763662171e-05, + "loss": 0.11265591382980347, + "step": 61330 + }, + { + "epoch": 8.706884315117104, + "grad_norm": 0.39164605736732483, + "learning_rate": 9.129694819020582e-05, + "loss": 0.09336072206497192, + "step": 61340 + }, + { + "epoch": 8.708303761533003, + "grad_norm": 0.1398046314716339, + "learning_rate": 9.129552874378992e-05, + "loss": 0.02090988904237747, + "step": 61350 + }, + { + "epoch": 8.7097232079489, + "grad_norm": 3.872218370437622, + "learning_rate": 9.129410929737403e-05, + "loss": 0.014080584049224854, + "step": 61360 + }, + { + "epoch": 8.711142654364798, + "grad_norm": 2.848639726638794, + "learning_rate": 9.129268985095813e-05, + "loss": 0.028994157910346985, + "step": 61370 + }, + { + "epoch": 8.712562100780696, + "grad_norm": 1.4137672185897827, + "learning_rate": 9.129127040454223e-05, + "loss": 0.02338533103466034, + "step": 61380 + }, + { + "epoch": 8.713981547196592, + "grad_norm": 5.973514080047607, + "learning_rate": 9.128985095812634e-05, + "loss": 0.06741483807563782, + "step": 61390 + }, + { + "epoch": 8.71540099361249, + "grad_norm": 0.1293167769908905, + "learning_rate": 9.128843151171043e-05, + "loss": 0.04931153953075409, + "step": 61400 + }, + { + "epoch": 8.716820440028389, + "grad_norm": 4.29320764541626, + "learning_rate": 9.128701206529455e-05, + "loss": 0.0691063404083252, + "step": 61410 + }, + { + "epoch": 8.718239886444287, + "grad_norm": 3.396770477294922, + "learning_rate": 9.128559261887864e-05, + "loss": 0.03781647980213165, + "step": 61420 + }, + { + "epoch": 8.719659332860184, + "grad_norm": 
11.031145095825195, + "learning_rate": 9.128417317246274e-05, + "loss": 0.015141361951828003, + "step": 61430 + }, + { + "epoch": 8.721078779276082, + "grad_norm": 1.7965271472930908, + "learning_rate": 9.128275372604684e-05, + "loss": 0.01115918904542923, + "step": 61440 + }, + { + "epoch": 8.72249822569198, + "grad_norm": 0.16115064918994904, + "learning_rate": 9.128133427963095e-05, + "loss": 0.03681345283985138, + "step": 61450 + }, + { + "epoch": 8.723917672107877, + "grad_norm": 0.4038357436656952, + "learning_rate": 9.127991483321505e-05, + "loss": 0.05916250944137573, + "step": 61460 + }, + { + "epoch": 8.725337118523775, + "grad_norm": 0.09256431460380554, + "learning_rate": 9.127849538679916e-05, + "loss": 0.01472775936126709, + "step": 61470 + }, + { + "epoch": 8.726756564939674, + "grad_norm": 10.232614517211914, + "learning_rate": 9.127707594038325e-05, + "loss": 0.07860915660858155, + "step": 61480 + }, + { + "epoch": 8.728176011355572, + "grad_norm": 0.18803325295448303, + "learning_rate": 9.127565649396735e-05, + "loss": 0.03942298591136932, + "step": 61490 + }, + { + "epoch": 8.729595457771469, + "grad_norm": 1.1620979309082031, + "learning_rate": 9.127423704755146e-05, + "loss": 0.016197699308395385, + "step": 61500 + }, + { + "epoch": 8.729595457771469, + "eval_accuracy": 0.9816875437146309, + "eval_loss": 0.059635695070028305, + "eval_runtime": 33.2881, + "eval_samples_per_second": 472.451, + "eval_steps_per_second": 14.78, + "step": 61500 + }, + { + "epoch": 8.731014904187367, + "grad_norm": 1.1357897520065308, + "learning_rate": 9.127281760113556e-05, + "loss": 0.03907504975795746, + "step": 61510 + }, + { + "epoch": 8.732434350603265, + "grad_norm": 0.44886356592178345, + "learning_rate": 9.127139815471967e-05, + "loss": 0.031098437309265137, + "step": 61520 + }, + { + "epoch": 8.733853797019162, + "grad_norm": 0.46833470463752747, + "learning_rate": 9.126997870830376e-05, + "loss": 0.05453131794929504, + "step": 61530 + }, + { + "epoch": 
8.73527324343506, + "grad_norm": 6.623453140258789, + "learning_rate": 9.126855926188787e-05, + "loss": 0.026876044273376466, + "step": 61540 + }, + { + "epoch": 8.736692689850958, + "grad_norm": 2.4892704486846924, + "learning_rate": 9.126713981547196e-05, + "loss": 0.03759989440441132, + "step": 61550 + }, + { + "epoch": 8.738112136266857, + "grad_norm": 5.195185661315918, + "learning_rate": 9.126572036905607e-05, + "loss": 0.014632970094680786, + "step": 61560 + }, + { + "epoch": 8.739531582682753, + "grad_norm": 1.108044981956482, + "learning_rate": 9.126430092264017e-05, + "loss": 0.045201820135116574, + "step": 61570 + }, + { + "epoch": 8.740951029098651, + "grad_norm": 5.98253870010376, + "learning_rate": 9.126288147622428e-05, + "loss": 0.017468076944351197, + "step": 61580 + }, + { + "epoch": 8.74237047551455, + "grad_norm": 3.968717336654663, + "learning_rate": 9.126146202980838e-05, + "loss": 0.03230096399784088, + "step": 61590 + }, + { + "epoch": 8.743789921930446, + "grad_norm": 0.03290829062461853, + "learning_rate": 9.126004258339248e-05, + "loss": 0.0632810652256012, + "step": 61600 + }, + { + "epoch": 8.745209368346345, + "grad_norm": 11.86566162109375, + "learning_rate": 9.125862313697659e-05, + "loss": 0.05286848545074463, + "step": 61610 + }, + { + "epoch": 8.746628814762243, + "grad_norm": 2.780843734741211, + "learning_rate": 9.125720369056069e-05, + "loss": 0.06637284755706788, + "step": 61620 + }, + { + "epoch": 8.748048261178141, + "grad_norm": 1.4611948728561401, + "learning_rate": 9.12557842441448e-05, + "loss": 0.06998544335365295, + "step": 61630 + }, + { + "epoch": 8.749467707594038, + "grad_norm": 13.01425838470459, + "learning_rate": 9.125436479772888e-05, + "loss": 0.038280272483825685, + "step": 61640 + }, + { + "epoch": 8.750887154009936, + "grad_norm": 0.677115797996521, + "learning_rate": 9.125294535131299e-05, + "loss": 0.02972142696380615, + "step": 61650 + }, + { + "epoch": 8.752306600425834, + "grad_norm": 
6.362784385681152, + "learning_rate": 9.125152590489709e-05, + "loss": 0.059755778312683104, + "step": 61660 + }, + { + "epoch": 8.75372604684173, + "grad_norm": 1.1002269983291626, + "learning_rate": 9.12501064584812e-05, + "loss": 0.03250996172428131, + "step": 61670 + }, + { + "epoch": 8.75514549325763, + "grad_norm": 0.5199268460273743, + "learning_rate": 9.12486870120653e-05, + "loss": 0.01646912842988968, + "step": 61680 + }, + { + "epoch": 8.756564939673527, + "grad_norm": 4.037311553955078, + "learning_rate": 9.12472675656494e-05, + "loss": 0.018910709023475646, + "step": 61690 + }, + { + "epoch": 8.757984386089426, + "grad_norm": 5.73612117767334, + "learning_rate": 9.12458481192335e-05, + "loss": 0.0628777265548706, + "step": 61700 + }, + { + "epoch": 8.759403832505322, + "grad_norm": 0.10442260652780533, + "learning_rate": 9.12444286728176e-05, + "loss": 0.013911408185958863, + "step": 61710 + }, + { + "epoch": 8.76082327892122, + "grad_norm": 0.38511592149734497, + "learning_rate": 9.124300922640171e-05, + "loss": 0.015263143181800842, + "step": 61720 + }, + { + "epoch": 8.762242725337119, + "grad_norm": 1.906474232673645, + "learning_rate": 9.124158977998581e-05, + "loss": 0.01780613958835602, + "step": 61730 + }, + { + "epoch": 8.763662171753015, + "grad_norm": 8.139535903930664, + "learning_rate": 9.124017033356991e-05, + "loss": 0.037744688987731936, + "step": 61740 + }, + { + "epoch": 8.765081618168914, + "grad_norm": 10.752461433410645, + "learning_rate": 9.1238750887154e-05, + "loss": 0.03349531888961792, + "step": 61750 + }, + { + "epoch": 8.766501064584812, + "grad_norm": 1.434235692024231, + "learning_rate": 9.123733144073812e-05, + "loss": 0.02006615549325943, + "step": 61760 + }, + { + "epoch": 8.76792051100071, + "grad_norm": 2.3497636318206787, + "learning_rate": 9.123591199432221e-05, + "loss": 0.042415973544120786, + "step": 61770 + }, + { + "epoch": 8.769339957416607, + "grad_norm": 12.238152503967285, + "learning_rate": 
9.123449254790632e-05, + "loss": 0.06715450882911682, + "step": 61780 + }, + { + "epoch": 8.770759403832505, + "grad_norm": 4.415995121002197, + "learning_rate": 9.123307310149042e-05, + "loss": 0.03555461466312408, + "step": 61790 + }, + { + "epoch": 8.772178850248403, + "grad_norm": 0.2075185477733612, + "learning_rate": 9.123165365507452e-05, + "loss": 0.048576629161834715, + "step": 61800 + }, + { + "epoch": 8.7735982966643, + "grad_norm": 0.8062130808830261, + "learning_rate": 9.123023420865863e-05, + "loss": 0.0283250629901886, + "step": 61810 + }, + { + "epoch": 8.775017743080198, + "grad_norm": 0.5037146210670471, + "learning_rate": 9.122881476224273e-05, + "loss": 0.006582640111446381, + "step": 61820 + }, + { + "epoch": 8.776437189496097, + "grad_norm": 7.7386932373046875, + "learning_rate": 9.122739531582684e-05, + "loss": 0.058837884664535524, + "step": 61830 + }, + { + "epoch": 8.777856635911995, + "grad_norm": 9.621651649475098, + "learning_rate": 9.122597586941092e-05, + "loss": 0.019973933696746826, + "step": 61840 + }, + { + "epoch": 8.779276082327891, + "grad_norm": 1.2101565599441528, + "learning_rate": 9.122455642299503e-05, + "loss": 0.029185032844543456, + "step": 61850 + }, + { + "epoch": 8.78069552874379, + "grad_norm": 4.3968706130981445, + "learning_rate": 9.122313697657913e-05, + "loss": 0.06037144064903259, + "step": 61860 + }, + { + "epoch": 8.782114975159688, + "grad_norm": 0.4677600860595703, + "learning_rate": 9.122171753016324e-05, + "loss": 0.03795049488544464, + "step": 61870 + }, + { + "epoch": 8.783534421575585, + "grad_norm": 0.128379687666893, + "learning_rate": 9.122029808374735e-05, + "loss": 0.07420201301574707, + "step": 61880 + }, + { + "epoch": 8.784953867991483, + "grad_norm": 0.24324475228786469, + "learning_rate": 9.121887863733144e-05, + "loss": 0.013025546073913574, + "step": 61890 + }, + { + "epoch": 8.786373314407381, + "grad_norm": 0.14116889238357544, + "learning_rate": 9.121745919091555e-05, + "loss": 
0.017498120665550232, + "step": 61900 + }, + { + "epoch": 8.78779276082328, + "grad_norm": 0.6224940419197083, + "learning_rate": 9.121603974449965e-05, + "loss": 0.028740760684013367, + "step": 61910 + }, + { + "epoch": 8.789212207239176, + "grad_norm": 0.3105393350124359, + "learning_rate": 9.121462029808376e-05, + "loss": 0.02539215385913849, + "step": 61920 + }, + { + "epoch": 8.790631653655074, + "grad_norm": 0.5749163627624512, + "learning_rate": 9.121320085166785e-05, + "loss": 0.030741649866104125, + "step": 61930 + }, + { + "epoch": 8.792051100070973, + "grad_norm": 7.524410247802734, + "learning_rate": 9.121178140525196e-05, + "loss": 0.031424522399902344, + "step": 61940 + }, + { + "epoch": 8.79347054648687, + "grad_norm": 4.195517539978027, + "learning_rate": 9.121036195883605e-05, + "loss": 0.02137700915336609, + "step": 61950 + }, + { + "epoch": 8.794889992902768, + "grad_norm": 8.744084358215332, + "learning_rate": 9.120894251242016e-05, + "loss": 0.11366484165191651, + "step": 61960 + }, + { + "epoch": 8.796309439318666, + "grad_norm": 0.06890244036912918, + "learning_rate": 9.120752306600427e-05, + "loss": 0.03876006603240967, + "step": 61970 + }, + { + "epoch": 8.797728885734564, + "grad_norm": 0.25812605023384094, + "learning_rate": 9.120610361958837e-05, + "loss": 0.06622171998023987, + "step": 61980 + }, + { + "epoch": 8.79914833215046, + "grad_norm": 0.4101802408695221, + "learning_rate": 9.120468417317248e-05, + "loss": 0.0641588568687439, + "step": 61990 + }, + { + "epoch": 8.800567778566359, + "grad_norm": 0.04798253998160362, + "learning_rate": 9.120326472675656e-05, + "loss": 0.03991687297821045, + "step": 62000 + }, + { + "epoch": 8.800567778566359, + "eval_accuracy": 0.9803522604438227, + "eval_loss": 0.06005360186100006, + "eval_runtime": 32.7621, + "eval_samples_per_second": 480.036, + "eval_steps_per_second": 15.017, + "step": 62000 + }, + { + "epoch": 8.801987224982257, + "grad_norm": 0.8698391318321228, + "learning_rate": 
9.120184528034067e-05, + "loss": 0.05466843247413635, + "step": 62010 + }, + { + "epoch": 8.803406671398154, + "grad_norm": 0.9279050230979919, + "learning_rate": 9.120042583392477e-05, + "loss": 0.02461112439632416, + "step": 62020 + }, + { + "epoch": 8.804826117814052, + "grad_norm": 0.347493976354599, + "learning_rate": 9.119900638750888e-05, + "loss": 0.04751765727996826, + "step": 62030 + }, + { + "epoch": 8.80624556422995, + "grad_norm": 0.16119427978992462, + "learning_rate": 9.119758694109298e-05, + "loss": 0.04457222819328308, + "step": 62040 + }, + { + "epoch": 8.807665010645849, + "grad_norm": 0.16925406455993652, + "learning_rate": 9.119616749467708e-05, + "loss": 0.013092260062694549, + "step": 62050 + }, + { + "epoch": 8.809084457061745, + "grad_norm": 0.44262468814849854, + "learning_rate": 9.119474804826119e-05, + "loss": 0.030343493819236754, + "step": 62060 + }, + { + "epoch": 8.810503903477644, + "grad_norm": 2.201153516769409, + "learning_rate": 9.119332860184528e-05, + "loss": 0.052234995365142825, + "step": 62070 + }, + { + "epoch": 8.811923349893542, + "grad_norm": 0.5974806547164917, + "learning_rate": 9.11919091554294e-05, + "loss": 0.027004152536392212, + "step": 62080 + }, + { + "epoch": 8.813342796309438, + "grad_norm": 1.2625153064727783, + "learning_rate": 9.119048970901349e-05, + "loss": 0.04115771949291229, + "step": 62090 + }, + { + "epoch": 8.814762242725337, + "grad_norm": 5.913887977600098, + "learning_rate": 9.118907026259759e-05, + "loss": 0.034026145935058594, + "step": 62100 + }, + { + "epoch": 8.816181689141235, + "grad_norm": 1.1757041215896606, + "learning_rate": 9.118765081618169e-05, + "loss": 0.07286719679832458, + "step": 62110 + }, + { + "epoch": 8.817601135557133, + "grad_norm": 0.8042858839035034, + "learning_rate": 9.11862313697658e-05, + "loss": 0.04174315929412842, + "step": 62120 + }, + { + "epoch": 8.81902058197303, + "grad_norm": 0.46218520402908325, + "learning_rate": 9.11848119233499e-05, + "loss": 
0.019040848314762115, + "step": 62130 + }, + { + "epoch": 8.820440028388928, + "grad_norm": 2.684968948364258, + "learning_rate": 9.1183392476934e-05, + "loss": 0.06549656391143799, + "step": 62140 + }, + { + "epoch": 8.821859474804826, + "grad_norm": 12.88553524017334, + "learning_rate": 9.11819730305181e-05, + "loss": 0.07638974785804749, + "step": 62150 + }, + { + "epoch": 8.823278921220723, + "grad_norm": 4.401943683624268, + "learning_rate": 9.11805535841022e-05, + "loss": 0.04739658534526825, + "step": 62160 + }, + { + "epoch": 8.824698367636621, + "grad_norm": 2.7277910709381104, + "learning_rate": 9.117913413768631e-05, + "loss": 0.04343520998954773, + "step": 62170 + }, + { + "epoch": 8.82611781405252, + "grad_norm": 0.1369256228208542, + "learning_rate": 9.117771469127041e-05, + "loss": 0.01796092838048935, + "step": 62180 + }, + { + "epoch": 8.827537260468418, + "grad_norm": 4.254459857940674, + "learning_rate": 9.117629524485452e-05, + "loss": 0.018354178965091707, + "step": 62190 + }, + { + "epoch": 8.828956706884314, + "grad_norm": 3.2595226764678955, + "learning_rate": 9.11748757984386e-05, + "loss": 0.03273018896579742, + "step": 62200 + }, + { + "epoch": 8.830376153300213, + "grad_norm": 2.787855625152588, + "learning_rate": 9.117345635202272e-05, + "loss": 0.08703206777572632, + "step": 62210 + }, + { + "epoch": 8.831795599716111, + "grad_norm": 1.6568021774291992, + "learning_rate": 9.117203690560681e-05, + "loss": 0.03767965137958527, + "step": 62220 + }, + { + "epoch": 8.833215046132008, + "grad_norm": 0.16803374886512756, + "learning_rate": 9.117061745919092e-05, + "loss": 0.015233394503593446, + "step": 62230 + }, + { + "epoch": 8.834634492547906, + "grad_norm": 5.4319024085998535, + "learning_rate": 9.116919801277502e-05, + "loss": 0.034356406331062316, + "step": 62240 + }, + { + "epoch": 8.836053938963804, + "grad_norm": 0.8553333878517151, + "learning_rate": 9.116777856635912e-05, + "loss": 0.033635425567626956, + "step": 62250 + }, + { + 
"epoch": 8.837473385379703, + "grad_norm": 11.308504104614258, + "learning_rate": 9.116635911994323e-05, + "loss": 0.033365875482559204, + "step": 62260 + }, + { + "epoch": 8.838892831795599, + "grad_norm": 2.8172361850738525, + "learning_rate": 9.116493967352733e-05, + "loss": 0.021276645362377167, + "step": 62270 + }, + { + "epoch": 8.840312278211497, + "grad_norm": 1.633204698562622, + "learning_rate": 9.116352022711144e-05, + "loss": 0.03485658168792725, + "step": 62280 + }, + { + "epoch": 8.841731724627396, + "grad_norm": 1.5068836212158203, + "learning_rate": 9.116210078069554e-05, + "loss": 0.03303306102752686, + "step": 62290 + }, + { + "epoch": 8.843151171043292, + "grad_norm": 2.799574613571167, + "learning_rate": 9.116068133427965e-05, + "loss": 0.04048182368278504, + "step": 62300 + }, + { + "epoch": 8.84457061745919, + "grad_norm": 5.927804946899414, + "learning_rate": 9.115926188786373e-05, + "loss": 0.03790929913520813, + "step": 62310 + }, + { + "epoch": 8.845990063875089, + "grad_norm": 0.35564693808555603, + "learning_rate": 9.115784244144784e-05, + "loss": 0.019379837810993193, + "step": 62320 + }, + { + "epoch": 8.847409510290987, + "grad_norm": 12.212952613830566, + "learning_rate": 9.115642299503194e-05, + "loss": 0.0662179708480835, + "step": 62330 + }, + { + "epoch": 8.848828956706884, + "grad_norm": 9.45881175994873, + "learning_rate": 9.115500354861605e-05, + "loss": 0.06972289681434632, + "step": 62340 + }, + { + "epoch": 8.850248403122782, + "grad_norm": 0.12846651673316956, + "learning_rate": 9.115358410220015e-05, + "loss": 0.030337435007095338, + "step": 62350 + }, + { + "epoch": 8.85166784953868, + "grad_norm": 0.08279696851968765, + "learning_rate": 9.115216465578424e-05, + "loss": 0.029838696122169495, + "step": 62360 + }, + { + "epoch": 8.853087295954577, + "grad_norm": 4.972554683685303, + "learning_rate": 9.115074520936835e-05, + "loss": 0.015542306005954742, + "step": 62370 + }, + { + "epoch": 8.854506742370475, + "grad_norm": 
2.105909824371338, + "learning_rate": 9.114932576295245e-05, + "loss": 0.013818117976188659, + "step": 62380 + }, + { + "epoch": 8.855926188786373, + "grad_norm": 0.3148862421512604, + "learning_rate": 9.114790631653656e-05, + "loss": 0.054498547315597536, + "step": 62390 + }, + { + "epoch": 8.857345635202272, + "grad_norm": 3.0089094638824463, + "learning_rate": 9.114648687012066e-05, + "loss": 0.03047267496585846, + "step": 62400 + }, + { + "epoch": 8.858765081618168, + "grad_norm": 0.2803598642349243, + "learning_rate": 9.114506742370476e-05, + "loss": 0.01550055593252182, + "step": 62410 + }, + { + "epoch": 8.860184528034067, + "grad_norm": 0.1116773933172226, + "learning_rate": 9.114364797728886e-05, + "loss": 0.02507326602935791, + "step": 62420 + }, + { + "epoch": 8.861603974449965, + "grad_norm": 6.727297782897949, + "learning_rate": 9.114222853087297e-05, + "loss": 0.06734004020690917, + "step": 62430 + }, + { + "epoch": 8.863023420865863, + "grad_norm": 0.10710111260414124, + "learning_rate": 9.114080908445706e-05, + "loss": 0.030666384100914, + "step": 62440 + }, + { + "epoch": 8.86444286728176, + "grad_norm": 5.24462890625, + "learning_rate": 9.113938963804117e-05, + "loss": 0.056045109033584596, + "step": 62450 + }, + { + "epoch": 8.865862313697658, + "grad_norm": 0.958200216293335, + "learning_rate": 9.113797019162527e-05, + "loss": 0.024746541678905488, + "step": 62460 + }, + { + "epoch": 8.867281760113556, + "grad_norm": 0.7351173758506775, + "learning_rate": 9.113655074520937e-05, + "loss": 0.05286588668823242, + "step": 62470 + }, + { + "epoch": 8.868701206529453, + "grad_norm": 9.957810401916504, + "learning_rate": 9.113513129879348e-05, + "loss": 0.035145890712738034, + "step": 62480 + }, + { + "epoch": 8.870120652945351, + "grad_norm": 0.04390246421098709, + "learning_rate": 9.113371185237758e-05, + "loss": 0.019046881794929506, + "step": 62490 + }, + { + "epoch": 8.87154009936125, + "grad_norm": 0.08628535270690918, + "learning_rate": 
9.113229240596169e-05, + "loss": 0.05443682670593262, + "step": 62500 + }, + { + "epoch": 8.87154009936125, + "eval_accuracy": 0.9782539581611241, + "eval_loss": 0.07185545563697815, + "eval_runtime": 32.8112, + "eval_samples_per_second": 479.317, + "eval_steps_per_second": 14.995, + "step": 62500 + }, + { + "epoch": 8.872959545777148, + "grad_norm": 1.975903034210205, + "learning_rate": 9.113087295954577e-05, + "loss": 0.049159616231918335, + "step": 62510 + }, + { + "epoch": 8.874378992193044, + "grad_norm": 0.22022274136543274, + "learning_rate": 9.112945351312988e-05, + "loss": 0.015442782640457153, + "step": 62520 + }, + { + "epoch": 8.875798438608943, + "grad_norm": 0.8125544786453247, + "learning_rate": 9.112803406671398e-05, + "loss": 0.02303740233182907, + "step": 62530 + }, + { + "epoch": 8.87721788502484, + "grad_norm": 1.2327035665512085, + "learning_rate": 9.112661462029809e-05, + "loss": 0.034694111347198485, + "step": 62540 + }, + { + "epoch": 8.878637331440737, + "grad_norm": 4.138776779174805, + "learning_rate": 9.112519517388219e-05, + "loss": 0.07216629385948181, + "step": 62550 + }, + { + "epoch": 8.880056777856636, + "grad_norm": 5.914824485778809, + "learning_rate": 9.112377572746629e-05, + "loss": 0.0611856997013092, + "step": 62560 + }, + { + "epoch": 8.881476224272534, + "grad_norm": 1.282106637954712, + "learning_rate": 9.11223562810504e-05, + "loss": 0.021294346451759337, + "step": 62570 + }, + { + "epoch": 8.882895670688432, + "grad_norm": 7.624048709869385, + "learning_rate": 9.11209368346345e-05, + "loss": 0.040926402807235716, + "step": 62580 + }, + { + "epoch": 8.884315117104329, + "grad_norm": 3.448227882385254, + "learning_rate": 9.11195173882186e-05, + "loss": 0.00841144025325775, + "step": 62590 + }, + { + "epoch": 8.885734563520227, + "grad_norm": 10.487248420715332, + "learning_rate": 9.11180979418027e-05, + "loss": 0.10912501811981201, + "step": 62600 + }, + { + "epoch": 8.887154009936125, + "grad_norm": 0.008511621505022049, 
+ "learning_rate": 9.11166784953868e-05, + "loss": 0.07005314826965332, + "step": 62610 + }, + { + "epoch": 8.888573456352022, + "grad_norm": 0.409939169883728, + "learning_rate": 9.11152590489709e-05, + "loss": 0.05328459739685058, + "step": 62620 + }, + { + "epoch": 8.88999290276792, + "grad_norm": 8.29695987701416, + "learning_rate": 9.111383960255501e-05, + "loss": 0.04727603197097778, + "step": 62630 + }, + { + "epoch": 8.891412349183819, + "grad_norm": 3.3352630138397217, + "learning_rate": 9.11124201561391e-05, + "loss": 0.0870002806186676, + "step": 62640 + }, + { + "epoch": 8.892831795599717, + "grad_norm": 11.782904624938965, + "learning_rate": 9.111100070972322e-05, + "loss": 0.04238658845424652, + "step": 62650 + }, + { + "epoch": 8.894251242015613, + "grad_norm": 2.9484095573425293, + "learning_rate": 9.110958126330731e-05, + "loss": 0.029105091094970705, + "step": 62660 + }, + { + "epoch": 8.895670688431512, + "grad_norm": 0.40413135290145874, + "learning_rate": 9.110816181689141e-05, + "loss": 0.037084218859672544, + "step": 62670 + }, + { + "epoch": 8.89709013484741, + "grad_norm": 0.24385766685009003, + "learning_rate": 9.110674237047552e-05, + "loss": 0.01750268042087555, + "step": 62680 + }, + { + "epoch": 8.898509581263307, + "grad_norm": 0.06714774668216705, + "learning_rate": 9.110532292405962e-05, + "loss": 0.03886613845825195, + "step": 62690 + }, + { + "epoch": 8.899929027679205, + "grad_norm": 0.9303773045539856, + "learning_rate": 9.110390347764373e-05, + "loss": 0.045955890417099, + "step": 62700 + }, + { + "epoch": 8.901348474095103, + "grad_norm": 2.0016958713531494, + "learning_rate": 9.110248403122783e-05, + "loss": 0.04948480129241943, + "step": 62710 + }, + { + "epoch": 8.902767920511002, + "grad_norm": 6.1627655029296875, + "learning_rate": 9.110106458481193e-05, + "loss": 0.03391303420066834, + "step": 62720 + }, + { + "epoch": 8.904187366926898, + "grad_norm": 0.391539990901947, + "learning_rate": 9.109964513839602e-05, + 
"loss": 0.04015987515449524, + "step": 62730 + }, + { + "epoch": 8.905606813342796, + "grad_norm": 2.768480062484741, + "learning_rate": 9.109822569198013e-05, + "loss": 0.0364282488822937, + "step": 62740 + }, + { + "epoch": 8.907026259758695, + "grad_norm": 4.698720932006836, + "learning_rate": 9.109680624556423e-05, + "loss": 0.019236212968826293, + "step": 62750 + }, + { + "epoch": 8.908445706174591, + "grad_norm": 8.612178802490234, + "learning_rate": 9.109538679914834e-05, + "loss": 0.03959521353244781, + "step": 62760 + }, + { + "epoch": 8.90986515259049, + "grad_norm": 7.235753536224365, + "learning_rate": 9.109396735273244e-05, + "loss": 0.025758838653564452, + "step": 62770 + }, + { + "epoch": 8.911284599006388, + "grad_norm": 1.243888258934021, + "learning_rate": 9.109254790631654e-05, + "loss": 0.06227513551712036, + "step": 62780 + }, + { + "epoch": 8.912704045422286, + "grad_norm": 0.15928207337856293, + "learning_rate": 9.109112845990065e-05, + "loss": 0.047986623644828794, + "step": 62790 + }, + { + "epoch": 8.914123491838183, + "grad_norm": 0.026005161926150322, + "learning_rate": 9.108970901348475e-05, + "loss": 0.061690449714660645, + "step": 62800 + }, + { + "epoch": 8.915542938254081, + "grad_norm": 1.2915374040603638, + "learning_rate": 9.108828956706886e-05, + "loss": 0.0475294291973114, + "step": 62810 + }, + { + "epoch": 8.91696238466998, + "grad_norm": 3.0353357791900635, + "learning_rate": 9.108687012065294e-05, + "loss": 0.039467260241508484, + "step": 62820 + }, + { + "epoch": 8.918381831085876, + "grad_norm": 3.1496834754943848, + "learning_rate": 9.108545067423705e-05, + "loss": 0.036444342136383055, + "step": 62830 + }, + { + "epoch": 8.919801277501774, + "grad_norm": 5.958468914031982, + "learning_rate": 9.108403122782115e-05, + "loss": 0.02855021059513092, + "step": 62840 + }, + { + "epoch": 8.921220723917672, + "grad_norm": 1.679870843887329, + "learning_rate": 9.108261178140526e-05, + "loss": 0.03625530004501343, + "step": 62850 
+ }, + { + "epoch": 8.92264017033357, + "grad_norm": 1.037192463874817, + "learning_rate": 9.108119233498936e-05, + "loss": 0.029511517286300658, + "step": 62860 + }, + { + "epoch": 8.924059616749467, + "grad_norm": 4.464804649353027, + "learning_rate": 9.107977288857345e-05, + "loss": 0.0660893201828003, + "step": 62870 + }, + { + "epoch": 8.925479063165366, + "grad_norm": 2.770401954650879, + "learning_rate": 9.107849538679915e-05, + "loss": 0.06292575597763062, + "step": 62880 + }, + { + "epoch": 8.926898509581264, + "grad_norm": 6.6392011642456055, + "learning_rate": 9.107707594038325e-05, + "loss": 0.05748374462127685, + "step": 62890 + }, + { + "epoch": 8.92831795599716, + "grad_norm": 5.904137134552002, + "learning_rate": 9.107565649396736e-05, + "loss": 0.011138977110385894, + "step": 62900 + }, + { + "epoch": 8.929737402413059, + "grad_norm": 3.686323404312134, + "learning_rate": 9.107423704755146e-05, + "loss": 0.02662035822868347, + "step": 62910 + }, + { + "epoch": 8.931156848828957, + "grad_norm": 4.201900482177734, + "learning_rate": 9.107281760113557e-05, + "loss": 0.07875522971153259, + "step": 62920 + }, + { + "epoch": 8.932576295244855, + "grad_norm": 0.29425719380378723, + "learning_rate": 9.107139815471967e-05, + "loss": 0.02993255853652954, + "step": 62930 + }, + { + "epoch": 8.933995741660752, + "grad_norm": 2.1735360622406006, + "learning_rate": 9.106997870830378e-05, + "loss": 0.03278636932373047, + "step": 62940 + }, + { + "epoch": 8.93541518807665, + "grad_norm": 1.2639693021774292, + "learning_rate": 9.106855926188786e-05, + "loss": 0.13020997047424315, + "step": 62950 + }, + { + "epoch": 8.936834634492548, + "grad_norm": 3.955355167388916, + "learning_rate": 9.106713981547197e-05, + "loss": 0.030237650871276854, + "step": 62960 + }, + { + "epoch": 8.938254080908445, + "grad_norm": 1.5435599088668823, + "learning_rate": 9.106572036905607e-05, + "loss": 0.031678777933120725, + "step": 62970 + }, + { + "epoch": 8.939673527324343, + 
"grad_norm": 1.9179738759994507, + "learning_rate": 9.106430092264018e-05, + "loss": 0.07520507574081421, + "step": 62980 + }, + { + "epoch": 8.941092973740242, + "grad_norm": 3.3584091663360596, + "learning_rate": 9.106288147622428e-05, + "loss": 0.043991255760192874, + "step": 62990 + }, + { + "epoch": 8.94251242015614, + "grad_norm": 0.2111026793718338, + "learning_rate": 9.106146202980838e-05, + "loss": 0.016981948912143708, + "step": 63000 + }, + { + "epoch": 8.94251242015614, + "eval_accuracy": 0.9835950912443568, + "eval_loss": 0.053931817412376404, + "eval_runtime": 33.1446, + "eval_samples_per_second": 474.497, + "eval_steps_per_second": 14.844, + "step": 63000 + }, + { + "epoch": 8.943931866572036, + "grad_norm": 2.602844715118408, + "learning_rate": 9.106004258339249e-05, + "loss": 0.026281210780143737, + "step": 63010 + }, + { + "epoch": 8.945351312987935, + "grad_norm": 4.518967628479004, + "learning_rate": 9.105862313697658e-05, + "loss": 0.021604365110397337, + "step": 63020 + }, + { + "epoch": 8.946770759403833, + "grad_norm": 2.052237033843994, + "learning_rate": 9.10572036905607e-05, + "loss": 0.05609427690505982, + "step": 63030 + }, + { + "epoch": 8.94819020581973, + "grad_norm": 16.218595504760742, + "learning_rate": 9.105578424414479e-05, + "loss": 0.06002562642097473, + "step": 63040 + }, + { + "epoch": 8.949609652235628, + "grad_norm": 15.071115493774414, + "learning_rate": 9.105436479772889e-05, + "loss": 0.05712893605232239, + "step": 63050 + }, + { + "epoch": 8.951029098651526, + "grad_norm": 0.1248808354139328, + "learning_rate": 9.105294535131299e-05, + "loss": 0.04929445683956146, + "step": 63060 + }, + { + "epoch": 8.952448545067424, + "grad_norm": 9.456350326538086, + "learning_rate": 9.10515259048971e-05, + "loss": 0.020241251587867735, + "step": 63070 + }, + { + "epoch": 8.953867991483321, + "grad_norm": 5.2534403800964355, + "learning_rate": 9.10501064584812e-05, + "loss": 0.1037605881690979, + "step": 63080 + }, + { + "epoch": 
8.95528743789922, + "grad_norm": 0.07144279032945633, + "learning_rate": 9.10486870120653e-05, + "loss": 0.010947969555854798, + "step": 63090 + }, + { + "epoch": 8.956706884315118, + "grad_norm": 0.0726260244846344, + "learning_rate": 9.10472675656494e-05, + "loss": 0.02054280638694763, + "step": 63100 + }, + { + "epoch": 8.958126330731014, + "grad_norm": 3.0107414722442627, + "learning_rate": 9.10458481192335e-05, + "loss": 0.0355594128370285, + "step": 63110 + }, + { + "epoch": 8.959545777146912, + "grad_norm": 0.8008242845535278, + "learning_rate": 9.104442867281761e-05, + "loss": 0.020664720237255095, + "step": 63120 + }, + { + "epoch": 8.96096522356281, + "grad_norm": 0.20576325058937073, + "learning_rate": 9.104300922640171e-05, + "loss": 0.03208284378051758, + "step": 63130 + }, + { + "epoch": 8.962384669978709, + "grad_norm": 0.6825739145278931, + "learning_rate": 9.104158977998582e-05, + "loss": 0.014281252026557922, + "step": 63140 + }, + { + "epoch": 8.963804116394606, + "grad_norm": 3.512455463409424, + "learning_rate": 9.10401703335699e-05, + "loss": 0.005695473775267601, + "step": 63150 + }, + { + "epoch": 8.965223562810504, + "grad_norm": 0.5392917990684509, + "learning_rate": 9.103875088715401e-05, + "loss": 0.03396806418895722, + "step": 63160 + }, + { + "epoch": 8.966643009226402, + "grad_norm": 0.07298357039690018, + "learning_rate": 9.103733144073811e-05, + "loss": 0.011821673810482025, + "step": 63170 + }, + { + "epoch": 8.968062455642299, + "grad_norm": 0.03901802748441696, + "learning_rate": 9.103591199432222e-05, + "loss": 0.040955165028572084, + "step": 63180 + }, + { + "epoch": 8.969481902058197, + "grad_norm": 2.884178400039673, + "learning_rate": 9.103449254790632e-05, + "loss": 0.059520548582077025, + "step": 63190 + }, + { + "epoch": 8.970901348474095, + "grad_norm": 6.718408584594727, + "learning_rate": 9.103307310149042e-05, + "loss": 0.09160915017127991, + "step": 63200 + }, + { + "epoch": 8.972320794889994, + "grad_norm": 
0.259737491607666, + "learning_rate": 9.103165365507453e-05, + "loss": 0.018729987740516662, + "step": 63210 + }, + { + "epoch": 8.97374024130589, + "grad_norm": 0.10244779288768768, + "learning_rate": 9.103023420865863e-05, + "loss": 0.060778087377548216, + "step": 63220 + }, + { + "epoch": 8.975159687721789, + "grad_norm": 2.154569625854492, + "learning_rate": 9.102881476224274e-05, + "loss": 0.03838706910610199, + "step": 63230 + }, + { + "epoch": 8.976579134137687, + "grad_norm": 10.858172416687012, + "learning_rate": 9.102739531582683e-05, + "loss": 0.05365590453147888, + "step": 63240 + }, + { + "epoch": 8.977998580553583, + "grad_norm": 0.09644179791212082, + "learning_rate": 9.102597586941093e-05, + "loss": 0.030293729901313782, + "step": 63250 + }, + { + "epoch": 8.979418026969482, + "grad_norm": 0.23783816397190094, + "learning_rate": 9.102455642299503e-05, + "loss": 0.04220397770404816, + "step": 63260 + }, + { + "epoch": 8.98083747338538, + "grad_norm": 7.483023166656494, + "learning_rate": 9.102313697657914e-05, + "loss": 0.04700967967510224, + "step": 63270 + }, + { + "epoch": 8.982256919801278, + "grad_norm": 0.4879629611968994, + "learning_rate": 9.102171753016324e-05, + "loss": 0.04679511487483978, + "step": 63280 + }, + { + "epoch": 8.983676366217175, + "grad_norm": 1.037855625152588, + "learning_rate": 9.102029808374735e-05, + "loss": 0.05557551383972168, + "step": 63290 + }, + { + "epoch": 8.985095812633073, + "grad_norm": 0.9630271792411804, + "learning_rate": 9.101887863733145e-05, + "loss": 0.045986443758010864, + "step": 63300 + }, + { + "epoch": 8.986515259048971, + "grad_norm": 4.669297218322754, + "learning_rate": 9.101745919091554e-05, + "loss": 0.028100493550300597, + "step": 63310 + }, + { + "epoch": 8.987934705464868, + "grad_norm": 0.6383737325668335, + "learning_rate": 9.101603974449965e-05, + "loss": 0.029157137870788573, + "step": 63320 + }, + { + "epoch": 8.989354151880766, + "grad_norm": 7.461113929748535, + "learning_rate": 
9.101462029808375e-05, + "loss": 0.03222631812095642, + "step": 63330 + }, + { + "epoch": 8.990773598296665, + "grad_norm": 2.4280130863189697, + "learning_rate": 9.101320085166786e-05, + "loss": 0.019230978190898897, + "step": 63340 + }, + { + "epoch": 8.992193044712563, + "grad_norm": 1.3081002235412598, + "learning_rate": 9.101178140525196e-05, + "loss": 0.0344824880361557, + "step": 63350 + }, + { + "epoch": 8.99361249112846, + "grad_norm": 0.041129060089588165, + "learning_rate": 9.101036195883606e-05, + "loss": 0.05528616905212402, + "step": 63360 + }, + { + "epoch": 8.995031937544358, + "grad_norm": 8.482027053833008, + "learning_rate": 9.100894251242015e-05, + "loss": 0.02113038897514343, + "step": 63370 + }, + { + "epoch": 8.996451383960256, + "grad_norm": 1.0515233278274536, + "learning_rate": 9.100752306600427e-05, + "loss": 0.08732074499130249, + "step": 63380 + }, + { + "epoch": 8.997870830376153, + "grad_norm": 11.335579872131348, + "learning_rate": 9.100610361958836e-05, + "loss": 0.05290312767028808, + "step": 63390 + }, + { + "epoch": 8.99929027679205, + "grad_norm": 3.011958360671997, + "learning_rate": 9.100468417317247e-05, + "loss": 0.028017181158065795, + "step": 63400 + }, + { + "epoch": 9.00070972320795, + "grad_norm": 2.5004894733428955, + "learning_rate": 9.100326472675657e-05, + "loss": 0.007114443182945252, + "step": 63410 + }, + { + "epoch": 9.002129169623847, + "grad_norm": 0.9608315229415894, + "learning_rate": 9.100184528034067e-05, + "loss": 0.03280950784683227, + "step": 63420 + }, + { + "epoch": 9.003548616039744, + "grad_norm": 0.11002276837825775, + "learning_rate": 9.100042583392478e-05, + "loss": 0.052053457498550414, + "step": 63430 + }, + { + "epoch": 9.004968062455642, + "grad_norm": 3.397446870803833, + "learning_rate": 9.099900638750888e-05, + "loss": 0.028442218899726868, + "step": 63440 + }, + { + "epoch": 9.00638750887154, + "grad_norm": 6.959239959716797, + "learning_rate": 9.099758694109299e-05, + "loss": 
0.02767481803894043, + "step": 63450 + }, + { + "epoch": 9.007806955287437, + "grad_norm": 1.9296232461929321, + "learning_rate": 9.099616749467707e-05, + "loss": 0.01916002035140991, + "step": 63460 + }, + { + "epoch": 9.009226401703335, + "grad_norm": 0.399951696395874, + "learning_rate": 9.099474804826118e-05, + "loss": 0.02451731264591217, + "step": 63470 + }, + { + "epoch": 9.010645848119234, + "grad_norm": 4.6797308921813965, + "learning_rate": 9.099332860184528e-05, + "loss": 0.011301268637180329, + "step": 63480 + }, + { + "epoch": 9.012065294535132, + "grad_norm": 10.480035781860352, + "learning_rate": 9.099190915542939e-05, + "loss": 0.019284191727638244, + "step": 63490 + }, + { + "epoch": 9.013484740951029, + "grad_norm": 1.5630741119384766, + "learning_rate": 9.099048970901349e-05, + "loss": 0.03886908292770386, + "step": 63500 + }, + { + "epoch": 9.013484740951029, + "eval_accuracy": 0.9846124499268774, + "eval_loss": 0.047866348177194595, + "eval_runtime": 32.9807, + "eval_samples_per_second": 476.855, + "eval_steps_per_second": 14.918, + "step": 63500 + }, + { + "epoch": 9.014904187366927, + "grad_norm": 2.2032713890075684, + "learning_rate": 9.098907026259759e-05, + "loss": 0.024095374345779418, + "step": 63510 + }, + { + "epoch": 9.016323633782825, + "grad_norm": 0.2933710515499115, + "learning_rate": 9.09876508161817e-05, + "loss": 0.03843706250190735, + "step": 63520 + }, + { + "epoch": 9.017743080198722, + "grad_norm": 8.106107711791992, + "learning_rate": 9.09862313697658e-05, + "loss": 0.03381499648094177, + "step": 63530 + }, + { + "epoch": 9.01916252661462, + "grad_norm": 4.045937538146973, + "learning_rate": 9.09848119233499e-05, + "loss": 0.009852905571460725, + "step": 63540 + }, + { + "epoch": 9.020581973030518, + "grad_norm": 9.243306159973145, + "learning_rate": 9.0983392476934e-05, + "loss": 0.06235023140907288, + "step": 63550 + }, + { + "epoch": 9.022001419446417, + "grad_norm": 10.42448902130127, + "learning_rate": 
9.09819730305181e-05, + "loss": 0.030120083689689638, + "step": 63560 + }, + { + "epoch": 9.023420865862313, + "grad_norm": 0.11517821252346039, + "learning_rate": 9.09805535841022e-05, + "loss": 0.022688122093677522, + "step": 63570 + }, + { + "epoch": 9.024840312278211, + "grad_norm": 11.302186012268066, + "learning_rate": 9.097913413768631e-05, + "loss": 0.0606619656085968, + "step": 63580 + }, + { + "epoch": 9.02625975869411, + "grad_norm": 5.819149971008301, + "learning_rate": 9.09777146912704e-05, + "loss": 0.020539863407611846, + "step": 63590 + }, + { + "epoch": 9.027679205110006, + "grad_norm": 7.555016994476318, + "learning_rate": 9.097629524485452e-05, + "loss": 0.05828157663345337, + "step": 63600 + }, + { + "epoch": 9.029098651525905, + "grad_norm": 4.1367411613464355, + "learning_rate": 9.097487579843861e-05, + "loss": 0.06016632318496704, + "step": 63610 + }, + { + "epoch": 9.030518097941803, + "grad_norm": 1.3763189315795898, + "learning_rate": 9.097345635202271e-05, + "loss": 0.03897145688533783, + "step": 63620 + }, + { + "epoch": 9.031937544357701, + "grad_norm": 1.6536431312561035, + "learning_rate": 9.097203690560682e-05, + "loss": 0.018550589680671692, + "step": 63630 + }, + { + "epoch": 9.033356990773598, + "grad_norm": 0.18937966227531433, + "learning_rate": 9.097061745919092e-05, + "loss": 0.048836135864257814, + "step": 63640 + }, + { + "epoch": 9.034776437189496, + "grad_norm": 0.9972341060638428, + "learning_rate": 9.096919801277503e-05, + "loss": 0.018428274989128114, + "step": 63650 + }, + { + "epoch": 9.036195883605394, + "grad_norm": 0.13892175257205963, + "learning_rate": 9.096777856635913e-05, + "loss": 0.020253479480743408, + "step": 63660 + }, + { + "epoch": 9.037615330021291, + "grad_norm": 0.30342087149620056, + "learning_rate": 9.096635911994322e-05, + "loss": 0.029193535447120667, + "step": 63670 + }, + { + "epoch": 9.03903477643719, + "grad_norm": 0.5949711203575134, + "learning_rate": 9.096493967352732e-05, + "loss": 
0.041388329863548276, + "step": 63680 + }, + { + "epoch": 9.040454222853088, + "grad_norm": 6.298411846160889, + "learning_rate": 9.096352022711143e-05, + "loss": 0.05257458686828613, + "step": 63690 + }, + { + "epoch": 9.041873669268986, + "grad_norm": 3.842451572418213, + "learning_rate": 9.096210078069553e-05, + "loss": 0.04259455502033234, + "step": 63700 + }, + { + "epoch": 9.043293115684882, + "grad_norm": 6.479927062988281, + "learning_rate": 9.096068133427964e-05, + "loss": 0.053349781036376956, + "step": 63710 + }, + { + "epoch": 9.04471256210078, + "grad_norm": 8.164941787719727, + "learning_rate": 9.095926188786374e-05, + "loss": 0.03254518806934357, + "step": 63720 + }, + { + "epoch": 9.046132008516679, + "grad_norm": 0.7975434064865112, + "learning_rate": 9.095784244144784e-05, + "loss": 0.013299444317817688, + "step": 63730 + }, + { + "epoch": 9.047551454932576, + "grad_norm": 1.7391208410263062, + "learning_rate": 9.095642299503195e-05, + "loss": 0.007834933698177338, + "step": 63740 + }, + { + "epoch": 9.048970901348474, + "grad_norm": 0.11377626657485962, + "learning_rate": 9.095500354861604e-05, + "loss": 0.09526675939559937, + "step": 63750 + }, + { + "epoch": 9.050390347764372, + "grad_norm": 8.344837188720703, + "learning_rate": 9.095358410220016e-05, + "loss": 0.031209063529968262, + "step": 63760 + }, + { + "epoch": 9.05180979418027, + "grad_norm": 2.4288384914398193, + "learning_rate": 9.095216465578424e-05, + "loss": 0.041136741638183594, + "step": 63770 + }, + { + "epoch": 9.053229240596167, + "grad_norm": 0.02463367208838463, + "learning_rate": 9.095074520936835e-05, + "loss": 0.018258750438690186, + "step": 63780 + }, + { + "epoch": 9.054648687012065, + "grad_norm": 0.7001953125, + "learning_rate": 9.094932576295245e-05, + "loss": 0.033777013421058655, + "step": 63790 + }, + { + "epoch": 9.056068133427964, + "grad_norm": 0.08243271708488464, + "learning_rate": 9.094790631653656e-05, + "loss": 0.05407797694206238, + "step": 63800 + }, + { 
+ "epoch": 9.05748757984386, + "grad_norm": 6.220699310302734, + "learning_rate": 9.094648687012066e-05, + "loss": 0.019056543707847595, + "step": 63810 + }, + { + "epoch": 9.058907026259758, + "grad_norm": 1.73167085647583, + "learning_rate": 9.094506742370475e-05, + "loss": 0.024083656072616578, + "step": 63820 + }, + { + "epoch": 9.060326472675657, + "grad_norm": 0.7688294649124146, + "learning_rate": 9.094364797728886e-05, + "loss": 0.015182797610759736, + "step": 63830 + }, + { + "epoch": 9.061745919091555, + "grad_norm": 4.387524127960205, + "learning_rate": 9.094222853087296e-05, + "loss": 0.0528663158416748, + "step": 63840 + }, + { + "epoch": 9.063165365507452, + "grad_norm": 0.5699704885482788, + "learning_rate": 9.094080908445707e-05, + "loss": 0.010815832018852233, + "step": 63850 + }, + { + "epoch": 9.06458481192335, + "grad_norm": 0.7277228236198425, + "learning_rate": 9.093938963804117e-05, + "loss": 0.004810039326548577, + "step": 63860 + }, + { + "epoch": 9.066004258339248, + "grad_norm": 0.21413087844848633, + "learning_rate": 9.093797019162527e-05, + "loss": 0.018760937452316283, + "step": 63870 + }, + { + "epoch": 9.067423704755145, + "grad_norm": 0.2676238417625427, + "learning_rate": 9.093655074520936e-05, + "loss": 0.013498370349407197, + "step": 63880 + }, + { + "epoch": 9.068843151171043, + "grad_norm": 0.12514221668243408, + "learning_rate": 9.093513129879348e-05, + "loss": 0.009908372163772583, + "step": 63890 + }, + { + "epoch": 9.070262597586941, + "grad_norm": 2.2376949787139893, + "learning_rate": 9.093371185237757e-05, + "loss": 0.01928650438785553, + "step": 63900 + }, + { + "epoch": 9.07168204400284, + "grad_norm": 1.1722604036331177, + "learning_rate": 9.093229240596168e-05, + "loss": 0.06219164133071899, + "step": 63910 + }, + { + "epoch": 9.073101490418736, + "grad_norm": 0.09166330099105835, + "learning_rate": 9.093087295954578e-05, + "loss": 0.024753783643245698, + "step": 63920 + }, + { + "epoch": 9.074520936834634, + 
"grad_norm": 0.16517707705497742, + "learning_rate": 9.092945351312988e-05, + "loss": 0.014034570753574371, + "step": 63930 + }, + { + "epoch": 9.075940383250533, + "grad_norm": 7.7434773445129395, + "learning_rate": 9.092803406671399e-05, + "loss": 0.06656568646430969, + "step": 63940 + }, + { + "epoch": 9.07735982966643, + "grad_norm": 0.7195178270339966, + "learning_rate": 9.092661462029809e-05, + "loss": 0.023263543844223022, + "step": 63950 + }, + { + "epoch": 9.078779276082328, + "grad_norm": 0.576549768447876, + "learning_rate": 9.09251951738822e-05, + "loss": 0.03049999475479126, + "step": 63960 + }, + { + "epoch": 9.080198722498226, + "grad_norm": 2.331205368041992, + "learning_rate": 9.092377572746628e-05, + "loss": 0.03987345695495605, + "step": 63970 + }, + { + "epoch": 9.081618168914124, + "grad_norm": 2.247481346130371, + "learning_rate": 9.092235628105039e-05, + "loss": 0.034203958511352536, + "step": 63980 + }, + { + "epoch": 9.08303761533002, + "grad_norm": 3.3796920776367188, + "learning_rate": 9.092093683463449e-05, + "loss": 0.028829208016395567, + "step": 63990 + }, + { + "epoch": 9.084457061745919, + "grad_norm": 0.5988370776176453, + "learning_rate": 9.09195173882186e-05, + "loss": 0.005476556345820427, + "step": 64000 + }, + { + "epoch": 9.084457061745919, + "eval_accuracy": 0.9644560310294398, + "eval_loss": 0.1253519356250763, + "eval_runtime": 33.0057, + "eval_samples_per_second": 476.494, + "eval_steps_per_second": 14.907, + "step": 64000 + }, + { + "epoch": 9.085876508161817, + "grad_norm": 7.576418399810791, + "learning_rate": 9.09180979418027e-05, + "loss": 0.06737409830093384, + "step": 64010 + }, + { + "epoch": 9.087295954577714, + "grad_norm": 1.5139234066009521, + "learning_rate": 9.091667849538681e-05, + "loss": 0.06679987907409668, + "step": 64020 + }, + { + "epoch": 9.088715400993612, + "grad_norm": 0.34518659114837646, + "learning_rate": 9.09152590489709e-05, + "loss": 0.04123524129390717, + "step": 64030 + }, + { + "epoch": 
9.09013484740951, + "grad_norm": 5.3888068199157715, + "learning_rate": 9.0913839602555e-05, + "loss": 0.05024193525314331, + "step": 64040 + }, + { + "epoch": 9.091554293825409, + "grad_norm": 1.4487290382385254, + "learning_rate": 9.091242015613911e-05, + "loss": 0.06248751878738403, + "step": 64050 + }, + { + "epoch": 9.092973740241305, + "grad_norm": 1.193361520767212, + "learning_rate": 9.091100070972321e-05, + "loss": 0.0333157479763031, + "step": 64060 + }, + { + "epoch": 9.094393186657204, + "grad_norm": 0.790325939655304, + "learning_rate": 9.090958126330732e-05, + "loss": 0.06225918531417847, + "step": 64070 + }, + { + "epoch": 9.095812633073102, + "grad_norm": 5.633969306945801, + "learning_rate": 9.09081618168914e-05, + "loss": 0.05814381837844849, + "step": 64080 + }, + { + "epoch": 9.097232079488998, + "grad_norm": 0.02727256342768669, + "learning_rate": 9.090674237047552e-05, + "loss": 0.033204466104507446, + "step": 64090 + }, + { + "epoch": 9.098651525904897, + "grad_norm": 4.997963905334473, + "learning_rate": 9.090532292405961e-05, + "loss": 0.08285855650901794, + "step": 64100 + }, + { + "epoch": 9.100070972320795, + "grad_norm": 2.4729809761047363, + "learning_rate": 9.090390347764373e-05, + "loss": 0.072034353017807, + "step": 64110 + }, + { + "epoch": 9.101490418736693, + "grad_norm": 0.2927655279636383, + "learning_rate": 9.090248403122784e-05, + "loss": 0.015156900882720948, + "step": 64120 + }, + { + "epoch": 9.10290986515259, + "grad_norm": 3.0484793186187744, + "learning_rate": 9.090106458481192e-05, + "loss": 0.032074537873268125, + "step": 64130 + }, + { + "epoch": 9.104329311568488, + "grad_norm": 0.18304117023944855, + "learning_rate": 9.089964513839603e-05, + "loss": 0.05773396492004394, + "step": 64140 + }, + { + "epoch": 9.105748757984387, + "grad_norm": 3.660144090652466, + "learning_rate": 9.089822569198013e-05, + "loss": 0.03408423960208893, + "step": 64150 + }, + { + "epoch": 9.107168204400283, + "grad_norm": 
0.703398585319519, + "learning_rate": 9.089680624556424e-05, + "loss": 0.01160280853509903, + "step": 64160 + }, + { + "epoch": 9.108587650816181, + "grad_norm": 0.30307838320732117, + "learning_rate": 9.089538679914834e-05, + "loss": 0.03360556662082672, + "step": 64170 + }, + { + "epoch": 9.11000709723208, + "grad_norm": 0.2776535451412201, + "learning_rate": 9.089396735273243e-05, + "loss": 0.09121599197387695, + "step": 64180 + }, + { + "epoch": 9.111426543647978, + "grad_norm": 0.8593045473098755, + "learning_rate": 9.089254790631653e-05, + "loss": 0.016895319521427154, + "step": 64190 + }, + { + "epoch": 9.112845990063875, + "grad_norm": 2.2342422008514404, + "learning_rate": 9.089112845990064e-05, + "loss": 0.015390795469284058, + "step": 64200 + }, + { + "epoch": 9.114265436479773, + "grad_norm": 7.935168743133545, + "learning_rate": 9.088970901348475e-05, + "loss": 0.06856619715690612, + "step": 64210 + }, + { + "epoch": 9.115684882895671, + "grad_norm": 0.09339141100645065, + "learning_rate": 9.088828956706885e-05, + "loss": 0.027166441082954407, + "step": 64220 + }, + { + "epoch": 9.117104329311568, + "grad_norm": 0.7055644989013672, + "learning_rate": 9.088687012065295e-05, + "loss": 0.02743214964866638, + "step": 64230 + }, + { + "epoch": 9.118523775727466, + "grad_norm": 8.74792766571045, + "learning_rate": 9.088545067423705e-05, + "loss": 0.058396434783935545, + "step": 64240 + }, + { + "epoch": 9.119943222143364, + "grad_norm": 0.2736349105834961, + "learning_rate": 9.088403122782116e-05, + "loss": 0.020447200536727904, + "step": 64250 + }, + { + "epoch": 9.121362668559263, + "grad_norm": 0.13781176507472992, + "learning_rate": 9.088261178140525e-05, + "loss": 0.016127771139144896, + "step": 64260 + }, + { + "epoch": 9.12278211497516, + "grad_norm": 0.014397944323718548, + "learning_rate": 9.088119233498937e-05, + "loss": 0.07276721000671386, + "step": 64270 + }, + { + "epoch": 9.124201561391057, + "grad_norm": 0.040416114032268524, + 
"learning_rate": 9.087977288857345e-05, + "loss": 0.02326855659484863, + "step": 64280 + }, + { + "epoch": 9.125621007806956, + "grad_norm": 9.205130577087402, + "learning_rate": 9.087835344215756e-05, + "loss": 0.05591330528259277, + "step": 64290 + }, + { + "epoch": 9.127040454222852, + "grad_norm": 0.20341289043426514, + "learning_rate": 9.087693399574167e-05, + "loss": 0.025824469327926636, + "step": 64300 + }, + { + "epoch": 9.12845990063875, + "grad_norm": 0.5872713923454285, + "learning_rate": 9.087551454932577e-05, + "loss": 0.04505482614040375, + "step": 64310 + }, + { + "epoch": 9.129879347054649, + "grad_norm": 4.44802188873291, + "learning_rate": 9.087409510290988e-05, + "loss": 0.013928559422492982, + "step": 64320 + }, + { + "epoch": 9.131298793470547, + "grad_norm": 4.500983238220215, + "learning_rate": 9.087267565649396e-05, + "loss": 0.040919405221939084, + "step": 64330 + }, + { + "epoch": 9.132718239886444, + "grad_norm": 1.2529829740524292, + "learning_rate": 9.087125621007807e-05, + "loss": 0.042445436120033264, + "step": 64340 + }, + { + "epoch": 9.134137686302342, + "grad_norm": 3.7027170658111572, + "learning_rate": 9.086983676366217e-05, + "loss": 0.04291227459907532, + "step": 64350 + }, + { + "epoch": 9.13555713271824, + "grad_norm": 14.58912181854248, + "learning_rate": 9.086841731724628e-05, + "loss": 0.11172311305999756, + "step": 64360 + }, + { + "epoch": 9.136976579134137, + "grad_norm": 0.06370960175991058, + "learning_rate": 9.086699787083038e-05, + "loss": 0.005412508919835091, + "step": 64370 + }, + { + "epoch": 9.138396025550035, + "grad_norm": 0.059994593262672424, + "learning_rate": 9.086557842441449e-05, + "loss": 0.050759947299957274, + "step": 64380 + }, + { + "epoch": 9.139815471965933, + "grad_norm": 0.875908374786377, + "learning_rate": 9.086415897799859e-05, + "loss": 0.018525166809558867, + "step": 64390 + }, + { + "epoch": 9.141234918381832, + "grad_norm": 4.418863296508789, + "learning_rate": 9.086273953158269e-05, + 
"loss": 0.05396139621734619, + "step": 64400 + }, + { + "epoch": 9.142654364797728, + "grad_norm": 10.711448669433594, + "learning_rate": 9.08613200851668e-05, + "loss": 0.02829013466835022, + "step": 64410 + }, + { + "epoch": 9.144073811213627, + "grad_norm": 0.09981971979141235, + "learning_rate": 9.08599006387509e-05, + "loss": 0.027979806065559387, + "step": 64420 + }, + { + "epoch": 9.145493257629525, + "grad_norm": 9.851990699768066, + "learning_rate": 9.0858481192335e-05, + "loss": 0.031139957904815673, + "step": 64430 + }, + { + "epoch": 9.146912704045421, + "grad_norm": 0.539822518825531, + "learning_rate": 9.085706174591909e-05, + "loss": 0.02237287014722824, + "step": 64440 + }, + { + "epoch": 9.14833215046132, + "grad_norm": 4.454430103302002, + "learning_rate": 9.08556422995032e-05, + "loss": 0.043205234408378604, + "step": 64450 + }, + { + "epoch": 9.149751596877218, + "grad_norm": 0.11565633863210678, + "learning_rate": 9.08542228530873e-05, + "loss": 0.02587103843688965, + "step": 64460 + }, + { + "epoch": 9.151171043293116, + "grad_norm": 0.7783809900283813, + "learning_rate": 9.085280340667141e-05, + "loss": 0.039271104335784915, + "step": 64470 + }, + { + "epoch": 9.152590489709013, + "grad_norm": 0.5557367205619812, + "learning_rate": 9.08513839602555e-05, + "loss": 0.04649159014225006, + "step": 64480 + }, + { + "epoch": 9.154009936124911, + "grad_norm": 2.1434662342071533, + "learning_rate": 9.08499645138396e-05, + "loss": 0.049646627902984616, + "step": 64490 + }, + { + "epoch": 9.15542938254081, + "grad_norm": 7.786257743835449, + "learning_rate": 9.084854506742371e-05, + "loss": 0.037093961238861085, + "step": 64500 + }, + { + "epoch": 9.15542938254081, + "eval_accuracy": 0.9621033890761111, + "eval_loss": 0.14138783514499664, + "eval_runtime": 32.1947, + "eval_samples_per_second": 488.497, + "eval_steps_per_second": 15.282, + "step": 64500 + }, + { + "epoch": 9.156848828956706, + "grad_norm": 8.986673355102539, + "learning_rate": 
9.084712562100781e-05, + "loss": 0.07170224785804749, + "step": 64510 + }, + { + "epoch": 9.158268275372604, + "grad_norm": 5.408350467681885, + "learning_rate": 9.084570617459192e-05, + "loss": 0.030388069152832032, + "step": 64520 + }, + { + "epoch": 9.159687721788503, + "grad_norm": 2.8842499256134033, + "learning_rate": 9.084428672817602e-05, + "loss": 0.015032586455345155, + "step": 64530 + }, + { + "epoch": 9.161107168204401, + "grad_norm": 2.7716171741485596, + "learning_rate": 9.084286728176012e-05, + "loss": 0.017233891785144805, + "step": 64540 + }, + { + "epoch": 9.162526614620297, + "grad_norm": 14.251026153564453, + "learning_rate": 9.084144783534421e-05, + "loss": 0.0279281884431839, + "step": 64550 + }, + { + "epoch": 9.163946061036196, + "grad_norm": 0.09909752011299133, + "learning_rate": 9.084002838892832e-05, + "loss": 0.01321396678686142, + "step": 64560 + }, + { + "epoch": 9.165365507452094, + "grad_norm": 0.048804301768541336, + "learning_rate": 9.083860894251242e-05, + "loss": 0.030893230438232423, + "step": 64570 + }, + { + "epoch": 9.16678495386799, + "grad_norm": 7.728274345397949, + "learning_rate": 9.083718949609653e-05, + "loss": 0.0831569790840149, + "step": 64580 + }, + { + "epoch": 9.168204400283889, + "grad_norm": 4.418582439422607, + "learning_rate": 9.083577004968063e-05, + "loss": 0.04628153443336487, + "step": 64590 + }, + { + "epoch": 9.169623846699787, + "grad_norm": 0.2589815855026245, + "learning_rate": 9.083435060326473e-05, + "loss": 0.014576169848442077, + "step": 64600 + }, + { + "epoch": 9.171043293115686, + "grad_norm": 0.2530888020992279, + "learning_rate": 9.083293115684884e-05, + "loss": 0.03431870639324188, + "step": 64610 + }, + { + "epoch": 9.172462739531582, + "grad_norm": 4.140875816345215, + "learning_rate": 9.083151171043294e-05, + "loss": 0.01830962300300598, + "step": 64620 + }, + { + "epoch": 9.17388218594748, + "grad_norm": 0.6446725130081177, + "learning_rate": 9.083009226401705e-05, + "loss": 
0.024257193505764007, + "step": 64630 + }, + { + "epoch": 9.175301632363379, + "grad_norm": 0.051858942955732346, + "learning_rate": 9.082867281760113e-05, + "loss": 0.021634511649608612, + "step": 64640 + }, + { + "epoch": 9.176721078779275, + "grad_norm": 0.036192528903484344, + "learning_rate": 9.082725337118524e-05, + "loss": 0.010384272038936614, + "step": 64650 + }, + { + "epoch": 9.178140525195174, + "grad_norm": 2.0581307411193848, + "learning_rate": 9.082583392476934e-05, + "loss": 0.03748018741607666, + "step": 64660 + }, + { + "epoch": 9.179559971611072, + "grad_norm": 0.1270061433315277, + "learning_rate": 9.082441447835345e-05, + "loss": 0.05351813435554505, + "step": 64670 + }, + { + "epoch": 9.18097941802697, + "grad_norm": 8.708552360534668, + "learning_rate": 9.082299503193755e-05, + "loss": 0.036863112449646, + "step": 64680 + }, + { + "epoch": 9.182398864442867, + "grad_norm": 2.0032787322998047, + "learning_rate": 9.082157558552164e-05, + "loss": 0.008637142181396485, + "step": 64690 + }, + { + "epoch": 9.183818310858765, + "grad_norm": 1.0035320520401, + "learning_rate": 9.082015613910576e-05, + "loss": 0.04162544012069702, + "step": 64700 + }, + { + "epoch": 9.185237757274663, + "grad_norm": 6.927931308746338, + "learning_rate": 9.081873669268985e-05, + "loss": 0.05242146253585815, + "step": 64710 + }, + { + "epoch": 9.18665720369056, + "grad_norm": 0.6717086434364319, + "learning_rate": 9.081731724627396e-05, + "loss": 0.023917996883392335, + "step": 64720 + }, + { + "epoch": 9.188076650106458, + "grad_norm": 0.19253717362880707, + "learning_rate": 9.081589779985806e-05, + "loss": 0.020571285486221315, + "step": 64730 + }, + { + "epoch": 9.189496096522356, + "grad_norm": 0.1123395785689354, + "learning_rate": 9.081447835344217e-05, + "loss": 0.044478365778923036, + "step": 64740 + }, + { + "epoch": 9.190915542938255, + "grad_norm": 2.836575984954834, + "learning_rate": 9.081305890702626e-05, + "loss": 0.027955496311187746, + "step": 64750 + 
}, + { + "epoch": 9.192334989354151, + "grad_norm": 2.197659492492676, + "learning_rate": 9.081163946061037e-05, + "loss": 0.04038041830062866, + "step": 64760 + }, + { + "epoch": 9.19375443577005, + "grad_norm": 13.875947952270508, + "learning_rate": 9.081022001419446e-05, + "loss": 0.03770278990268707, + "step": 64770 + }, + { + "epoch": 9.195173882185948, + "grad_norm": 4.144615173339844, + "learning_rate": 9.080880056777858e-05, + "loss": 0.04378984868526459, + "step": 64780 + }, + { + "epoch": 9.196593328601844, + "grad_norm": 8.495119094848633, + "learning_rate": 9.080738112136267e-05, + "loss": 0.042510056495666505, + "step": 64790 + }, + { + "epoch": 9.198012775017743, + "grad_norm": 3.9217870235443115, + "learning_rate": 9.080596167494677e-05, + "loss": 0.05529659986495972, + "step": 64800 + }, + { + "epoch": 9.199432221433641, + "grad_norm": 7.344886302947998, + "learning_rate": 9.080454222853088e-05, + "loss": 0.05696294903755188, + "step": 64810 + }, + { + "epoch": 9.20085166784954, + "grad_norm": 4.72236442565918, + "learning_rate": 9.080312278211498e-05, + "loss": 0.04589243531227112, + "step": 64820 + }, + { + "epoch": 9.202271114265436, + "grad_norm": 12.580382347106934, + "learning_rate": 9.080170333569909e-05, + "loss": 0.06638288497924805, + "step": 64830 + }, + { + "epoch": 9.203690560681334, + "grad_norm": 1.4525302648544312, + "learning_rate": 9.080028388928319e-05, + "loss": 0.06872016787528992, + "step": 64840 + }, + { + "epoch": 9.205110007097232, + "grad_norm": 0.4770313501358032, + "learning_rate": 9.079886444286728e-05, + "loss": 0.03467918932437897, + "step": 64850 + }, + { + "epoch": 9.206529453513129, + "grad_norm": 3.841900110244751, + "learning_rate": 9.079744499645138e-05, + "loss": 0.02755351662635803, + "step": 64860 + }, + { + "epoch": 9.207948899929027, + "grad_norm": 5.2558112144470215, + "learning_rate": 9.079602555003549e-05, + "loss": 0.018217019736766815, + "step": 64870 + }, + { + "epoch": 9.209368346344926, + 
"grad_norm": 13.552385330200195, + "learning_rate": 9.079460610361959e-05, + "loss": 0.04716223478317261, + "step": 64880 + }, + { + "epoch": 9.210787792760824, + "grad_norm": 11.82503890991211, + "learning_rate": 9.07931866572037e-05, + "loss": 0.03754045367240906, + "step": 64890 + }, + { + "epoch": 9.21220723917672, + "grad_norm": 0.2998197078704834, + "learning_rate": 9.07917672107878e-05, + "loss": 0.04432125985622406, + "step": 64900 + }, + { + "epoch": 9.213626685592619, + "grad_norm": 0.033697254955768585, + "learning_rate": 9.07903477643719e-05, + "loss": 0.017217373847961424, + "step": 64910 + }, + { + "epoch": 9.215046132008517, + "grad_norm": 0.2636561989784241, + "learning_rate": 9.0788928317956e-05, + "loss": 0.030328923463821413, + "step": 64920 + }, + { + "epoch": 9.216465578424414, + "grad_norm": 2.3679654598236084, + "learning_rate": 9.07875088715401e-05, + "loss": 0.04711674749851227, + "step": 64930 + }, + { + "epoch": 9.217885024840312, + "grad_norm": 5.013309478759766, + "learning_rate": 9.078608942512421e-05, + "loss": 0.07335137724876403, + "step": 64940 + }, + { + "epoch": 9.21930447125621, + "grad_norm": 2.416539430618286, + "learning_rate": 9.07846699787083e-05, + "loss": 0.051906025409698485, + "step": 64950 + }, + { + "epoch": 9.220723917672109, + "grad_norm": 0.818121075630188, + "learning_rate": 9.078325053229241e-05, + "loss": 0.04151787757873535, + "step": 64960 + }, + { + "epoch": 9.222143364088005, + "grad_norm": 0.08963953703641891, + "learning_rate": 9.07818310858765e-05, + "loss": 0.03505766987800598, + "step": 64970 + }, + { + "epoch": 9.223562810503903, + "grad_norm": 0.3595544695854187, + "learning_rate": 9.078041163946062e-05, + "loss": 0.041363495588302615, + "step": 64980 + }, + { + "epoch": 9.224982256919802, + "grad_norm": 0.8693178296089172, + "learning_rate": 9.077899219304472e-05, + "loss": 0.018786983191967012, + "step": 64990 + }, + { + "epoch": 9.2264017033357, + "grad_norm": 0.8593055009841919, + "learning_rate": 
9.077757274662881e-05, + "loss": 0.042645350098609924, + "step": 65000 + }, + { + "epoch": 9.2264017033357, + "eval_accuracy": 0.981814713549946, + "eval_loss": 0.056301869451999664, + "eval_runtime": 32.8156, + "eval_samples_per_second": 479.253, + "eval_steps_per_second": 14.993, + "step": 65000 + }, + { + "epoch": 9.227821149751597, + "grad_norm": 0.27893632650375366, + "learning_rate": 9.077615330021292e-05, + "loss": 0.034988516569137575, + "step": 65010 + }, + { + "epoch": 9.229240596167495, + "grad_norm": 5.20842981338501, + "learning_rate": 9.077473385379702e-05, + "loss": 0.030495092272758484, + "step": 65020 + }, + { + "epoch": 9.230660042583393, + "grad_norm": 4.230602264404297, + "learning_rate": 9.077331440738113e-05, + "loss": 0.025418007373809816, + "step": 65030 + }, + { + "epoch": 9.23207948899929, + "grad_norm": 0.7757551074028015, + "learning_rate": 9.077189496096523e-05, + "loss": 0.09136197566986085, + "step": 65040 + }, + { + "epoch": 9.233498935415188, + "grad_norm": 0.16610698401927948, + "learning_rate": 9.077047551454933e-05, + "loss": 0.04587730467319488, + "step": 65050 + }, + { + "epoch": 9.234918381831086, + "grad_norm": 0.07457069307565689, + "learning_rate": 9.076905606813342e-05, + "loss": 0.01524857133626938, + "step": 65060 + }, + { + "epoch": 9.236337828246985, + "grad_norm": 2.2226922512054443, + "learning_rate": 9.076763662171753e-05, + "loss": 0.01971132159233093, + "step": 65070 + }, + { + "epoch": 9.237757274662881, + "grad_norm": 6.34228515625, + "learning_rate": 9.076621717530163e-05, + "loss": 0.03835614919662476, + "step": 65080 + }, + { + "epoch": 9.23917672107878, + "grad_norm": 9.732237815856934, + "learning_rate": 9.076479772888574e-05, + "loss": 0.06072259545326233, + "step": 65090 + }, + { + "epoch": 9.240596167494678, + "grad_norm": 0.5269761681556702, + "learning_rate": 9.076337828246984e-05, + "loss": 0.023173244297504426, + "step": 65100 + }, + { + "epoch": 9.242015613910574, + "grad_norm": 3.5085811614990234, 
+ "learning_rate": 9.076195883605394e-05, + "loss": 0.023165860772132875, + "step": 65110 + }, + { + "epoch": 9.243435060326473, + "grad_norm": 9.514446258544922, + "learning_rate": 9.076053938963805e-05, + "loss": 0.02069098949432373, + "step": 65120 + }, + { + "epoch": 9.24485450674237, + "grad_norm": 0.27682217955589294, + "learning_rate": 9.075911994322215e-05, + "loss": 0.01605180650949478, + "step": 65130 + }, + { + "epoch": 9.24627395315827, + "grad_norm": 0.49888181686401367, + "learning_rate": 9.075770049680626e-05, + "loss": 0.05236906409263611, + "step": 65140 + }, + { + "epoch": 9.247693399574166, + "grad_norm": 6.699531555175781, + "learning_rate": 9.075628105039035e-05, + "loss": 0.03403179347515106, + "step": 65150 + }, + { + "epoch": 9.249112845990064, + "grad_norm": 4.958370208740234, + "learning_rate": 9.075486160397445e-05, + "loss": 0.0508464515209198, + "step": 65160 + }, + { + "epoch": 9.250532292405962, + "grad_norm": 3.9333159923553467, + "learning_rate": 9.075358410220015e-05, + "loss": 0.12287168502807617, + "step": 65170 + }, + { + "epoch": 9.251951738821859, + "grad_norm": 4.830854892730713, + "learning_rate": 9.075216465578425e-05, + "loss": 0.034806248545646665, + "step": 65180 + }, + { + "epoch": 9.253371185237757, + "grad_norm": 6.123739719390869, + "learning_rate": 9.075074520936834e-05, + "loss": 0.03453691303730011, + "step": 65190 + }, + { + "epoch": 9.254790631653655, + "grad_norm": 0.9723725318908691, + "learning_rate": 9.074932576295246e-05, + "loss": 0.036412373185157776, + "step": 65200 + }, + { + "epoch": 9.256210078069554, + "grad_norm": 0.0696854367852211, + "learning_rate": 9.074790631653655e-05, + "loss": 0.03546989262104035, + "step": 65210 + }, + { + "epoch": 9.25762952448545, + "grad_norm": 4.158839702606201, + "learning_rate": 9.074648687012066e-05, + "loss": 0.05229751467704773, + "step": 65220 + }, + { + "epoch": 9.259048970901349, + "grad_norm": 5.570679664611816, + "learning_rate": 9.074506742370476e-05, + 
"loss": 0.05541685223579407, + "step": 65230 + }, + { + "epoch": 9.260468417317247, + "grad_norm": 0.8031320571899414, + "learning_rate": 9.074364797728886e-05, + "loss": 0.024064990878105163, + "step": 65240 + }, + { + "epoch": 9.261887863733143, + "grad_norm": 1.7579777240753174, + "learning_rate": 9.074222853087297e-05, + "loss": 0.08455089926719665, + "step": 65250 + }, + { + "epoch": 9.263307310149042, + "grad_norm": 0.7289173007011414, + "learning_rate": 9.074080908445707e-05, + "loss": 0.022726500034332277, + "step": 65260 + }, + { + "epoch": 9.26472675656494, + "grad_norm": 5.2052459716796875, + "learning_rate": 9.073938963804118e-05, + "loss": 0.017752929031848906, + "step": 65270 + }, + { + "epoch": 9.266146202980838, + "grad_norm": 2.6849477291107178, + "learning_rate": 9.073797019162526e-05, + "loss": 0.0206844300031662, + "step": 65280 + }, + { + "epoch": 9.267565649396735, + "grad_norm": 0.08539886772632599, + "learning_rate": 9.073655074520937e-05, + "loss": 0.03450865149497986, + "step": 65290 + }, + { + "epoch": 9.268985095812633, + "grad_norm": 5.187475681304932, + "learning_rate": 9.073513129879347e-05, + "loss": 0.054791712760925294, + "step": 65300 + }, + { + "epoch": 9.270404542228531, + "grad_norm": 1.5241824388504028, + "learning_rate": 9.073371185237758e-05, + "loss": 0.06886662840843201, + "step": 65310 + }, + { + "epoch": 9.271823988644428, + "grad_norm": 9.872211456298828, + "learning_rate": 9.073229240596168e-05, + "loss": 0.03144813776016235, + "step": 65320 + }, + { + "epoch": 9.273243435060326, + "grad_norm": 3.142368793487549, + "learning_rate": 9.073087295954578e-05, + "loss": 0.010226437449455261, + "step": 65330 + }, + { + "epoch": 9.274662881476225, + "grad_norm": 1.4327269792556763, + "learning_rate": 9.072945351312989e-05, + "loss": 0.032859033346176146, + "step": 65340 + }, + { + "epoch": 9.276082327892123, + "grad_norm": 2.3611676692962646, + "learning_rate": 9.072803406671398e-05, + "loss": 0.013543438911437989, + "step": 
65350 + }, + { + "epoch": 9.27750177430802, + "grad_norm": 0.41527459025382996, + "learning_rate": 9.07266146202981e-05, + "loss": 0.02646762728691101, + "step": 65360 + }, + { + "epoch": 9.278921220723918, + "grad_norm": 0.8477177619934082, + "learning_rate": 9.072519517388219e-05, + "loss": 0.012412407249212266, + "step": 65370 + }, + { + "epoch": 9.280340667139816, + "grad_norm": 0.10813146084547043, + "learning_rate": 9.072377572746629e-05, + "loss": 0.04115345776081085, + "step": 65380 + }, + { + "epoch": 9.281760113555713, + "grad_norm": 0.7527191638946533, + "learning_rate": 9.072235628105039e-05, + "loss": 0.02868193984031677, + "step": 65390 + }, + { + "epoch": 9.283179559971611, + "grad_norm": 0.4436936378479004, + "learning_rate": 9.07209368346345e-05, + "loss": 0.009298932552337647, + "step": 65400 + }, + { + "epoch": 9.28459900638751, + "grad_norm": 1.910849690437317, + "learning_rate": 9.07195173882186e-05, + "loss": 0.024878501892089844, + "step": 65410 + }, + { + "epoch": 9.286018452803408, + "grad_norm": 0.1165904849767685, + "learning_rate": 9.07180979418027e-05, + "loss": 0.011494255065917969, + "step": 65420 + }, + { + "epoch": 9.287437899219304, + "grad_norm": 9.035189628601074, + "learning_rate": 9.07166784953868e-05, + "loss": 0.054299116134643555, + "step": 65430 + }, + { + "epoch": 9.288857345635202, + "grad_norm": 0.0229057427495718, + "learning_rate": 9.07152590489709e-05, + "loss": 0.038271555304527284, + "step": 65440 + }, + { + "epoch": 9.2902767920511, + "grad_norm": 3.428968667984009, + "learning_rate": 9.071383960255501e-05, + "loss": 0.022773563861846924, + "step": 65450 + }, + { + "epoch": 9.291696238466997, + "grad_norm": 1.1164143085479736, + "learning_rate": 9.071242015613911e-05, + "loss": 0.04025732278823853, + "step": 65460 + }, + { + "epoch": 9.293115684882896, + "grad_norm": 3.3081722259521484, + "learning_rate": 9.071100070972322e-05, + "loss": 0.05027411580085754, + "step": 65470 + }, + { + "epoch": 9.294535131298794, + 
"grad_norm": 0.058389388024806976, + "learning_rate": 9.070958126330732e-05, + "loss": 0.01988658607006073, + "step": 65480 + }, + { + "epoch": 9.295954577714692, + "grad_norm": 0.3912332355976105, + "learning_rate": 9.070816181689142e-05, + "loss": 0.03847215473651886, + "step": 65490 + }, + { + "epoch": 9.297374024130589, + "grad_norm": 3.076023578643799, + "learning_rate": 9.070674237047551e-05, + "loss": 0.036669176816940305, + "step": 65500 + }, + { + "epoch": 9.297374024130589, + "eval_accuracy": 0.9775545240668914, + "eval_loss": 0.07455466687679291, + "eval_runtime": 33.3145, + "eval_samples_per_second": 472.077, + "eval_steps_per_second": 14.768, + "step": 65500 + }, + { + "epoch": 9.298793470546487, + "grad_norm": 6.249359607696533, + "learning_rate": 9.070532292405962e-05, + "loss": 0.05166963934898376, + "step": 65510 + }, + { + "epoch": 9.300212916962385, + "grad_norm": 3.9852261543273926, + "learning_rate": 9.070390347764372e-05, + "loss": 0.016864025592803956, + "step": 65520 + }, + { + "epoch": 9.301632363378282, + "grad_norm": 8.56318473815918, + "learning_rate": 9.070248403122783e-05, + "loss": 0.060342812538146974, + "step": 65530 + }, + { + "epoch": 9.30305180979418, + "grad_norm": 0.9962877035140991, + "learning_rate": 9.070106458481193e-05, + "loss": 0.030284777283668518, + "step": 65540 + }, + { + "epoch": 9.304471256210078, + "grad_norm": 0.08095641434192657, + "learning_rate": 9.069964513839603e-05, + "loss": 0.011772031337022782, + "step": 65550 + }, + { + "epoch": 9.305890702625977, + "grad_norm": 3.3373711109161377, + "learning_rate": 9.069822569198014e-05, + "loss": 0.026495721936225892, + "step": 65560 + }, + { + "epoch": 9.307310149041873, + "grad_norm": 8.643132209777832, + "learning_rate": 9.069680624556424e-05, + "loss": 0.060178011655807495, + "step": 65570 + }, + { + "epoch": 9.308729595457772, + "grad_norm": 1.415281891822815, + "learning_rate": 9.069538679914835e-05, + "loss": 0.013044501841068267, + "step": 65580 + }, + { + 
"epoch": 9.31014904187367, + "grad_norm": 5.128058433532715, + "learning_rate": 9.069396735273243e-05, + "loss": 0.033860421180725096, + "step": 65590 + }, + { + "epoch": 9.311568488289566, + "grad_norm": 0.04642212390899658, + "learning_rate": 9.069254790631654e-05, + "loss": 0.041500359773635864, + "step": 65600 + }, + { + "epoch": 9.312987934705465, + "grad_norm": 0.04170290380716324, + "learning_rate": 9.069112845990064e-05, + "loss": 0.0672379732131958, + "step": 65610 + }, + { + "epoch": 9.314407381121363, + "grad_norm": 0.4078945517539978, + "learning_rate": 9.068970901348475e-05, + "loss": 0.018945935368537902, + "step": 65620 + }, + { + "epoch": 9.315826827537261, + "grad_norm": 6.94992733001709, + "learning_rate": 9.068828956706885e-05, + "loss": 0.04940144419670105, + "step": 65630 + }, + { + "epoch": 9.317246273953158, + "grad_norm": 0.44048407673835754, + "learning_rate": 9.068687012065294e-05, + "loss": 0.03995745182037354, + "step": 65640 + }, + { + "epoch": 9.318665720369056, + "grad_norm": 4.295497894287109, + "learning_rate": 9.068545067423705e-05, + "loss": 0.02688758969306946, + "step": 65650 + }, + { + "epoch": 9.320085166784954, + "grad_norm": 5.7648396492004395, + "learning_rate": 9.068403122782115e-05, + "loss": 0.013780666887760163, + "step": 65660 + }, + { + "epoch": 9.321504613200851, + "grad_norm": 0.3595585227012634, + "learning_rate": 9.068261178140526e-05, + "loss": 0.0018398284912109375, + "step": 65670 + }, + { + "epoch": 9.32292405961675, + "grad_norm": 0.18886813521385193, + "learning_rate": 9.068119233498936e-05, + "loss": 0.014816860854625701, + "step": 65680 + }, + { + "epoch": 9.324343506032648, + "grad_norm": 1.3162140846252441, + "learning_rate": 9.067977288857346e-05, + "loss": 0.019825367629528044, + "step": 65690 + }, + { + "epoch": 9.325762952448546, + "grad_norm": 1.3076903820037842, + "learning_rate": 9.067835344215756e-05, + "loss": 0.008933990448713302, + "step": 65700 + }, + { + "epoch": 9.327182398864442, + 
"grad_norm": 7.359697341918945, + "learning_rate": 9.067693399574167e-05, + "loss": 0.04452669322490692, + "step": 65710 + }, + { + "epoch": 9.32860184528034, + "grad_norm": 0.31702089309692383, + "learning_rate": 9.067551454932576e-05, + "loss": 0.05820360779762268, + "step": 65720 + }, + { + "epoch": 9.330021291696239, + "grad_norm": 0.04736144468188286, + "learning_rate": 9.067409510290987e-05, + "loss": 0.055652981996536253, + "step": 65730 + }, + { + "epoch": 9.331440738112136, + "grad_norm": 0.2538986802101135, + "learning_rate": 9.067267565649397e-05, + "loss": 0.020092563331127168, + "step": 65740 + }, + { + "epoch": 9.332860184528034, + "grad_norm": 10.348010063171387, + "learning_rate": 9.067125621007807e-05, + "loss": 0.05498163104057312, + "step": 65750 + }, + { + "epoch": 9.334279630943932, + "grad_norm": 10.26076889038086, + "learning_rate": 9.066983676366218e-05, + "loss": 0.06374727487564087, + "step": 65760 + }, + { + "epoch": 9.33569907735983, + "grad_norm": 0.08166330307722092, + "learning_rate": 9.066841731724628e-05, + "loss": 0.03318239748477936, + "step": 65770 + }, + { + "epoch": 9.337118523775727, + "grad_norm": 2.647860288619995, + "learning_rate": 9.066699787083039e-05, + "loss": 0.03421743810176849, + "step": 65780 + }, + { + "epoch": 9.338537970191625, + "grad_norm": 0.08706337213516235, + "learning_rate": 9.066557842441449e-05, + "loss": 0.02070632576942444, + "step": 65790 + }, + { + "epoch": 9.339957416607524, + "grad_norm": 1.6088865995407104, + "learning_rate": 9.066415897799858e-05, + "loss": 0.0166518896818161, + "step": 65800 + }, + { + "epoch": 9.34137686302342, + "grad_norm": 0.1422835737466812, + "learning_rate": 9.066273953158268e-05, + "loss": 0.05076624751091004, + "step": 65810 + }, + { + "epoch": 9.342796309439318, + "grad_norm": 0.6648264527320862, + "learning_rate": 9.066132008516679e-05, + "loss": 0.023625385761260987, + "step": 65820 + }, + { + "epoch": 9.344215755855217, + "grad_norm": 3.993276357650757, + 
"learning_rate": 9.065990063875089e-05, + "loss": 0.01901327967643738, + "step": 65830 + }, + { + "epoch": 9.345635202271115, + "grad_norm": 5.106206893920898, + "learning_rate": 9.0658481192335e-05, + "loss": 0.03103363811969757, + "step": 65840 + }, + { + "epoch": 9.347054648687012, + "grad_norm": 0.7105145454406738, + "learning_rate": 9.06570617459191e-05, + "loss": 0.04525960385799408, + "step": 65850 + }, + { + "epoch": 9.34847409510291, + "grad_norm": 2.2576792240142822, + "learning_rate": 9.06556422995032e-05, + "loss": 0.06363716125488281, + "step": 65860 + }, + { + "epoch": 9.349893541518808, + "grad_norm": 0.34290191531181335, + "learning_rate": 9.06542228530873e-05, + "loss": 0.0342064768075943, + "step": 65870 + }, + { + "epoch": 9.351312987934705, + "grad_norm": 6.060485363006592, + "learning_rate": 9.06528034066714e-05, + "loss": 0.020967322587966918, + "step": 65880 + }, + { + "epoch": 9.352732434350603, + "grad_norm": 2.8930869102478027, + "learning_rate": 9.065138396025551e-05, + "loss": 0.053651803731918336, + "step": 65890 + }, + { + "epoch": 9.354151880766501, + "grad_norm": 5.113282680511475, + "learning_rate": 9.06499645138396e-05, + "loss": 0.04868954718112946, + "step": 65900 + }, + { + "epoch": 9.3555713271824, + "grad_norm": 9.573057174682617, + "learning_rate": 9.064854506742371e-05, + "loss": 0.054132658243179324, + "step": 65910 + }, + { + "epoch": 9.356990773598296, + "grad_norm": 2.009981870651245, + "learning_rate": 9.06471256210078e-05, + "loss": 0.036932623386383055, + "step": 65920 + }, + { + "epoch": 9.358410220014195, + "grad_norm": 0.15597857534885406, + "learning_rate": 9.064570617459192e-05, + "loss": 0.014807553589344024, + "step": 65930 + }, + { + "epoch": 9.359829666430093, + "grad_norm": 0.6595566272735596, + "learning_rate": 9.064428672817601e-05, + "loss": 0.02623018026351929, + "step": 65940 + }, + { + "epoch": 9.36124911284599, + "grad_norm": 1.553436517715454, + "learning_rate": 9.064286728176011e-05, + "loss": 
0.02999439835548401, + "step": 65950 + }, + { + "epoch": 9.362668559261888, + "grad_norm": 7.259316444396973, + "learning_rate": 9.064144783534422e-05, + "loss": 0.031955486536026, + "step": 65960 + }, + { + "epoch": 9.364088005677786, + "grad_norm": 0.7037152647972107, + "learning_rate": 9.064002838892832e-05, + "loss": 0.0743894875049591, + "step": 65970 + }, + { + "epoch": 9.365507452093684, + "grad_norm": 2.922961473464966, + "learning_rate": 9.063860894251243e-05, + "loss": 0.05946826934814453, + "step": 65980 + }, + { + "epoch": 9.36692689850958, + "grad_norm": 6.301733016967773, + "learning_rate": 9.063718949609653e-05, + "loss": 0.026286065578460693, + "step": 65990 + }, + { + "epoch": 9.36834634492548, + "grad_norm": 12.488518714904785, + "learning_rate": 9.063577004968063e-05, + "loss": 0.05177741050720215, + "step": 66000 + }, + { + "epoch": 9.36834634492548, + "eval_accuracy": 0.9830864119030965, + "eval_loss": 0.05414308235049248, + "eval_runtime": 32.7903, + "eval_samples_per_second": 479.623, + "eval_steps_per_second": 15.004, + "step": 66000 + }, + { + "epoch": 9.369765791341377, + "grad_norm": 5.741583824157715, + "learning_rate": 9.063435060326472e-05, + "loss": 0.022419145703315733, + "step": 66010 + }, + { + "epoch": 9.371185237757274, + "grad_norm": 1.4365832805633545, + "learning_rate": 9.063293115684883e-05, + "loss": 0.03936209380626678, + "step": 66020 + }, + { + "epoch": 9.372604684173172, + "grad_norm": 3.806220531463623, + "learning_rate": 9.063151171043293e-05, + "loss": 0.040252089500427246, + "step": 66030 + }, + { + "epoch": 9.37402413058907, + "grad_norm": 0.21159783005714417, + "learning_rate": 9.063009226401704e-05, + "loss": 0.043931758403778075, + "step": 66040 + }, + { + "epoch": 9.375443577004969, + "grad_norm": 0.23096901178359985, + "learning_rate": 9.062867281760114e-05, + "loss": 0.04873138964176178, + "step": 66050 + }, + { + "epoch": 9.376863023420865, + "grad_norm": 0.8391821384429932, + "learning_rate": 
9.062725337118524e-05, + "loss": 0.026393789052963256, + "step": 66060 + }, + { + "epoch": 9.378282469836764, + "grad_norm": 0.03785236179828644, + "learning_rate": 9.062583392476935e-05, + "loss": 0.036301881074905396, + "step": 66070 + }, + { + "epoch": 9.379701916252662, + "grad_norm": 0.046141933649778366, + "learning_rate": 9.062441447835345e-05, + "loss": 0.06375334262847901, + "step": 66080 + }, + { + "epoch": 9.381121362668559, + "grad_norm": 6.387070178985596, + "learning_rate": 9.062299503193756e-05, + "loss": 0.03702278733253479, + "step": 66090 + }, + { + "epoch": 9.382540809084457, + "grad_norm": 0.19831642508506775, + "learning_rate": 9.062157558552165e-05, + "loss": 0.017668356001377106, + "step": 66100 + }, + { + "epoch": 9.383960255500355, + "grad_norm": 4.372278213500977, + "learning_rate": 9.062015613910575e-05, + "loss": 0.025728854537010192, + "step": 66110 + }, + { + "epoch": 9.385379701916253, + "grad_norm": 9.222113609313965, + "learning_rate": 9.061873669268985e-05, + "loss": 0.04258951544761658, + "step": 66120 + }, + { + "epoch": 9.38679914833215, + "grad_norm": 8.19491958618164, + "learning_rate": 9.061731724627396e-05, + "loss": 0.02233922928571701, + "step": 66130 + }, + { + "epoch": 9.388218594748048, + "grad_norm": 4.794766902923584, + "learning_rate": 9.061589779985806e-05, + "loss": 0.06196138858795166, + "step": 66140 + }, + { + "epoch": 9.389638041163947, + "grad_norm": 3.5216760635375977, + "learning_rate": 9.061447835344217e-05, + "loss": 0.010813835263252258, + "step": 66150 + }, + { + "epoch": 9.391057487579843, + "grad_norm": 0.24806861579418182, + "learning_rate": 9.061305890702626e-05, + "loss": 0.020065225660800934, + "step": 66160 + }, + { + "epoch": 9.392476933995741, + "grad_norm": 0.3911595344543457, + "learning_rate": 9.061163946061036e-05, + "loss": 0.0663827896118164, + "step": 66170 + }, + { + "epoch": 9.39389638041164, + "grad_norm": 1.4187536239624023, + "learning_rate": 9.061022001419447e-05, + "loss": 
0.010493065416812896, + "step": 66180 + }, + { + "epoch": 9.395315826827538, + "grad_norm": 0.6327175498008728, + "learning_rate": 9.060880056777857e-05, + "loss": 0.026095515489578246, + "step": 66190 + }, + { + "epoch": 9.396735273243435, + "grad_norm": 4.311141490936279, + "learning_rate": 9.060738112136268e-05, + "loss": 0.022125279903411864, + "step": 66200 + }, + { + "epoch": 9.398154719659333, + "grad_norm": 0.7823535203933716, + "learning_rate": 9.060596167494677e-05, + "loss": 0.08423017859458923, + "step": 66210 + }, + { + "epoch": 9.399574166075231, + "grad_norm": 2.879866361618042, + "learning_rate": 9.060454222853088e-05, + "loss": 0.055038821697235105, + "step": 66220 + }, + { + "epoch": 9.400993612491128, + "grad_norm": 0.12221132963895798, + "learning_rate": 9.060312278211497e-05, + "loss": 0.05022150278091431, + "step": 66230 + }, + { + "epoch": 9.402413058907026, + "grad_norm": 5.080928325653076, + "learning_rate": 9.060170333569908e-05, + "loss": 0.05644909143447876, + "step": 66240 + }, + { + "epoch": 9.403832505322924, + "grad_norm": 1.559996485710144, + "learning_rate": 9.060028388928318e-05, + "loss": 0.052948832511901855, + "step": 66250 + }, + { + "epoch": 9.405251951738823, + "grad_norm": 0.12736868858337402, + "learning_rate": 9.059886444286728e-05, + "loss": 0.009814755618572235, + "step": 66260 + }, + { + "epoch": 9.40667139815472, + "grad_norm": 0.41833576560020447, + "learning_rate": 9.059744499645139e-05, + "loss": 0.06674405336380004, + "step": 66270 + }, + { + "epoch": 9.408090844570618, + "grad_norm": 0.0420779325067997, + "learning_rate": 9.059602555003549e-05, + "loss": 0.010850544273853301, + "step": 66280 + }, + { + "epoch": 9.409510290986516, + "grad_norm": 7.900463104248047, + "learning_rate": 9.05946061036196e-05, + "loss": 0.026490825414657592, + "step": 66290 + }, + { + "epoch": 9.410929737402412, + "grad_norm": 2.7261571884155273, + "learning_rate": 9.05931866572037e-05, + "loss": 0.039139583706855774, + "step": 66300 + 
}, + { + "epoch": 9.41234918381831, + "grad_norm": 1.2342069149017334, + "learning_rate": 9.059176721078779e-05, + "loss": 0.06747770309448242, + "step": 66310 + }, + { + "epoch": 9.413768630234209, + "grad_norm": 2.4731056690216064, + "learning_rate": 9.059034776437189e-05, + "loss": 0.10458157062530518, + "step": 66320 + }, + { + "epoch": 9.415188076650107, + "grad_norm": 8.4277982711792, + "learning_rate": 9.0588928317956e-05, + "loss": 0.09690378308296203, + "step": 66330 + }, + { + "epoch": 9.416607523066004, + "grad_norm": 5.209705829620361, + "learning_rate": 9.05875088715401e-05, + "loss": 0.03505201935768128, + "step": 66340 + }, + { + "epoch": 9.418026969481902, + "grad_norm": 7.857044219970703, + "learning_rate": 9.058608942512421e-05, + "loss": 0.060685038566589355, + "step": 66350 + }, + { + "epoch": 9.4194464158978, + "grad_norm": 0.43560779094696045, + "learning_rate": 9.058466997870831e-05, + "loss": 0.058388322591781616, + "step": 66360 + }, + { + "epoch": 9.420865862313697, + "grad_norm": 0.05114549398422241, + "learning_rate": 9.05832505322924e-05, + "loss": 0.019231802225112914, + "step": 66370 + }, + { + "epoch": 9.422285308729595, + "grad_norm": 9.367727279663086, + "learning_rate": 9.058183108587652e-05, + "loss": 0.038724538683891294, + "step": 66380 + }, + { + "epoch": 9.423704755145494, + "grad_norm": 9.952641487121582, + "learning_rate": 9.058041163946061e-05, + "loss": 0.03757392466068268, + "step": 66390 + }, + { + "epoch": 9.425124201561392, + "grad_norm": 0.6738945245742798, + "learning_rate": 9.057899219304472e-05, + "loss": 0.04065401554107666, + "step": 66400 + }, + { + "epoch": 9.426543647977288, + "grad_norm": 7.547946453094482, + "learning_rate": 9.057757274662881e-05, + "loss": 0.052340525388717654, + "step": 66410 + }, + { + "epoch": 9.427963094393187, + "grad_norm": 3.0092594623565674, + "learning_rate": 9.057615330021292e-05, + "loss": 0.012151481211185455, + "step": 66420 + }, + { + "epoch": 9.429382540809085, + 
"grad_norm": 1.3092892169952393, + "learning_rate": 9.057473385379702e-05, + "loss": 0.011581452190876007, + "step": 66430 + }, + { + "epoch": 9.430801987224982, + "grad_norm": 3.9269165992736816, + "learning_rate": 9.057331440738113e-05, + "loss": 0.03567465841770172, + "step": 66440 + }, + { + "epoch": 9.43222143364088, + "grad_norm": 6.909012794494629, + "learning_rate": 9.057189496096524e-05, + "loss": 0.008584094047546387, + "step": 66450 + }, + { + "epoch": 9.433640880056778, + "grad_norm": 8.529704093933105, + "learning_rate": 9.057047551454934e-05, + "loss": 0.08972499370574952, + "step": 66460 + }, + { + "epoch": 9.435060326472676, + "grad_norm": 0.0556497648358345, + "learning_rate": 9.056905606813343e-05, + "loss": 0.04341354668140411, + "step": 66470 + }, + { + "epoch": 9.436479772888573, + "grad_norm": 0.9770888686180115, + "learning_rate": 9.056763662171753e-05, + "loss": 0.010035674273967742, + "step": 66480 + }, + { + "epoch": 9.437899219304471, + "grad_norm": 1.1458122730255127, + "learning_rate": 9.056621717530164e-05, + "loss": 0.014818742871284485, + "step": 66490 + }, + { + "epoch": 9.43931866572037, + "grad_norm": 0.08403871953487396, + "learning_rate": 9.056479772888574e-05, + "loss": 0.023149700462818147, + "step": 66500 + }, + { + "epoch": 9.43931866572037, + "eval_accuracy": 0.9823233928912062, + "eval_loss": 0.06090559810400009, + "eval_runtime": 32.9512, + "eval_samples_per_second": 477.282, + "eval_steps_per_second": 14.931, + "step": 66500 + }, + { + "epoch": 9.440738112136266, + "grad_norm": 0.5672199726104736, + "learning_rate": 9.056337828246985e-05, + "loss": 0.005511279031634331, + "step": 66510 + }, + { + "epoch": 9.442157558552164, + "grad_norm": 0.029063401743769646, + "learning_rate": 9.056195883605393e-05, + "loss": 0.018873117864131927, + "step": 66520 + }, + { + "epoch": 9.443577004968063, + "grad_norm": 16.47539520263672, + "learning_rate": 9.056053938963804e-05, + "loss": 0.032866987586021426, + "step": 66530 + }, + { + 
"epoch": 9.444996451383961, + "grad_norm": 5.749122142791748, + "learning_rate": 9.055911994322215e-05, + "loss": 0.0254135400056839, + "step": 66540 + }, + { + "epoch": 9.446415897799858, + "grad_norm": 11.743706703186035, + "learning_rate": 9.055770049680625e-05, + "loss": 0.06181545257568359, + "step": 66550 + }, + { + "epoch": 9.447835344215756, + "grad_norm": 4.427557945251465, + "learning_rate": 9.055628105039036e-05, + "loss": 0.04300175309181213, + "step": 66560 + }, + { + "epoch": 9.449254790631654, + "grad_norm": 0.4895526170730591, + "learning_rate": 9.055486160397445e-05, + "loss": 0.006782171130180359, + "step": 66570 + }, + { + "epoch": 9.45067423704755, + "grad_norm": 6.674429893493652, + "learning_rate": 9.055344215755856e-05, + "loss": 0.028919672966003417, + "step": 66580 + }, + { + "epoch": 9.452093683463449, + "grad_norm": 2.419039487838745, + "learning_rate": 9.055202271114266e-05, + "loss": 0.07479876279830933, + "step": 66590 + }, + { + "epoch": 9.453513129879347, + "grad_norm": 0.10058147460222244, + "learning_rate": 9.055060326472677e-05, + "loss": 0.020939281582832335, + "step": 66600 + }, + { + "epoch": 9.454932576295246, + "grad_norm": 0.43330129981040955, + "learning_rate": 9.054918381831086e-05, + "loss": 0.027914851903915405, + "step": 66610 + }, + { + "epoch": 9.456352022711142, + "grad_norm": 3.174302339553833, + "learning_rate": 9.054776437189496e-05, + "loss": 0.018397895991802214, + "step": 66620 + }, + { + "epoch": 9.45777146912704, + "grad_norm": 0.3704909682273865, + "learning_rate": 9.054634492547907e-05, + "loss": 0.012019617855548859, + "step": 66630 + }, + { + "epoch": 9.459190915542939, + "grad_norm": 3.7187345027923584, + "learning_rate": 9.054492547906317e-05, + "loss": 0.016837552189826965, + "step": 66640 + }, + { + "epoch": 9.460610361958835, + "grad_norm": 2.88364315032959, + "learning_rate": 9.054350603264728e-05, + "loss": 0.0451316237449646, + "step": 66650 + }, + { + "epoch": 9.462029808374734, + "grad_norm": 
0.3306542634963989, + "learning_rate": 9.054208658623138e-05, + "loss": 0.09204464554786682, + "step": 66660 + }, + { + "epoch": 9.463449254790632, + "grad_norm": 3.2169761657714844, + "learning_rate": 9.054066713981547e-05, + "loss": 0.018502812087535857, + "step": 66670 + }, + { + "epoch": 9.46486870120653, + "grad_norm": 0.5467488765716553, + "learning_rate": 9.053924769339957e-05, + "loss": 0.01434284746646881, + "step": 66680 + }, + { + "epoch": 9.466288147622427, + "grad_norm": 4.416463851928711, + "learning_rate": 9.053782824698368e-05, + "loss": 0.03661760985851288, + "step": 66690 + }, + { + "epoch": 9.467707594038325, + "grad_norm": 0.4002857506275177, + "learning_rate": 9.053640880056778e-05, + "loss": 0.10666844844818116, + "step": 66700 + }, + { + "epoch": 9.469127040454223, + "grad_norm": 1.848854422569275, + "learning_rate": 9.053498935415189e-05, + "loss": 0.036550584435462954, + "step": 66710 + }, + { + "epoch": 9.47054648687012, + "grad_norm": 8.114850997924805, + "learning_rate": 9.053356990773599e-05, + "loss": 0.0459254264831543, + "step": 66720 + }, + { + "epoch": 9.471965933286018, + "grad_norm": 2.8109030723571777, + "learning_rate": 9.053215046132009e-05, + "loss": 0.013741156458854676, + "step": 66730 + }, + { + "epoch": 9.473385379701917, + "grad_norm": 0.42820295691490173, + "learning_rate": 9.05307310149042e-05, + "loss": 0.035875517129898074, + "step": 66740 + }, + { + "epoch": 9.474804826117815, + "grad_norm": 1.107942819595337, + "learning_rate": 9.05293115684883e-05, + "loss": 0.018605363368988038, + "step": 66750 + }, + { + "epoch": 9.476224272533711, + "grad_norm": 0.3636723458766937, + "learning_rate": 9.05278921220724e-05, + "loss": 0.09011185765266419, + "step": 66760 + }, + { + "epoch": 9.47764371894961, + "grad_norm": 4.078773021697998, + "learning_rate": 9.052647267565649e-05, + "loss": 0.06534717082977295, + "step": 66770 + }, + { + "epoch": 9.479063165365508, + "grad_norm": 6.753770351409912, + "learning_rate": 
9.05250532292406e-05, + "loss": 0.05468626618385315, + "step": 66780 + }, + { + "epoch": 9.480482611781405, + "grad_norm": 0.03908902779221535, + "learning_rate": 9.05236337828247e-05, + "loss": 0.030472838878631593, + "step": 66790 + }, + { + "epoch": 9.481902058197303, + "grad_norm": 0.09945162385702133, + "learning_rate": 9.052221433640881e-05, + "loss": 0.019605173170566557, + "step": 66800 + }, + { + "epoch": 9.483321504613201, + "grad_norm": 2.562227964401245, + "learning_rate": 9.05207948899929e-05, + "loss": 0.04079928696155548, + "step": 66810 + }, + { + "epoch": 9.4847409510291, + "grad_norm": 6.359819412231445, + "learning_rate": 9.051937544357702e-05, + "loss": 0.03491916060447693, + "step": 66820 + }, + { + "epoch": 9.486160397444996, + "grad_norm": 1.103685975074768, + "learning_rate": 9.051795599716111e-05, + "loss": 0.045868688821792604, + "step": 66830 + }, + { + "epoch": 9.487579843860894, + "grad_norm": 3.343043088912964, + "learning_rate": 9.051653655074521e-05, + "loss": 0.060406225919723514, + "step": 66840 + }, + { + "epoch": 9.488999290276793, + "grad_norm": 7.082602500915527, + "learning_rate": 9.051511710432932e-05, + "loss": 0.05447771549224854, + "step": 66850 + }, + { + "epoch": 9.490418736692689, + "grad_norm": 0.6787410378456116, + "learning_rate": 9.051369765791342e-05, + "loss": 0.012434682250022889, + "step": 66860 + }, + { + "epoch": 9.491838183108587, + "grad_norm": 6.561841011047363, + "learning_rate": 9.051227821149753e-05, + "loss": 0.016962145268917084, + "step": 66870 + }, + { + "epoch": 9.493257629524486, + "grad_norm": 3.4125189781188965, + "learning_rate": 9.051085876508161e-05, + "loss": 0.035667648911476134, + "step": 66880 + }, + { + "epoch": 9.494677075940384, + "grad_norm": 1.578565001487732, + "learning_rate": 9.050943931866573e-05, + "loss": 0.023421129584312438, + "step": 66890 + }, + { + "epoch": 9.49609652235628, + "grad_norm": 0.06974627077579498, + "learning_rate": 9.050801987224982e-05, + "loss": 
0.0638196587562561, + "step": 66900 + }, + { + "epoch": 9.497515968772179, + "grad_norm": 0.9934460520744324, + "learning_rate": 9.050660042583393e-05, + "loss": 0.03134672641754151, + "step": 66910 + }, + { + "epoch": 9.498935415188077, + "grad_norm": 6.786892890930176, + "learning_rate": 9.050518097941803e-05, + "loss": 0.04574669897556305, + "step": 66920 + }, + { + "epoch": 9.500354861603974, + "grad_norm": 0.9338919520378113, + "learning_rate": 9.050376153300213e-05, + "loss": 0.035261183977127075, + "step": 66930 + }, + { + "epoch": 9.501774308019872, + "grad_norm": 3.783557653427124, + "learning_rate": 9.050234208658624e-05, + "loss": 0.03577567338943481, + "step": 66940 + }, + { + "epoch": 9.50319375443577, + "grad_norm": 0.2062395215034485, + "learning_rate": 9.050092264017034e-05, + "loss": 0.05642724633216858, + "step": 66950 + }, + { + "epoch": 9.504613200851669, + "grad_norm": 0.6609073281288147, + "learning_rate": 9.049950319375445e-05, + "loss": 0.035305237770080565, + "step": 66960 + }, + { + "epoch": 9.506032647267565, + "grad_norm": 0.12313273549079895, + "learning_rate": 9.049808374733855e-05, + "loss": 0.020653310418128967, + "step": 66970 + }, + { + "epoch": 9.507452093683463, + "grad_norm": 5.6952996253967285, + "learning_rate": 9.049666430092264e-05, + "loss": 0.027990663051605226, + "step": 66980 + }, + { + "epoch": 9.508871540099362, + "grad_norm": 5.661862850189209, + "learning_rate": 9.049524485450674e-05, + "loss": 0.014997878670692444, + "step": 66990 + }, + { + "epoch": 9.510290986515258, + "grad_norm": 2.4455645084381104, + "learning_rate": 9.049382540809085e-05, + "loss": 0.005252880230545998, + "step": 67000 + }, + { + "epoch": 9.510290986515258, + "eval_accuracy": 0.9795892414319324, + "eval_loss": 0.07509937882423401, + "eval_runtime": 32.595, + "eval_samples_per_second": 482.498, + "eval_steps_per_second": 15.094, + "step": 67000 + }, + { + "epoch": 9.511710432931157, + "grad_norm": 0.44350701570510864, + "learning_rate": 
9.049240596167495e-05, + "loss": 0.07043783068656921, + "step": 67010 + }, + { + "epoch": 9.513129879347055, + "grad_norm": 0.09790097177028656, + "learning_rate": 9.049098651525906e-05, + "loss": 0.03614462614059448, + "step": 67020 + }, + { + "epoch": 9.514549325762953, + "grad_norm": 4.326782703399658, + "learning_rate": 9.048956706884316e-05, + "loss": 0.055505746603012086, + "step": 67030 + }, + { + "epoch": 9.51596877217885, + "grad_norm": 1.6764984130859375, + "learning_rate": 9.048814762242725e-05, + "loss": 0.04317740499973297, + "step": 67040 + }, + { + "epoch": 9.517388218594748, + "grad_norm": 11.991434097290039, + "learning_rate": 9.048672817601136e-05, + "loss": 0.033396673202514646, + "step": 67050 + }, + { + "epoch": 9.518807665010646, + "grad_norm": 0.31179875135421753, + "learning_rate": 9.048530872959546e-05, + "loss": 0.007232289761304855, + "step": 67060 + }, + { + "epoch": 9.520227111426543, + "grad_norm": 2.5120632648468018, + "learning_rate": 9.048388928317957e-05, + "loss": 0.07684400081634521, + "step": 67070 + }, + { + "epoch": 9.521646557842441, + "grad_norm": 1.4562314748764038, + "learning_rate": 9.048246983676366e-05, + "loss": 0.026847514510154723, + "step": 67080 + }, + { + "epoch": 9.52306600425834, + "grad_norm": 1.2154396772384644, + "learning_rate": 9.048105039034777e-05, + "loss": 0.015740375220775604, + "step": 67090 + }, + { + "epoch": 9.524485450674238, + "grad_norm": 1.455774188041687, + "learning_rate": 9.047963094393187e-05, + "loss": 0.01216294914484024, + "step": 67100 + }, + { + "epoch": 9.525904897090134, + "grad_norm": 4.984959125518799, + "learning_rate": 9.047821149751598e-05, + "loss": 0.06393689513206482, + "step": 67110 + }, + { + "epoch": 9.527324343506033, + "grad_norm": 1.871148705482483, + "learning_rate": 9.047679205110007e-05, + "loss": 0.02973054051399231, + "step": 67120 + }, + { + "epoch": 9.528743789921931, + "grad_norm": 6.94720983505249, + "learning_rate": 9.047537260468417e-05, + "loss": 
0.03340931236743927, + "step": 67130 + }, + { + "epoch": 9.530163236337827, + "grad_norm": 8.067058563232422, + "learning_rate": 9.047395315826828e-05, + "loss": 0.027840656042099, + "step": 67140 + }, + { + "epoch": 9.531582682753726, + "grad_norm": 0.03339483216404915, + "learning_rate": 9.047253371185238e-05, + "loss": 0.03909151256084442, + "step": 67150 + }, + { + "epoch": 9.533002129169624, + "grad_norm": 0.03475292772054672, + "learning_rate": 9.047111426543649e-05, + "loss": 0.017502933740615845, + "step": 67160 + }, + { + "epoch": 9.534421575585522, + "grad_norm": 8.537821769714355, + "learning_rate": 9.046983676366218e-05, + "loss": 0.0501953661441803, + "step": 67170 + }, + { + "epoch": 9.535841022001419, + "grad_norm": 8.778392791748047, + "learning_rate": 9.046841731724627e-05, + "loss": 0.0493028998374939, + "step": 67180 + }, + { + "epoch": 9.537260468417317, + "grad_norm": 0.6387729644775391, + "learning_rate": 9.046699787083038e-05, + "loss": 0.021017967164516448, + "step": 67190 + }, + { + "epoch": 9.538679914833216, + "grad_norm": 0.1580241322517395, + "learning_rate": 9.04655784244145e-05, + "loss": 0.0033407047390937804, + "step": 67200 + }, + { + "epoch": 9.540099361249112, + "grad_norm": 10.30275821685791, + "learning_rate": 9.046415897799858e-05, + "loss": 0.030325162410736083, + "step": 67210 + }, + { + "epoch": 9.54151880766501, + "grad_norm": 8.054594039916992, + "learning_rate": 9.046273953158269e-05, + "loss": 0.026592501997947694, + "step": 67220 + }, + { + "epoch": 9.542938254080909, + "grad_norm": 5.245704174041748, + "learning_rate": 9.046132008516679e-05, + "loss": 0.041508796811103824, + "step": 67230 + }, + { + "epoch": 9.544357700496807, + "grad_norm": 8.013243675231934, + "learning_rate": 9.04599006387509e-05, + "loss": 0.0365505576133728, + "step": 67240 + }, + { + "epoch": 9.545777146912704, + "grad_norm": 7.659332752227783, + "learning_rate": 9.0458481192335e-05, + "loss": 0.07635741233825684, + "step": 67250 + }, + { + 
"epoch": 9.547196593328602, + "grad_norm": 8.488255500793457, + "learning_rate": 9.045706174591909e-05, + "loss": 0.017268522083759306, + "step": 67260 + }, + { + "epoch": 9.5486160397445, + "grad_norm": 0.7536658048629761, + "learning_rate": 9.045564229950319e-05, + "loss": 0.049896126985549925, + "step": 67270 + }, + { + "epoch": 9.550035486160397, + "grad_norm": 0.050682421773672104, + "learning_rate": 9.04542228530873e-05, + "loss": 0.09405158758163452, + "step": 67280 + }, + { + "epoch": 9.551454932576295, + "grad_norm": 2.3281517028808594, + "learning_rate": 9.045280340667141e-05, + "loss": 0.030408984422683714, + "step": 67290 + }, + { + "epoch": 9.552874378992193, + "grad_norm": 4.442478656768799, + "learning_rate": 9.045138396025551e-05, + "loss": 0.04440068006515503, + "step": 67300 + }, + { + "epoch": 9.554293825408092, + "grad_norm": 1.6404725313186646, + "learning_rate": 9.04499645138396e-05, + "loss": 0.057572323083877566, + "step": 67310 + }, + { + "epoch": 9.555713271823988, + "grad_norm": 0.7896831035614014, + "learning_rate": 9.04485450674237e-05, + "loss": 0.0089239239692688, + "step": 67320 + }, + { + "epoch": 9.557132718239886, + "grad_norm": 1.7920589447021484, + "learning_rate": 9.044712562100781e-05, + "loss": 0.01239323690533638, + "step": 67330 + }, + { + "epoch": 9.558552164655785, + "grad_norm": 0.457255095243454, + "learning_rate": 9.044570617459191e-05, + "loss": 0.02275240421295166, + "step": 67340 + }, + { + "epoch": 9.559971611071681, + "grad_norm": 1.4812309741973877, + "learning_rate": 9.044428672817602e-05, + "loss": 0.022527748346328737, + "step": 67350 + }, + { + "epoch": 9.56139105748758, + "grad_norm": 0.7697468400001526, + "learning_rate": 9.04428672817601e-05, + "loss": 0.022563908994197846, + "step": 67360 + }, + { + "epoch": 9.562810503903478, + "grad_norm": 0.5426087975502014, + "learning_rate": 9.044144783534422e-05, + "loss": 0.032289788126945496, + "step": 67370 + }, + { + "epoch": 9.564229950319376, + "grad_norm": 
0.11037856340408325, + "learning_rate": 9.044002838892833e-05, + "loss": 0.029405874013900758, + "step": 67380 + }, + { + "epoch": 9.565649396735273, + "grad_norm": 6.742136001586914, + "learning_rate": 9.043860894251243e-05, + "loss": 0.03677998483181, + "step": 67390 + }, + { + "epoch": 9.567068843151171, + "grad_norm": 0.12115723639726639, + "learning_rate": 9.043718949609654e-05, + "loss": 0.03549255430698395, + "step": 67400 + }, + { + "epoch": 9.56848828956707, + "grad_norm": 2.4344263076782227, + "learning_rate": 9.043577004968062e-05, + "loss": 0.025998961925506592, + "step": 67410 + }, + { + "epoch": 9.569907735982966, + "grad_norm": 9.523209571838379, + "learning_rate": 9.043435060326473e-05, + "loss": 0.05110321044921875, + "step": 67420 + }, + { + "epoch": 9.571327182398864, + "grad_norm": 8.926665306091309, + "learning_rate": 9.043293115684883e-05, + "loss": 0.038002151250839236, + "step": 67430 + }, + { + "epoch": 9.572746628814762, + "grad_norm": 7.105165958404541, + "learning_rate": 9.043151171043294e-05, + "loss": 0.022980785369873045, + "step": 67440 + }, + { + "epoch": 9.57416607523066, + "grad_norm": 4.546730041503906, + "learning_rate": 9.043009226401704e-05, + "loss": 0.04310915470123291, + "step": 67450 + }, + { + "epoch": 9.575585521646557, + "grad_norm": 0.07590577006340027, + "learning_rate": 9.042867281760113e-05, + "loss": 0.024601469933986663, + "step": 67460 + }, + { + "epoch": 9.577004968062456, + "grad_norm": 5.404637336730957, + "learning_rate": 9.042725337118525e-05, + "loss": 0.03879334032535553, + "step": 67470 + }, + { + "epoch": 9.578424414478354, + "grad_norm": 7.526176452636719, + "learning_rate": 9.042583392476934e-05, + "loss": 0.05651354193687439, + "step": 67480 + }, + { + "epoch": 9.57984386089425, + "grad_norm": 2.2813119888305664, + "learning_rate": 9.042441447835345e-05, + "loss": 0.06973368525505066, + "step": 67490 + }, + { + "epoch": 9.581263307310149, + "grad_norm": 0.2586059272289276, + "learning_rate": 
9.042299503193755e-05, + "loss": 0.01707226037979126, + "step": 67500 + }, + { + "epoch": 9.581263307310149, + "eval_accuracy": 0.9779360335728365, + "eval_loss": 0.07528796792030334, + "eval_runtime": 34.2604, + "eval_samples_per_second": 459.043, + "eval_steps_per_second": 14.361, + "step": 67500 + }, + { + "epoch": 9.582682753726047, + "grad_norm": 0.9092231392860413, + "learning_rate": 9.042157558552166e-05, + "loss": 0.057588744163513186, + "step": 67510 + }, + { + "epoch": 9.584102200141945, + "grad_norm": 0.9042927622795105, + "learning_rate": 9.042015613910575e-05, + "loss": 0.048612546920776364, + "step": 67520 + }, + { + "epoch": 9.585521646557842, + "grad_norm": 9.093894004821777, + "learning_rate": 9.041873669268986e-05, + "loss": 0.049756836891174314, + "step": 67530 + }, + { + "epoch": 9.58694109297374, + "grad_norm": 0.08922228217124939, + "learning_rate": 9.041731724627395e-05, + "loss": 0.02695835828781128, + "step": 67540 + }, + { + "epoch": 9.588360539389639, + "grad_norm": 0.0805022269487381, + "learning_rate": 9.041589779985807e-05, + "loss": 0.03669121265411377, + "step": 67550 + }, + { + "epoch": 9.589779985805535, + "grad_norm": 7.025901794433594, + "learning_rate": 9.041447835344216e-05, + "loss": 0.04222923517227173, + "step": 67560 + }, + { + "epoch": 9.591199432221433, + "grad_norm": 0.37235766649246216, + "learning_rate": 9.041305890702626e-05, + "loss": 0.05251051783561707, + "step": 67570 + }, + { + "epoch": 9.592618878637332, + "grad_norm": 0.1628137230873108, + "learning_rate": 9.041163946061037e-05, + "loss": 0.003060714900493622, + "step": 67580 + }, + { + "epoch": 9.59403832505323, + "grad_norm": 0.05945875868201256, + "learning_rate": 9.041022001419447e-05, + "loss": 0.012935033440589905, + "step": 67590 + }, + { + "epoch": 9.595457771469126, + "grad_norm": 4.107996940612793, + "learning_rate": 9.040880056777858e-05, + "loss": 0.04241987466812134, + "step": 67600 + }, + { + "epoch": 9.596877217885025, + "grad_norm": 
0.3481174111366272, + "learning_rate": 9.040738112136268e-05, + "loss": 0.04319833219051361, + "step": 67610 + }, + { + "epoch": 9.598296664300923, + "grad_norm": 0.14052341878414154, + "learning_rate": 9.040596167494677e-05, + "loss": 0.030803701281547545, + "step": 67620 + }, + { + "epoch": 9.59971611071682, + "grad_norm": 0.5670154094696045, + "learning_rate": 9.040454222853087e-05, + "loss": 0.024802103638648987, + "step": 67630 + }, + { + "epoch": 9.601135557132718, + "grad_norm": 1.3572872877120972, + "learning_rate": 9.040312278211498e-05, + "loss": 0.005655725300312042, + "step": 67640 + }, + { + "epoch": 9.602555003548616, + "grad_norm": 0.18713583052158356, + "learning_rate": 9.040170333569908e-05, + "loss": 0.02235008031129837, + "step": 67650 + }, + { + "epoch": 9.603974449964515, + "grad_norm": 0.08048121631145477, + "learning_rate": 9.040028388928319e-05, + "loss": 0.03131297528743744, + "step": 67660 + }, + { + "epoch": 9.605393896380411, + "grad_norm": 0.01172435563057661, + "learning_rate": 9.039886444286729e-05, + "loss": 0.015043826401233673, + "step": 67670 + }, + { + "epoch": 9.60681334279631, + "grad_norm": 0.03790862113237381, + "learning_rate": 9.039744499645139e-05, + "loss": 0.036029329895973204, + "step": 67680 + }, + { + "epoch": 9.608232789212208, + "grad_norm": 1.4519929885864258, + "learning_rate": 9.03960255500355e-05, + "loss": 0.02968733012676239, + "step": 67690 + }, + { + "epoch": 9.609652235628104, + "grad_norm": 6.6910552978515625, + "learning_rate": 9.03946061036196e-05, + "loss": 0.029774963855743408, + "step": 67700 + }, + { + "epoch": 9.611071682044003, + "grad_norm": 1.6743693351745605, + "learning_rate": 9.03931866572037e-05, + "loss": 0.033174237608909606, + "step": 67710 + }, + { + "epoch": 9.6124911284599, + "grad_norm": 0.9147611856460571, + "learning_rate": 9.039176721078779e-05, + "loss": 0.034508511424064636, + "step": 67720 + }, + { + "epoch": 9.6139105748758, + "grad_norm": 0.6280707716941833, + "learning_rate": 
9.03903477643719e-05, + "loss": 0.016519129276275635, + "step": 67730 + }, + { + "epoch": 9.615330021291696, + "grad_norm": 0.500525712966919, + "learning_rate": 9.0388928317956e-05, + "loss": 0.02264741063117981, + "step": 67740 + }, + { + "epoch": 9.616749467707594, + "grad_norm": 5.839907646179199, + "learning_rate": 9.038750887154011e-05, + "loss": 0.015506619215011596, + "step": 67750 + }, + { + "epoch": 9.618168914123492, + "grad_norm": 7.888493061065674, + "learning_rate": 9.03860894251242e-05, + "loss": 0.0280093789100647, + "step": 67760 + }, + { + "epoch": 9.619588360539389, + "grad_norm": 0.25900712609291077, + "learning_rate": 9.03846699787083e-05, + "loss": 0.03482568860054016, + "step": 67770 + }, + { + "epoch": 9.621007806955287, + "grad_norm": 1.0274555683135986, + "learning_rate": 9.038325053229241e-05, + "loss": 0.0375358372926712, + "step": 67780 + }, + { + "epoch": 9.622427253371185, + "grad_norm": 2.778224468231201, + "learning_rate": 9.038183108587651e-05, + "loss": 0.023729130625724792, + "step": 67790 + }, + { + "epoch": 9.623846699787084, + "grad_norm": 7.2992939949035645, + "learning_rate": 9.038041163946062e-05, + "loss": 0.028829315304756166, + "step": 67800 + }, + { + "epoch": 9.62526614620298, + "grad_norm": 0.026992222294211388, + "learning_rate": 9.037899219304472e-05, + "loss": 0.04649405777454376, + "step": 67810 + }, + { + "epoch": 9.626685592618879, + "grad_norm": 3.8848114013671875, + "learning_rate": 9.037757274662882e-05, + "loss": 0.04889317154884339, + "step": 67820 + }, + { + "epoch": 9.628105039034777, + "grad_norm": 1.3498327732086182, + "learning_rate": 9.037615330021291e-05, + "loss": 0.027367666363716125, + "step": 67830 + }, + { + "epoch": 9.629524485450673, + "grad_norm": 0.817976713180542, + "learning_rate": 9.037473385379702e-05, + "loss": 0.04165278971195221, + "step": 67840 + }, + { + "epoch": 9.630943931866572, + "grad_norm": 1.5236300230026245, + "learning_rate": 9.037331440738112e-05, + "loss": 
0.019522148370742797, + "step": 67850 + }, + { + "epoch": 9.63236337828247, + "grad_norm": 2.0810041427612305, + "learning_rate": 9.037189496096523e-05, + "loss": 0.03534324169158935, + "step": 67860 + }, + { + "epoch": 9.633782824698368, + "grad_norm": 2.70131778717041, + "learning_rate": 9.037047551454933e-05, + "loss": 0.01612260490655899, + "step": 67870 + }, + { + "epoch": 9.635202271114265, + "grad_norm": 0.48344776034355164, + "learning_rate": 9.036905606813343e-05, + "loss": 0.04688323438167572, + "step": 67880 + }, + { + "epoch": 9.636621717530163, + "grad_norm": 5.539488792419434, + "learning_rate": 9.036763662171754e-05, + "loss": 0.02970455288887024, + "step": 67890 + }, + { + "epoch": 9.638041163946061, + "grad_norm": 0.35806792974472046, + "learning_rate": 9.036621717530164e-05, + "loss": 0.0037241220474243165, + "step": 67900 + }, + { + "epoch": 9.639460610361958, + "grad_norm": 7.574960708618164, + "learning_rate": 9.036479772888575e-05, + "loss": 0.030697906017303468, + "step": 67910 + }, + { + "epoch": 9.640880056777856, + "grad_norm": 3.901747941970825, + "learning_rate": 9.036337828246984e-05, + "loss": 0.030141755938529968, + "step": 67920 + }, + { + "epoch": 9.642299503193755, + "grad_norm": 1.6139155626296997, + "learning_rate": 9.036195883605394e-05, + "loss": 0.04500369131565094, + "step": 67930 + }, + { + "epoch": 9.643718949609653, + "grad_norm": 11.200180053710938, + "learning_rate": 9.036053938963804e-05, + "loss": 0.024948553740978242, + "step": 67940 + }, + { + "epoch": 9.64513839602555, + "grad_norm": 5.555013656616211, + "learning_rate": 9.035911994322215e-05, + "loss": 0.0150326669216156, + "step": 67950 + }, + { + "epoch": 9.646557842441448, + "grad_norm": 0.21982567012310028, + "learning_rate": 9.035770049680625e-05, + "loss": 0.041651162505149844, + "step": 67960 + }, + { + "epoch": 9.647977288857346, + "grad_norm": 4.405213356018066, + "learning_rate": 9.035628105039036e-05, + "loss": 0.022198933362960815, + "step": 67970 + }, 
+ { + "epoch": 9.649396735273243, + "grad_norm": 0.15988144278526306, + "learning_rate": 9.035486160397446e-05, + "loss": 0.025725823640823365, + "step": 67980 + }, + { + "epoch": 9.650816181689141, + "grad_norm": 0.0842059925198555, + "learning_rate": 9.035344215755855e-05, + "loss": 0.025674444437026978, + "step": 67990 + }, + { + "epoch": 9.65223562810504, + "grad_norm": 0.30113139748573303, + "learning_rate": 9.035202271114266e-05, + "loss": 0.05994483232498169, + "step": 68000 + }, + { + "epoch": 9.65223562810504, + "eval_accuracy": 0.9802250906085077, + "eval_loss": 0.06837733089923859, + "eval_runtime": 32.9171, + "eval_samples_per_second": 477.777, + "eval_steps_per_second": 14.947, + "step": 68000 + }, + { + "epoch": 9.653655074520938, + "grad_norm": 0.037046462297439575, + "learning_rate": 9.035060326472676e-05, + "loss": 0.015805913507938384, + "step": 68010 + }, + { + "epoch": 9.655074520936834, + "grad_norm": 0.9824765920639038, + "learning_rate": 9.034918381831087e-05, + "loss": 0.021517379581928252, + "step": 68020 + }, + { + "epoch": 9.656493967352732, + "grad_norm": 0.04214629903435707, + "learning_rate": 9.034776437189496e-05, + "loss": 0.04769000113010406, + "step": 68030 + }, + { + "epoch": 9.65791341376863, + "grad_norm": 0.6568806171417236, + "learning_rate": 9.034634492547907e-05, + "loss": 0.01876375675201416, + "step": 68040 + }, + { + "epoch": 9.659332860184527, + "grad_norm": 2.5729522705078125, + "learning_rate": 9.034492547906316e-05, + "loss": 0.014666444063186646, + "step": 68050 + }, + { + "epoch": 9.660752306600425, + "grad_norm": 0.447099506855011, + "learning_rate": 9.034350603264728e-05, + "loss": 0.07560728788375855, + "step": 68060 + }, + { + "epoch": 9.662171753016324, + "grad_norm": 12.226578712463379, + "learning_rate": 9.034208658623137e-05, + "loss": 0.04010969698429108, + "step": 68070 + }, + { + "epoch": 9.663591199432222, + "grad_norm": 6.650430202484131, + "learning_rate": 9.034066713981547e-05, + "loss": 
0.046042519807815555, + "step": 68080 + }, + { + "epoch": 9.665010645848119, + "grad_norm": 5.328957557678223, + "learning_rate": 9.033924769339958e-05, + "loss": 0.02840046286582947, + "step": 68090 + }, + { + "epoch": 9.666430092264017, + "grad_norm": 3.466201066970825, + "learning_rate": 9.033782824698368e-05, + "loss": 0.028611963987350462, + "step": 68100 + }, + { + "epoch": 9.667849538679915, + "grad_norm": 4.610507965087891, + "learning_rate": 9.033640880056779e-05, + "loss": 0.061683553457260135, + "step": 68110 + }, + { + "epoch": 9.669268985095812, + "grad_norm": 4.94869327545166, + "learning_rate": 9.033498935415189e-05, + "loss": 0.03219112753868103, + "step": 68120 + }, + { + "epoch": 9.67068843151171, + "grad_norm": 0.040585510432720184, + "learning_rate": 9.033356990773598e-05, + "loss": 0.08483388423919677, + "step": 68130 + }, + { + "epoch": 9.672107877927608, + "grad_norm": 0.7158527970314026, + "learning_rate": 9.033215046132008e-05, + "loss": 0.012271341681480408, + "step": 68140 + }, + { + "epoch": 9.673527324343507, + "grad_norm": 1.417113184928894, + "learning_rate": 9.033073101490419e-05, + "loss": 0.027994108200073243, + "step": 68150 + }, + { + "epoch": 9.674946770759403, + "grad_norm": 0.13532933592796326, + "learning_rate": 9.032931156848829e-05, + "loss": 0.0153349369764328, + "step": 68160 + }, + { + "epoch": 9.676366217175302, + "grad_norm": 6.49558687210083, + "learning_rate": 9.03278921220724e-05, + "loss": 0.05084382295608521, + "step": 68170 + }, + { + "epoch": 9.6777856635912, + "grad_norm": 3.603513717651367, + "learning_rate": 9.03264726756565e-05, + "loss": 0.02989896833896637, + "step": 68180 + }, + { + "epoch": 9.679205110007096, + "grad_norm": 6.750855922698975, + "learning_rate": 9.03250532292406e-05, + "loss": 0.04880207479000091, + "step": 68190 + }, + { + "epoch": 9.680624556422995, + "grad_norm": 0.35573068261146545, + "learning_rate": 9.03236337828247e-05, + "loss": 0.023963749408721924, + "step": 68200 + }, + { + 
"epoch": 9.682044002838893, + "grad_norm": 3.9779114723205566, + "learning_rate": 9.03222143364088e-05, + "loss": 0.012136822938919068, + "step": 68210 + }, + { + "epoch": 9.683463449254791, + "grad_norm": 3.566655397415161, + "learning_rate": 9.032079488999291e-05, + "loss": 0.030810701847076415, + "step": 68220 + }, + { + "epoch": 9.684882895670688, + "grad_norm": 0.5124838948249817, + "learning_rate": 9.031937544357701e-05, + "loss": 0.032623404264450075, + "step": 68230 + }, + { + "epoch": 9.686302342086586, + "grad_norm": 0.8495200872421265, + "learning_rate": 9.031795599716111e-05, + "loss": 0.04098401665687561, + "step": 68240 + }, + { + "epoch": 9.687721788502484, + "grad_norm": 2.748220205307007, + "learning_rate": 9.03165365507452e-05, + "loss": 0.038351207971572876, + "step": 68250 + }, + { + "epoch": 9.689141234918381, + "grad_norm": 2.5813050270080566, + "learning_rate": 9.031511710432932e-05, + "loss": 0.027606451511383058, + "step": 68260 + }, + { + "epoch": 9.69056068133428, + "grad_norm": 0.36798134446144104, + "learning_rate": 9.031369765791342e-05, + "loss": 0.041548147797584534, + "step": 68270 + }, + { + "epoch": 9.691980127750178, + "grad_norm": 0.32126396894454956, + "learning_rate": 9.031227821149753e-05, + "loss": 0.013662393391132354, + "step": 68280 + }, + { + "epoch": 9.693399574166076, + "grad_norm": 0.9041917324066162, + "learning_rate": 9.031085876508162e-05, + "loss": 0.030037564039230347, + "step": 68290 + }, + { + "epoch": 9.694819020581972, + "grad_norm": 15.671060562133789, + "learning_rate": 9.030943931866572e-05, + "loss": 0.04284512996673584, + "step": 68300 + }, + { + "epoch": 9.69623846699787, + "grad_norm": 2.6367321014404297, + "learning_rate": 9.030801987224983e-05, + "loss": 0.024413591623306273, + "step": 68310 + }, + { + "epoch": 9.697657913413769, + "grad_norm": 0.5867029428482056, + "learning_rate": 9.030660042583393e-05, + "loss": 0.09276617169380189, + "step": 68320 + }, + { + "epoch": 9.699077359829666, + 
"grad_norm": 0.19113053381443024, + "learning_rate": 9.030518097941804e-05, + "loss": 0.016789191961288454, + "step": 68330 + }, + { + "epoch": 9.700496806245564, + "grad_norm": 5.9239654541015625, + "learning_rate": 9.030376153300212e-05, + "loss": 0.015049314498901368, + "step": 68340 + }, + { + "epoch": 9.701916252661462, + "grad_norm": 1.9666008949279785, + "learning_rate": 9.030234208658623e-05, + "loss": 0.034362810850143435, + "step": 68350 + }, + { + "epoch": 9.70333569907736, + "grad_norm": 0.094641774892807, + "learning_rate": 9.030092264017033e-05, + "loss": 0.024285507202148438, + "step": 68360 + }, + { + "epoch": 9.704755145493257, + "grad_norm": 0.024246973916888237, + "learning_rate": 9.029950319375444e-05, + "loss": 0.04042229950428009, + "step": 68370 + }, + { + "epoch": 9.706174591909155, + "grad_norm": 11.903240203857422, + "learning_rate": 9.029808374733854e-05, + "loss": 0.05841625332832336, + "step": 68380 + }, + { + "epoch": 9.707594038325054, + "grad_norm": 8.026371955871582, + "learning_rate": 9.029666430092264e-05, + "loss": 0.0390622079372406, + "step": 68390 + }, + { + "epoch": 9.70901348474095, + "grad_norm": 0.35515421628952026, + "learning_rate": 9.029524485450675e-05, + "loss": 0.04484111368656159, + "step": 68400 + }, + { + "epoch": 9.710432931156848, + "grad_norm": 0.31352072954177856, + "learning_rate": 9.029382540809085e-05, + "loss": 0.024947115778923036, + "step": 68410 + }, + { + "epoch": 9.711852377572747, + "grad_norm": 0.3414263427257538, + "learning_rate": 9.029240596167496e-05, + "loss": 0.007836203277111053, + "step": 68420 + }, + { + "epoch": 9.713271823988645, + "grad_norm": 1.5273081064224243, + "learning_rate": 9.029098651525905e-05, + "loss": 0.018830813467502594, + "step": 68430 + }, + { + "epoch": 9.714691270404542, + "grad_norm": 0.6126468181610107, + "learning_rate": 9.028956706884315e-05, + "loss": 0.01927516460418701, + "step": 68440 + }, + { + "epoch": 9.71611071682044, + "grad_norm": 0.3930172026157379, + 
"learning_rate": 9.028814762242725e-05, + "loss": 0.01516784429550171, + "step": 68450 + }, + { + "epoch": 9.717530163236338, + "grad_norm": 0.051625289022922516, + "learning_rate": 9.028672817601136e-05, + "loss": 0.015130971372127534, + "step": 68460 + }, + { + "epoch": 9.718949609652235, + "grad_norm": 14.71915340423584, + "learning_rate": 9.028530872959546e-05, + "loss": 0.05186105966567993, + "step": 68470 + }, + { + "epoch": 9.720369056068133, + "grad_norm": 1.3532153367996216, + "learning_rate": 9.028388928317957e-05, + "loss": 0.04229007363319397, + "step": 68480 + }, + { + "epoch": 9.721788502484031, + "grad_norm": 11.942264556884766, + "learning_rate": 9.028246983676367e-05, + "loss": 0.04268681704998016, + "step": 68490 + }, + { + "epoch": 9.72320794889993, + "grad_norm": 9.604484558105469, + "learning_rate": 9.028105039034776e-05, + "loss": 0.0274705708026886, + "step": 68500 + }, + { + "epoch": 9.72320794889993, + "eval_accuracy": 0.9760284860431105, + "eval_loss": 0.07992067188024521, + "eval_runtime": 32.7675, + "eval_samples_per_second": 479.957, + "eval_steps_per_second": 15.015, + "step": 68500 + }, + { + "epoch": 9.724627395315826, + "grad_norm": 0.07708755880594254, + "learning_rate": 9.027963094393187e-05, + "loss": 0.054282760620117186, + "step": 68510 + }, + { + "epoch": 9.726046841731725, + "grad_norm": 0.1621921807527542, + "learning_rate": 9.027821149751597e-05, + "loss": 0.05011662244796753, + "step": 68520 + }, + { + "epoch": 9.727466288147623, + "grad_norm": 3.5815699100494385, + "learning_rate": 9.027679205110008e-05, + "loss": 0.022197265923023225, + "step": 68530 + }, + { + "epoch": 9.72888573456352, + "grad_norm": 3.2508788108825684, + "learning_rate": 9.027537260468418e-05, + "loss": 0.06533399224281311, + "step": 68540 + }, + { + "epoch": 9.730305180979418, + "grad_norm": 1.647470474243164, + "learning_rate": 9.027395315826828e-05, + "loss": 0.01592020094394684, + "step": 68550 + }, + { + "epoch": 9.731724627395316, + "grad_norm": 
8.036884307861328, + "learning_rate": 9.027253371185237e-05, + "loss": 0.029971572756767272, + "step": 68560 + }, + { + "epoch": 9.733144073811214, + "grad_norm": 1.3174266815185547, + "learning_rate": 9.027111426543649e-05, + "loss": 0.03688263297080994, + "step": 68570 + }, + { + "epoch": 9.73456352022711, + "grad_norm": 0.30061954259872437, + "learning_rate": 9.026969481902058e-05, + "loss": 0.01946229040622711, + "step": 68580 + }, + { + "epoch": 9.735982966643009, + "grad_norm": 0.08620461076498032, + "learning_rate": 9.02682753726047e-05, + "loss": 0.027223414182662962, + "step": 68590 + }, + { + "epoch": 9.737402413058907, + "grad_norm": 9.125005722045898, + "learning_rate": 9.026685592618879e-05, + "loss": 0.03019559383392334, + "step": 68600 + }, + { + "epoch": 9.738821859474804, + "grad_norm": 0.14172478020191193, + "learning_rate": 9.026543647977289e-05, + "loss": 0.04983446002006531, + "step": 68610 + }, + { + "epoch": 9.740241305890702, + "grad_norm": 0.6675568222999573, + "learning_rate": 9.0264017033357e-05, + "loss": 0.026645490527153017, + "step": 68620 + }, + { + "epoch": 9.7416607523066, + "grad_norm": 6.146869659423828, + "learning_rate": 9.02625975869411e-05, + "loss": 0.02890416979789734, + "step": 68630 + }, + { + "epoch": 9.743080198722499, + "grad_norm": 6.365429878234863, + "learning_rate": 9.026117814052521e-05, + "loss": 0.05891613364219665, + "step": 68640 + }, + { + "epoch": 9.744499645138395, + "grad_norm": 0.1611870676279068, + "learning_rate": 9.025975869410929e-05, + "loss": 0.01408046782016754, + "step": 68650 + }, + { + "epoch": 9.745919091554294, + "grad_norm": 0.32437679171562195, + "learning_rate": 9.02583392476934e-05, + "loss": 0.056261765956878665, + "step": 68660 + }, + { + "epoch": 9.747338537970192, + "grad_norm": 8.895389556884766, + "learning_rate": 9.02569198012775e-05, + "loss": 0.06496468782424927, + "step": 68670 + }, + { + "epoch": 9.748757984386089, + "grad_norm": 0.5776633024215698, + "learning_rate": 
9.025550035486161e-05, + "loss": 0.03514551818370819, + "step": 68680 + }, + { + "epoch": 9.750177430801987, + "grad_norm": 7.958167552947998, + "learning_rate": 9.025408090844572e-05, + "loss": 0.10595873594284058, + "step": 68690 + }, + { + "epoch": 9.751596877217885, + "grad_norm": 2.3124489784240723, + "learning_rate": 9.02526614620298e-05, + "loss": 0.0070076905190944675, + "step": 68700 + }, + { + "epoch": 9.753016323633783, + "grad_norm": 8.000714302062988, + "learning_rate": 9.025124201561392e-05, + "loss": 0.061643147468566896, + "step": 68710 + }, + { + "epoch": 9.75443577004968, + "grad_norm": 6.112409591674805, + "learning_rate": 9.024982256919801e-05, + "loss": 0.02671927809715271, + "step": 68720 + }, + { + "epoch": 9.755855216465578, + "grad_norm": 0.09724577516317368, + "learning_rate": 9.024840312278212e-05, + "loss": 0.011995351314544678, + "step": 68730 + }, + { + "epoch": 9.757274662881477, + "grad_norm": 0.11631211638450623, + "learning_rate": 9.024698367636622e-05, + "loss": 0.026068294048309328, + "step": 68740 + }, + { + "epoch": 9.758694109297373, + "grad_norm": 5.72356653213501, + "learning_rate": 9.024556422995032e-05, + "loss": 0.04342763423919678, + "step": 68750 + }, + { + "epoch": 9.760113555713271, + "grad_norm": 0.0995788425207138, + "learning_rate": 9.024414478353442e-05, + "loss": 0.04173007309436798, + "step": 68760 + }, + { + "epoch": 9.76153300212917, + "grad_norm": 0.45903313159942627, + "learning_rate": 9.024272533711853e-05, + "loss": 0.046264901757240295, + "step": 68770 + }, + { + "epoch": 9.762952448545068, + "grad_norm": 0.18278242647647858, + "learning_rate": 9.024130589070264e-05, + "loss": 0.015135698020458221, + "step": 68780 + }, + { + "epoch": 9.764371894960965, + "grad_norm": 11.565159797668457, + "learning_rate": 9.023988644428674e-05, + "loss": 0.025888818502426147, + "step": 68790 + }, + { + "epoch": 9.765791341376863, + "grad_norm": 10.3805570602417, + "learning_rate": 9.023846699787083e-05, + "loss": 
0.0638617753982544, + "step": 68800 + }, + { + "epoch": 9.767210787792761, + "grad_norm": 4.832674980163574, + "learning_rate": 9.023704755145493e-05, + "loss": 0.027055513858795167, + "step": 68810 + }, + { + "epoch": 9.768630234208658, + "grad_norm": 6.372629165649414, + "learning_rate": 9.023562810503904e-05, + "loss": 0.04173833131790161, + "step": 68820 + }, + { + "epoch": 9.770049680624556, + "grad_norm": 3.9561760425567627, + "learning_rate": 9.023420865862314e-05, + "loss": 0.028818246722221375, + "step": 68830 + }, + { + "epoch": 9.771469127040454, + "grad_norm": 0.03159647807478905, + "learning_rate": 9.023278921220725e-05, + "loss": 0.05534324049949646, + "step": 68840 + }, + { + "epoch": 9.772888573456353, + "grad_norm": 0.6982274055480957, + "learning_rate": 9.023136976579133e-05, + "loss": 0.0524729311466217, + "step": 68850 + }, + { + "epoch": 9.77430801987225, + "grad_norm": 0.20039992034435272, + "learning_rate": 9.022995031937544e-05, + "loss": 0.11307457685470582, + "step": 68860 + }, + { + "epoch": 9.775727466288147, + "grad_norm": 0.5845163464546204, + "learning_rate": 9.022853087295956e-05, + "loss": 0.03681440949440003, + "step": 68870 + }, + { + "epoch": 9.777146912704046, + "grad_norm": 0.35227110981941223, + "learning_rate": 9.022711142654365e-05, + "loss": 0.05711947083473205, + "step": 68880 + }, + { + "epoch": 9.778566359119942, + "grad_norm": 0.7870414853096008, + "learning_rate": 9.022569198012776e-05, + "loss": 0.08433527350425721, + "step": 68890 + }, + { + "epoch": 9.77998580553584, + "grad_norm": 4.1868085861206055, + "learning_rate": 9.022427253371186e-05, + "loss": 0.03198896646499634, + "step": 68900 + }, + { + "epoch": 9.781405251951739, + "grad_norm": 3.233597755432129, + "learning_rate": 9.022285308729596e-05, + "loss": 0.019787636399269105, + "step": 68910 + }, + { + "epoch": 9.782824698367637, + "grad_norm": 0.9820303916931152, + "learning_rate": 9.022143364088006e-05, + "loss": 0.05114408135414124, + "step": 68920 + }, + 
{ + "epoch": 9.784244144783534, + "grad_norm": 2.048151731491089, + "learning_rate": 9.022001419446417e-05, + "loss": 0.028573969006538393, + "step": 68930 + }, + { + "epoch": 9.785663591199432, + "grad_norm": 6.333803176879883, + "learning_rate": 9.021859474804826e-05, + "loss": 0.04720070958137512, + "step": 68940 + }, + { + "epoch": 9.78708303761533, + "grad_norm": 4.3006768226623535, + "learning_rate": 9.021717530163238e-05, + "loss": 0.028902134299278258, + "step": 68950 + }, + { + "epoch": 9.788502484031227, + "grad_norm": 0.016134673729538918, + "learning_rate": 9.021575585521647e-05, + "loss": 0.06601372957229615, + "step": 68960 + }, + { + "epoch": 9.789921930447125, + "grad_norm": 0.956045925617218, + "learning_rate": 9.021433640880057e-05, + "loss": 0.007774467766284943, + "step": 68970 + }, + { + "epoch": 9.791341376863024, + "grad_norm": 0.9125019311904907, + "learning_rate": 9.021291696238468e-05, + "loss": 0.017521186172962187, + "step": 68980 + }, + { + "epoch": 9.792760823278922, + "grad_norm": 0.5352014899253845, + "learning_rate": 9.021149751596878e-05, + "loss": 0.04417323470115662, + "step": 68990 + }, + { + "epoch": 9.794180269694818, + "grad_norm": 2.805211305618286, + "learning_rate": 9.021007806955289e-05, + "loss": 0.013200858235359192, + "step": 69000 + }, + { + "epoch": 9.794180269694818, + "eval_accuracy": 0.9781903732434667, + "eval_loss": 0.07315292954444885, + "eval_runtime": 33.3338, + "eval_samples_per_second": 471.804, + "eval_steps_per_second": 14.76, + "step": 69000 + }, + { + "epoch": 9.795599716110717, + "grad_norm": 5.417062282562256, + "learning_rate": 9.020865862313697e-05, + "loss": 0.03514570593833923, + "step": 69010 + }, + { + "epoch": 9.797019162526615, + "grad_norm": 16.58610725402832, + "learning_rate": 9.020723917672108e-05, + "loss": 0.048204198479652405, + "step": 69020 + }, + { + "epoch": 9.798438608942512, + "grad_norm": 1.3934992551803589, + "learning_rate": 9.020581973030518e-05, + "loss": 0.03688859939575195, 
+ "step": 69030 + }, + { + "epoch": 9.79985805535841, + "grad_norm": 2.5325000286102295, + "learning_rate": 9.020440028388929e-05, + "loss": 0.060921496152877806, + "step": 69040 + }, + { + "epoch": 9.801277501774308, + "grad_norm": 2.1352107524871826, + "learning_rate": 9.020298083747339e-05, + "loss": 0.05504473447799683, + "step": 69050 + }, + { + "epoch": 9.802696948190206, + "grad_norm": 0.06280253082513809, + "learning_rate": 9.020156139105749e-05, + "loss": 0.05045873522758484, + "step": 69060 + }, + { + "epoch": 9.804116394606103, + "grad_norm": 2.113204002380371, + "learning_rate": 9.02001419446416e-05, + "loss": 0.027818793058395387, + "step": 69070 + }, + { + "epoch": 9.805535841022001, + "grad_norm": 0.8788726329803467, + "learning_rate": 9.01987224982257e-05, + "loss": 0.0661763608455658, + "step": 69080 + }, + { + "epoch": 9.8069552874379, + "grad_norm": 1.3066922426223755, + "learning_rate": 9.01973030518098e-05, + "loss": 0.013252317905426025, + "step": 69090 + }, + { + "epoch": 9.808374733853796, + "grad_norm": 5.125593662261963, + "learning_rate": 9.01958836053939e-05, + "loss": 0.016374337673187255, + "step": 69100 + }, + { + "epoch": 9.809794180269694, + "grad_norm": 1.2147674560546875, + "learning_rate": 9.0194464158978e-05, + "loss": 0.043211477994918826, + "step": 69110 + }, + { + "epoch": 9.811213626685593, + "grad_norm": 1.1660009622573853, + "learning_rate": 9.01930447125621e-05, + "loss": 0.03999923467636109, + "step": 69120 + }, + { + "epoch": 9.812633073101491, + "grad_norm": 0.08482635766267776, + "learning_rate": 9.019162526614621e-05, + "loss": 0.005695971474051475, + "step": 69130 + }, + { + "epoch": 9.814052519517388, + "grad_norm": 6.350915431976318, + "learning_rate": 9.019020581973031e-05, + "loss": 0.03713645339012146, + "step": 69140 + }, + { + "epoch": 9.815471965933286, + "grad_norm": 0.1697952151298523, + "learning_rate": 9.018878637331442e-05, + "loss": 0.05905236601829529, + "step": 69150 + }, + { + "epoch": 
9.816891412349184, + "grad_norm": 6.711185455322266, + "learning_rate": 9.018736692689852e-05, + "loss": 0.04255087375640869, + "step": 69160 + }, + { + "epoch": 9.81831085876508, + "grad_norm": 0.012068729847669601, + "learning_rate": 9.018594748048261e-05, + "loss": 0.03160058557987213, + "step": 69170 + }, + { + "epoch": 9.819730305180979, + "grad_norm": 0.011125321500003338, + "learning_rate": 9.018452803406672e-05, + "loss": 0.040947556495666504, + "step": 69180 + }, + { + "epoch": 9.821149751596877, + "grad_norm": 2.1126627922058105, + "learning_rate": 9.018310858765082e-05, + "loss": 0.09066906571388245, + "step": 69190 + }, + { + "epoch": 9.822569198012776, + "grad_norm": 0.3607144057750702, + "learning_rate": 9.018168914123493e-05, + "loss": 0.03679504990577698, + "step": 69200 + }, + { + "epoch": 9.823988644428672, + "grad_norm": 8.781025886535645, + "learning_rate": 9.018026969481902e-05, + "loss": 0.05192814469337463, + "step": 69210 + }, + { + "epoch": 9.82540809084457, + "grad_norm": 0.5350939035415649, + "learning_rate": 9.017885024840313e-05, + "loss": 0.04593566954135895, + "step": 69220 + }, + { + "epoch": 9.826827537260469, + "grad_norm": 11.871026039123535, + "learning_rate": 9.017743080198722e-05, + "loss": 0.044462653994560244, + "step": 69230 + }, + { + "epoch": 9.828246983676365, + "grad_norm": 0.27279049158096313, + "learning_rate": 9.017601135557133e-05, + "loss": 0.019877782464027403, + "step": 69240 + }, + { + "epoch": 9.829666430092264, + "grad_norm": 1.0789798498153687, + "learning_rate": 9.017459190915543e-05, + "loss": 0.0799859881401062, + "step": 69250 + }, + { + "epoch": 9.831085876508162, + "grad_norm": 1.5498286485671997, + "learning_rate": 9.017317246273954e-05, + "loss": 0.01284824013710022, + "step": 69260 + }, + { + "epoch": 9.83250532292406, + "grad_norm": 0.40647804737091064, + "learning_rate": 9.017175301632364e-05, + "loss": 0.07362874746322631, + "step": 69270 + }, + { + "epoch": 9.833924769339957, + "grad_norm": 
5.722687721252441, + "learning_rate": 9.017033356990774e-05, + "loss": 0.012973059713840485, + "step": 69280 + }, + { + "epoch": 9.835344215755855, + "grad_norm": 4.900279998779297, + "learning_rate": 9.016891412349185e-05, + "loss": 0.03889691233634949, + "step": 69290 + }, + { + "epoch": 9.836763662171753, + "grad_norm": 2.8748764991760254, + "learning_rate": 9.016749467707595e-05, + "loss": 0.02030760645866394, + "step": 69300 + }, + { + "epoch": 9.83818310858765, + "grad_norm": 5.670338153839111, + "learning_rate": 9.016607523066006e-05, + "loss": 0.05160186290740967, + "step": 69310 + }, + { + "epoch": 9.839602555003548, + "grad_norm": 0.7216220498085022, + "learning_rate": 9.016465578424414e-05, + "loss": 0.02686397135257721, + "step": 69320 + }, + { + "epoch": 9.841022001419446, + "grad_norm": 0.06797802448272705, + "learning_rate": 9.016323633782825e-05, + "loss": 0.019428203999996185, + "step": 69330 + }, + { + "epoch": 9.842441447835345, + "grad_norm": 0.248221755027771, + "learning_rate": 9.016181689141235e-05, + "loss": 0.015675346553325652, + "step": 69340 + }, + { + "epoch": 9.843860894251241, + "grad_norm": 9.568998336791992, + "learning_rate": 9.016039744499646e-05, + "loss": 0.06899868249893189, + "step": 69350 + }, + { + "epoch": 9.84528034066714, + "grad_norm": 0.03419743478298187, + "learning_rate": 9.015897799858056e-05, + "loss": 0.03135853111743927, + "step": 69360 + }, + { + "epoch": 9.846699787083038, + "grad_norm": 0.8306211829185486, + "learning_rate": 9.015755855216465e-05, + "loss": 0.021877869963645935, + "step": 69370 + }, + { + "epoch": 9.848119233498934, + "grad_norm": 7.840019226074219, + "learning_rate": 9.015613910574877e-05, + "loss": 0.06136330366134644, + "step": 69380 + }, + { + "epoch": 9.849538679914833, + "grad_norm": 6.342380523681641, + "learning_rate": 9.015471965933286e-05, + "loss": 0.012593349814414978, + "step": 69390 + }, + { + "epoch": 9.850958126330731, + "grad_norm": 11.976713180541992, + "learning_rate": 
9.015330021291697e-05, + "loss": 0.05724484324455261, + "step": 69400 + }, + { + "epoch": 9.85237757274663, + "grad_norm": 1.395046353340149, + "learning_rate": 9.015188076650107e-05, + "loss": 0.019671787321567536, + "step": 69410 + }, + { + "epoch": 9.853797019162526, + "grad_norm": 2.9691336154937744, + "learning_rate": 9.015046132008517e-05, + "loss": 0.022539976239204406, + "step": 69420 + }, + { + "epoch": 9.855216465578424, + "grad_norm": 0.34182435274124146, + "learning_rate": 9.014904187366927e-05, + "loss": 0.02716274857521057, + "step": 69430 + }, + { + "epoch": 9.856635911994323, + "grad_norm": 0.361360102891922, + "learning_rate": 9.014762242725338e-05, + "loss": 0.021220579743385315, + "step": 69440 + }, + { + "epoch": 9.858055358410219, + "grad_norm": 0.6431921720504761, + "learning_rate": 9.014620298083747e-05, + "loss": 0.024566707015037537, + "step": 69450 + }, + { + "epoch": 9.859474804826117, + "grad_norm": 0.2722751200199127, + "learning_rate": 9.014478353442159e-05, + "loss": 0.03161434531211853, + "step": 69460 + }, + { + "epoch": 9.860894251242016, + "grad_norm": 0.27604079246520996, + "learning_rate": 9.014336408800568e-05, + "loss": 0.04619441032409668, + "step": 69470 + }, + { + "epoch": 9.862313697657914, + "grad_norm": 8.648920059204102, + "learning_rate": 9.014194464158978e-05, + "loss": 0.09676159620285034, + "step": 69480 + }, + { + "epoch": 9.86373314407381, + "grad_norm": 0.4807116687297821, + "learning_rate": 9.014052519517389e-05, + "loss": 0.04626038074493408, + "step": 69490 + }, + { + "epoch": 9.865152590489709, + "grad_norm": 0.0991576537489891, + "learning_rate": 9.013910574875799e-05, + "loss": 0.044435915350914, + "step": 69500 + }, + { + "epoch": 9.865152590489709, + "eval_accuracy": 0.9850575443504801, + "eval_loss": 0.04669315740466118, + "eval_runtime": 33.0656, + "eval_samples_per_second": 475.63, + "eval_steps_per_second": 14.88, + "step": 69500 + }, + { + "epoch": 9.866572036905607, + "grad_norm": 
0.023003293201327324, + "learning_rate": 9.01376863023421e-05, + "loss": 0.023037827014923094, + "step": 69510 + }, + { + "epoch": 9.867991483321505, + "grad_norm": 0.8228864669799805, + "learning_rate": 9.013640880056778e-05, + "loss": 0.08946434259414673, + "step": 69520 + }, + { + "epoch": 9.869410929737402, + "grad_norm": 2.343458414077759, + "learning_rate": 9.01349893541519e-05, + "loss": 0.022932544350624084, + "step": 69530 + }, + { + "epoch": 9.8708303761533, + "grad_norm": 1.5184394121170044, + "learning_rate": 9.013356990773598e-05, + "loss": 0.010104528069496155, + "step": 69540 + }, + { + "epoch": 9.872249822569199, + "grad_norm": 0.2630665600299835, + "learning_rate": 9.013215046132009e-05, + "loss": 0.024105256795883177, + "step": 69550 + }, + { + "epoch": 9.873669268985095, + "grad_norm": 0.013559470884501934, + "learning_rate": 9.013073101490419e-05, + "loss": 0.02312069237232208, + "step": 69560 + }, + { + "epoch": 9.875088715400993, + "grad_norm": 0.2757904827594757, + "learning_rate": 9.01293115684883e-05, + "loss": 0.03828359246253967, + "step": 69570 + }, + { + "epoch": 9.876508161816892, + "grad_norm": 1.3558392524719238, + "learning_rate": 9.01278921220724e-05, + "loss": 0.0364987313747406, + "step": 69580 + }, + { + "epoch": 9.87792760823279, + "grad_norm": 0.9141183495521545, + "learning_rate": 9.01264726756565e-05, + "loss": 0.05945742130279541, + "step": 69590 + }, + { + "epoch": 9.879347054648687, + "grad_norm": 10.217071533203125, + "learning_rate": 9.012505322924059e-05, + "loss": 0.0646911084651947, + "step": 69600 + }, + { + "epoch": 9.880766501064585, + "grad_norm": 1.882594108581543, + "learning_rate": 9.01236337828247e-05, + "loss": 0.013017700612545013, + "step": 69610 + }, + { + "epoch": 9.882185947480483, + "grad_norm": 1.8000305891036987, + "learning_rate": 9.012221433640881e-05, + "loss": 0.02973770201206207, + "step": 69620 + }, + { + "epoch": 9.88360539389638, + "grad_norm": 2.1482441425323486, + "learning_rate": 
9.012079488999291e-05, + "loss": 0.043613281846046445, + "step": 69630 + }, + { + "epoch": 9.885024840312278, + "grad_norm": 1.8047417402267456, + "learning_rate": 9.011937544357702e-05, + "loss": 0.010743890702724457, + "step": 69640 + }, + { + "epoch": 9.886444286728176, + "grad_norm": 0.356742799282074, + "learning_rate": 9.01179559971611e-05, + "loss": 0.04975705146789551, + "step": 69650 + }, + { + "epoch": 9.887863733144075, + "grad_norm": 0.8651915788650513, + "learning_rate": 9.011653655074522e-05, + "loss": 0.03396418988704682, + "step": 69660 + }, + { + "epoch": 9.889283179559971, + "grad_norm": 5.231709957122803, + "learning_rate": 9.011511710432931e-05, + "loss": 0.035730010271072386, + "step": 69670 + }, + { + "epoch": 9.89070262597587, + "grad_norm": 19.371747970581055, + "learning_rate": 9.011369765791342e-05, + "loss": 0.035460355877876285, + "step": 69680 + }, + { + "epoch": 9.892122072391768, + "grad_norm": 5.615591526031494, + "learning_rate": 9.011227821149752e-05, + "loss": 0.06311107277870179, + "step": 69690 + }, + { + "epoch": 9.893541518807664, + "grad_norm": 0.6916624307632446, + "learning_rate": 9.011085876508162e-05, + "loss": 0.01275596022605896, + "step": 69700 + }, + { + "epoch": 9.894960965223563, + "grad_norm": 0.03846505656838417, + "learning_rate": 9.010943931866573e-05, + "loss": 0.019737032055854798, + "step": 69710 + }, + { + "epoch": 9.896380411639461, + "grad_norm": 12.611459732055664, + "learning_rate": 9.010801987224983e-05, + "loss": 0.025944510102272035, + "step": 69720 + }, + { + "epoch": 9.89779985805536, + "grad_norm": 0.03990490362048149, + "learning_rate": 9.010660042583394e-05, + "loss": 0.05266411900520325, + "step": 69730 + }, + { + "epoch": 9.899219304471256, + "grad_norm": 8.41258716583252, + "learning_rate": 9.010518097941804e-05, + "loss": 0.03398913741111755, + "step": 69740 + }, + { + "epoch": 9.900638750887154, + "grad_norm": 0.4625275731086731, + "learning_rate": 9.010376153300213e-05, + "loss": 
0.028921571373939515, + "step": 69750 + }, + { + "epoch": 9.902058197303052, + "grad_norm": 2.8440163135528564, + "learning_rate": 9.010234208658623e-05, + "loss": 0.026828548312187193, + "step": 69760 + }, + { + "epoch": 9.903477643718949, + "grad_norm": 0.30509254336357117, + "learning_rate": 9.010092264017034e-05, + "loss": 0.02407469302415848, + "step": 69770 + }, + { + "epoch": 9.904897090134847, + "grad_norm": 0.16248999536037445, + "learning_rate": 9.009950319375444e-05, + "loss": 0.032114657759666446, + "step": 69780 + }, + { + "epoch": 9.906316536550746, + "grad_norm": 9.057454109191895, + "learning_rate": 9.009808374733855e-05, + "loss": 0.05952262282371521, + "step": 69790 + }, + { + "epoch": 9.907735982966644, + "grad_norm": 0.25342628359794617, + "learning_rate": 9.009666430092265e-05, + "loss": 0.012016575783491135, + "step": 69800 + }, + { + "epoch": 9.90915542938254, + "grad_norm": 5.12114143371582, + "learning_rate": 9.009524485450674e-05, + "loss": 0.018399667739868165, + "step": 69810 + }, + { + "epoch": 9.910574875798439, + "grad_norm": 0.6642940044403076, + "learning_rate": 9.009382540809085e-05, + "loss": 0.023203255236148836, + "step": 69820 + }, + { + "epoch": 9.911994322214337, + "grad_norm": 0.4026901423931122, + "learning_rate": 9.009240596167495e-05, + "loss": 0.03522194027900696, + "step": 69830 + }, + { + "epoch": 9.913413768630233, + "grad_norm": 0.011791981756687164, + "learning_rate": 9.009098651525906e-05, + "loss": 0.02517768442630768, + "step": 69840 + }, + { + "epoch": 9.914833215046132, + "grad_norm": 6.152438640594482, + "learning_rate": 9.008956706884315e-05, + "loss": 0.06228979229927063, + "step": 69850 + }, + { + "epoch": 9.91625266146203, + "grad_norm": 0.020824043080210686, + "learning_rate": 9.008814762242726e-05, + "loss": 0.0055544193834066394, + "step": 69860 + }, + { + "epoch": 9.917672107877928, + "grad_norm": 10.714980125427246, + "learning_rate": 9.008672817601136e-05, + "loss": 0.0562599778175354, + "step": 
69870 + }, + { + "epoch": 9.919091554293825, + "grad_norm": 0.71759033203125, + "learning_rate": 9.008530872959547e-05, + "loss": 0.015090197324752808, + "step": 69880 + }, + { + "epoch": 9.920511000709723, + "grad_norm": 7.554685592651367, + "learning_rate": 9.008388928317956e-05, + "loss": 0.04340165555477142, + "step": 69890 + }, + { + "epoch": 9.921930447125622, + "grad_norm": 16.142648696899414, + "learning_rate": 9.008246983676366e-05, + "loss": 0.06557026505470276, + "step": 69900 + }, + { + "epoch": 9.923349893541518, + "grad_norm": 2.8899734020233154, + "learning_rate": 9.008105039034777e-05, + "loss": 0.05298327207565308, + "step": 69910 + }, + { + "epoch": 9.924769339957416, + "grad_norm": 0.2371891736984253, + "learning_rate": 9.007963094393187e-05, + "loss": 0.020034009218215944, + "step": 69920 + }, + { + "epoch": 9.926188786373315, + "grad_norm": 0.7864499688148499, + "learning_rate": 9.007821149751598e-05, + "loss": 0.005942384526133537, + "step": 69930 + }, + { + "epoch": 9.927608232789213, + "grad_norm": 1.570546269416809, + "learning_rate": 9.007679205110008e-05, + "loss": 0.02644781768321991, + "step": 69940 + }, + { + "epoch": 9.92902767920511, + "grad_norm": 0.30083009600639343, + "learning_rate": 9.007537260468419e-05, + "loss": 0.012635472416877746, + "step": 69950 + }, + { + "epoch": 9.930447125621008, + "grad_norm": 0.0356457456946373, + "learning_rate": 9.007395315826827e-05, + "loss": 0.03070034384727478, + "step": 69960 + }, + { + "epoch": 9.931866572036906, + "grad_norm": 0.558660089969635, + "learning_rate": 9.007253371185238e-05, + "loss": 0.020227883756160737, + "step": 69970 + }, + { + "epoch": 9.933286018452803, + "grad_norm": 7.517212390899658, + "learning_rate": 9.007111426543648e-05, + "loss": 0.13844293355941772, + "step": 69980 + }, + { + "epoch": 9.934705464868701, + "grad_norm": 0.020189205184578896, + "learning_rate": 9.006969481902059e-05, + "loss": 0.018152689933776854, + "step": 69990 + }, + { + "epoch": 
9.9361249112846, + "grad_norm": 11.410741806030273, + "learning_rate": 9.006827537260469e-05, + "loss": 0.029291608929634096, + "step": 70000 + }, + { + "epoch": 9.9361249112846, + "eval_accuracy": 0.9816239587969734, + "eval_loss": 0.06005491688847542, + "eval_runtime": 33.3087, + "eval_samples_per_second": 472.159, + "eval_steps_per_second": 14.771, + "step": 70000 + }, + { + "epoch": 9.937544357700498, + "grad_norm": 0.02899947017431259, + "learning_rate": 9.006685592618879e-05, + "loss": 0.05207945704460144, + "step": 70010 + }, + { + "epoch": 9.938963804116394, + "grad_norm": 2.290057420730591, + "learning_rate": 9.00654364797729e-05, + "loss": 0.03104778826236725, + "step": 70020 + }, + { + "epoch": 9.940383250532292, + "grad_norm": 4.646543502807617, + "learning_rate": 9.0064017033357e-05, + "loss": 0.026638334989547728, + "step": 70030 + }, + { + "epoch": 9.94180269694819, + "grad_norm": 0.7747595310211182, + "learning_rate": 9.00625975869411e-05, + "loss": 0.057200342416763306, + "step": 70040 + }, + { + "epoch": 9.943222143364087, + "grad_norm": 13.665855407714844, + "learning_rate": 9.00611781405252e-05, + "loss": 0.041640186309814455, + "step": 70050 + }, + { + "epoch": 9.944641589779986, + "grad_norm": 0.8320010900497437, + "learning_rate": 9.00597586941093e-05, + "loss": 0.03797149658203125, + "step": 70060 + }, + { + "epoch": 9.946061036195884, + "grad_norm": 0.46569153666496277, + "learning_rate": 9.00583392476934e-05, + "loss": 0.018002772331237794, + "step": 70070 + }, + { + "epoch": 9.947480482611782, + "grad_norm": 0.32417356967926025, + "learning_rate": 9.005691980127751e-05, + "loss": 0.07594256997108459, + "step": 70080 + }, + { + "epoch": 9.948899929027679, + "grad_norm": 2.290781259536743, + "learning_rate": 9.00555003548616e-05, + "loss": 0.02159411907196045, + "step": 70090 + }, + { + "epoch": 9.950319375443577, + "grad_norm": 1.5285451412200928, + "learning_rate": 9.005408090844572e-05, + "loss": 0.05258944034576416, + "step": 70100 + }, 
+ { + "epoch": 9.951738821859475, + "grad_norm": 6.222751617431641, + "learning_rate": 9.005266146202981e-05, + "loss": 0.021117933094501495, + "step": 70110 + }, + { + "epoch": 9.953158268275372, + "grad_norm": 6.542359828948975, + "learning_rate": 9.005124201561391e-05, + "loss": 0.03911808133125305, + "step": 70120 + }, + { + "epoch": 9.95457771469127, + "grad_norm": 0.1534150093793869, + "learning_rate": 9.004982256919802e-05, + "loss": 0.10638805627822875, + "step": 70130 + }, + { + "epoch": 9.955997161107168, + "grad_norm": 0.11701802909374237, + "learning_rate": 9.004840312278212e-05, + "loss": 0.01432393044233322, + "step": 70140 + }, + { + "epoch": 9.957416607523067, + "grad_norm": 0.2519495189189911, + "learning_rate": 9.004698367636623e-05, + "loss": 0.03227570950984955, + "step": 70150 + }, + { + "epoch": 9.958836053938963, + "grad_norm": 5.316908359527588, + "learning_rate": 9.004556422995031e-05, + "loss": 0.055128943920135495, + "step": 70160 + }, + { + "epoch": 9.960255500354862, + "grad_norm": 0.15168063342571259, + "learning_rate": 9.004414478353443e-05, + "loss": 0.01621713936328888, + "step": 70170 + }, + { + "epoch": 9.96167494677076, + "grad_norm": 1.0486849546432495, + "learning_rate": 9.004272533711852e-05, + "loss": 0.02327948361635208, + "step": 70180 + }, + { + "epoch": 9.963094393186656, + "grad_norm": 0.1740347445011139, + "learning_rate": 9.004130589070263e-05, + "loss": 0.02720318138599396, + "step": 70190 + }, + { + "epoch": 9.964513839602555, + "grad_norm": 0.25774815678596497, + "learning_rate": 9.003988644428673e-05, + "loss": 0.04783263504505157, + "step": 70200 + }, + { + "epoch": 9.965933286018453, + "grad_norm": 6.590445041656494, + "learning_rate": 9.003846699787083e-05, + "loss": 0.03629983365535736, + "step": 70210 + }, + { + "epoch": 9.967352732434351, + "grad_norm": 1.5156173706054688, + "learning_rate": 9.003704755145494e-05, + "loss": 0.07823289632797241, + "step": 70220 + }, + { + "epoch": 9.968772178850248, + 
"grad_norm": 0.08737379312515259, + "learning_rate": 9.003562810503904e-05, + "loss": 0.01150958240032196, + "step": 70230 + }, + { + "epoch": 9.970191625266146, + "grad_norm": 6.838068962097168, + "learning_rate": 9.003420865862315e-05, + "loss": 0.05327457785606384, + "step": 70240 + }, + { + "epoch": 9.971611071682045, + "grad_norm": 7.426832675933838, + "learning_rate": 9.003278921220725e-05, + "loss": 0.030006197094917298, + "step": 70250 + }, + { + "epoch": 9.973030518097941, + "grad_norm": 5.167942523956299, + "learning_rate": 9.003136976579134e-05, + "loss": 0.08623776435852051, + "step": 70260 + }, + { + "epoch": 9.97444996451384, + "grad_norm": 4.396407604217529, + "learning_rate": 9.002995031937544e-05, + "loss": 0.03299508690834045, + "step": 70270 + }, + { + "epoch": 9.975869410929738, + "grad_norm": 8.55590534210205, + "learning_rate": 9.002853087295955e-05, + "loss": 0.04857953190803528, + "step": 70280 + }, + { + "epoch": 9.977288857345636, + "grad_norm": 0.14077264070510864, + "learning_rate": 9.002711142654365e-05, + "loss": 0.04843473136425018, + "step": 70290 + }, + { + "epoch": 9.978708303761533, + "grad_norm": 11.530470848083496, + "learning_rate": 9.002569198012776e-05, + "loss": 0.042692869901657104, + "step": 70300 + }, + { + "epoch": 9.98012775017743, + "grad_norm": 1.876141905784607, + "learning_rate": 9.002427253371186e-05, + "loss": 0.06830405592918395, + "step": 70310 + }, + { + "epoch": 9.98154719659333, + "grad_norm": 2.4454851150512695, + "learning_rate": 9.002285308729595e-05, + "loss": 0.035274538397789004, + "step": 70320 + }, + { + "epoch": 9.982966643009226, + "grad_norm": 3.194749593734741, + "learning_rate": 9.002143364088006e-05, + "loss": 0.01803991198539734, + "step": 70330 + }, + { + "epoch": 9.984386089425124, + "grad_norm": 0.22273701429367065, + "learning_rate": 9.002001419446416e-05, + "loss": 0.04039207696914673, + "step": 70340 + }, + { + "epoch": 9.985805535841022, + "grad_norm": 6.443853855133057, + 
"learning_rate": 9.001859474804827e-05, + "loss": 0.031891047954559326, + "step": 70350 + }, + { + "epoch": 9.98722498225692, + "grad_norm": 11.825727462768555, + "learning_rate": 9.001717530163237e-05, + "loss": 0.03135543167591095, + "step": 70360 + }, + { + "epoch": 9.988644428672817, + "grad_norm": 1.0881600379943848, + "learning_rate": 9.001575585521647e-05, + "loss": 0.021712112426757812, + "step": 70370 + }, + { + "epoch": 9.990063875088715, + "grad_norm": 0.020460011437535286, + "learning_rate": 9.001433640880057e-05, + "loss": 0.007388191670179367, + "step": 70380 + }, + { + "epoch": 9.991483321504614, + "grad_norm": 0.5041184425354004, + "learning_rate": 9.001291696238468e-05, + "loss": 0.016400963068008423, + "step": 70390 + }, + { + "epoch": 9.99290276792051, + "grad_norm": 11.30899429321289, + "learning_rate": 9.001149751596877e-05, + "loss": 0.04193665981292725, + "step": 70400 + }, + { + "epoch": 9.994322214336409, + "grad_norm": 3.974252700805664, + "learning_rate": 9.001007806955288e-05, + "loss": 0.034135401248931885, + "step": 70410 + }, + { + "epoch": 9.995741660752307, + "grad_norm": 0.414074569940567, + "learning_rate": 9.000865862313698e-05, + "loss": 0.022105370461940766, + "step": 70420 + }, + { + "epoch": 9.997161107168205, + "grad_norm": 4.395871639251709, + "learning_rate": 9.000723917672108e-05, + "loss": 0.04791556596755982, + "step": 70430 + }, + { + "epoch": 9.998580553584102, + "grad_norm": 5.924459934234619, + "learning_rate": 9.000581973030519e-05, + "loss": 0.03285888731479645, + "step": 70440 + }, + { + "epoch": 10.0, + "grad_norm": 12.073928833007812, + "learning_rate": 9.000440028388929e-05, + "loss": 0.023960676789283753, + "step": 70450 + }, + { + "epoch": 10.001419446415898, + "grad_norm": 0.24992632865905762, + "learning_rate": 9.00029808374734e-05, + "loss": 0.028036636114120484, + "step": 70460 + }, + { + "epoch": 10.002838892831795, + "grad_norm": 0.2592790424823761, + "learning_rate": 9.000156139105748e-05, + "loss": 
0.09887195825576782, + "step": 70470 + }, + { + "epoch": 10.004258339247693, + "grad_norm": 0.7219700813293457, + "learning_rate": 9.000014194464159e-05, + "loss": 0.02016732543706894, + "step": 70480 + }, + { + "epoch": 10.005677785663591, + "grad_norm": 0.675056517124176, + "learning_rate": 8.999872249822569e-05, + "loss": 0.04064895510673523, + "step": 70490 + }, + { + "epoch": 10.00709723207949, + "grad_norm": 2.294806718826294, + "learning_rate": 8.99973030518098e-05, + "loss": 0.06091110110282898, + "step": 70500 + }, + { + "epoch": 10.00709723207949, + "eval_accuracy": 0.9842309404209322, + "eval_loss": 0.05429995805025101, + "eval_runtime": 33.3699, + "eval_samples_per_second": 471.293, + "eval_steps_per_second": 14.744, + "step": 70500 + }, + { + "epoch": 10.008516678495386, + "grad_norm": 4.840185642242432, + "learning_rate": 8.99958836053939e-05, + "loss": 0.025154224038124083, + "step": 70510 + }, + { + "epoch": 10.009936124911285, + "grad_norm": 0.6732443571090698, + "learning_rate": 8.9994464158978e-05, + "loss": 0.017550435662269593, + "step": 70520 + }, + { + "epoch": 10.011355571327183, + "grad_norm": 3.3090858459472656, + "learning_rate": 8.999304471256211e-05, + "loss": 0.04830752909183502, + "step": 70530 + }, + { + "epoch": 10.01277501774308, + "grad_norm": 3.5346755981445312, + "learning_rate": 8.99916252661462e-05, + "loss": 0.041277503967285155, + "step": 70540 + }, + { + "epoch": 10.014194464158978, + "grad_norm": 0.1413526087999344, + "learning_rate": 8.999020581973032e-05, + "loss": 0.050315022468566895, + "step": 70550 + }, + { + "epoch": 10.015613910574876, + "grad_norm": 0.13002030551433563, + "learning_rate": 8.998878637331441e-05, + "loss": 0.011780694127082825, + "step": 70560 + }, + { + "epoch": 10.017033356990774, + "grad_norm": 3.0763373374938965, + "learning_rate": 8.998736692689851e-05, + "loss": 0.05392890572547913, + "step": 70570 + }, + { + "epoch": 10.01845280340667, + "grad_norm": 2.0271835327148438, + "learning_rate": 
8.998594748048261e-05, + "loss": 0.03189515769481659, + "step": 70580 + }, + { + "epoch": 10.01987224982257, + "grad_norm": 10.38420295715332, + "learning_rate": 8.998452803406672e-05, + "loss": 0.03412808775901795, + "step": 70590 + }, + { + "epoch": 10.021291696238467, + "grad_norm": 0.4704546630382538, + "learning_rate": 8.998310858765082e-05, + "loss": 0.015590818226337433, + "step": 70600 + }, + { + "epoch": 10.022711142654364, + "grad_norm": 0.04271915927529335, + "learning_rate": 8.998168914123493e-05, + "loss": 0.03360204696655274, + "step": 70610 + }, + { + "epoch": 10.024130589070262, + "grad_norm": 0.23193983733654022, + "learning_rate": 8.998026969481902e-05, + "loss": 0.019712990522384642, + "step": 70620 + }, + { + "epoch": 10.02555003548616, + "grad_norm": 3.272385358810425, + "learning_rate": 8.997885024840312e-05, + "loss": 0.019542354345321655, + "step": 70630 + }, + { + "epoch": 10.026969481902059, + "grad_norm": 0.11366137862205505, + "learning_rate": 8.997743080198723e-05, + "loss": 0.016868501901626587, + "step": 70640 + }, + { + "epoch": 10.028388928317955, + "grad_norm": 1.392044186592102, + "learning_rate": 8.997601135557133e-05, + "loss": 0.05533118844032288, + "step": 70650 + }, + { + "epoch": 10.029808374733854, + "grad_norm": 0.035225555300712585, + "learning_rate": 8.997459190915544e-05, + "loss": 0.014872361719608308, + "step": 70660 + }, + { + "epoch": 10.031227821149752, + "grad_norm": 0.07124058157205582, + "learning_rate": 8.997317246273954e-05, + "loss": 0.028601282835006715, + "step": 70670 + }, + { + "epoch": 10.032647267565649, + "grad_norm": 4.94709587097168, + "learning_rate": 8.997175301632364e-05, + "loss": 0.04412299692630768, + "step": 70680 + }, + { + "epoch": 10.034066713981547, + "grad_norm": 8.149444580078125, + "learning_rate": 8.997033356990773e-05, + "loss": 0.036119335889816286, + "step": 70690 + }, + { + "epoch": 10.035486160397445, + "grad_norm": 6.122970104217529, + "learning_rate": 8.996891412349184e-05, + 
"loss": 0.014646390080451965, + "step": 70700 + }, + { + "epoch": 10.036905606813344, + "grad_norm": 0.029342738911509514, + "learning_rate": 8.996749467707594e-05, + "loss": 0.03428757190704346, + "step": 70710 + }, + { + "epoch": 10.03832505322924, + "grad_norm": 0.2870967388153076, + "learning_rate": 8.996607523066005e-05, + "loss": 0.014554601907730103, + "step": 70720 + }, + { + "epoch": 10.039744499645138, + "grad_norm": 0.4357489049434662, + "learning_rate": 8.996465578424415e-05, + "loss": 0.02026190161705017, + "step": 70730 + }, + { + "epoch": 10.041163946061037, + "grad_norm": 1.4404624700546265, + "learning_rate": 8.996323633782825e-05, + "loss": 0.07623617053031921, + "step": 70740 + }, + { + "epoch": 10.042583392476933, + "grad_norm": 9.76617431640625, + "learning_rate": 8.996181689141236e-05, + "loss": 0.04643445312976837, + "step": 70750 + }, + { + "epoch": 10.044002838892832, + "grad_norm": 0.0906580463051796, + "learning_rate": 8.996039744499646e-05, + "loss": 0.045586833357810976, + "step": 70760 + }, + { + "epoch": 10.04542228530873, + "grad_norm": 0.23739679157733917, + "learning_rate": 8.995897799858057e-05, + "loss": 0.017312943935394287, + "step": 70770 + }, + { + "epoch": 10.046841731724628, + "grad_norm": 0.541393518447876, + "learning_rate": 8.995755855216465e-05, + "loss": 0.013014046847820282, + "step": 70780 + }, + { + "epoch": 10.048261178140525, + "grad_norm": 0.27587655186653137, + "learning_rate": 8.995613910574876e-05, + "loss": 0.0030714210122823717, + "step": 70790 + }, + { + "epoch": 10.049680624556423, + "grad_norm": 0.08097139745950699, + "learning_rate": 8.995471965933286e-05, + "loss": 0.01676403880119324, + "step": 70800 + }, + { + "epoch": 10.051100070972321, + "grad_norm": 10.044310569763184, + "learning_rate": 8.995330021291697e-05, + "loss": 0.011085320264101028, + "step": 70810 + }, + { + "epoch": 10.052519517388218, + "grad_norm": 1.3171991109848022, + "learning_rate": 8.995188076650107e-05, + "loss": 
0.007198108732700348, + "step": 70820 + }, + { + "epoch": 10.053938963804116, + "grad_norm": 0.6077939867973328, + "learning_rate": 8.995046132008516e-05, + "loss": 0.019241389632225037, + "step": 70830 + }, + { + "epoch": 10.055358410220014, + "grad_norm": 1.554423213005066, + "learning_rate": 8.994904187366927e-05, + "loss": 0.009505119919776917, + "step": 70840 + }, + { + "epoch": 10.056777856635913, + "grad_norm": 0.0959242656826973, + "learning_rate": 8.994762242725337e-05, + "loss": 0.01176130622625351, + "step": 70850 + }, + { + "epoch": 10.05819730305181, + "grad_norm": 19.056350708007812, + "learning_rate": 8.994620298083748e-05, + "loss": 0.039687898755073545, + "step": 70860 + }, + { + "epoch": 10.059616749467708, + "grad_norm": 0.17465567588806152, + "learning_rate": 8.994478353442158e-05, + "loss": 0.010702335834503173, + "step": 70870 + }, + { + "epoch": 10.061036195883606, + "grad_norm": 1.6593632698059082, + "learning_rate": 8.994336408800568e-05, + "loss": 0.016061322391033174, + "step": 70880 + }, + { + "epoch": 10.062455642299502, + "grad_norm": 14.231645584106445, + "learning_rate": 8.994194464158978e-05, + "loss": 0.04368197321891785, + "step": 70890 + }, + { + "epoch": 10.0638750887154, + "grad_norm": 1.333464503288269, + "learning_rate": 8.994052519517389e-05, + "loss": 0.012594687938690185, + "step": 70900 + }, + { + "epoch": 10.065294535131299, + "grad_norm": 11.049664497375488, + "learning_rate": 8.993910574875798e-05, + "loss": 0.051619219779968264, + "step": 70910 + }, + { + "epoch": 10.066713981547197, + "grad_norm": 0.08700914680957794, + "learning_rate": 8.99376863023421e-05, + "loss": 0.049428775906562805, + "step": 70920 + }, + { + "epoch": 10.068133427963094, + "grad_norm": 0.23135952651500702, + "learning_rate": 8.993626685592619e-05, + "loss": 0.023401886224746704, + "step": 70930 + }, + { + "epoch": 10.069552874378992, + "grad_norm": 0.07406525313854218, + "learning_rate": 8.993484740951029e-05, + "loss": 0.060725486278533934, + 
"step": 70940 + }, + { + "epoch": 10.07097232079489, + "grad_norm": 2.472419261932373, + "learning_rate": 8.99334279630944e-05, + "loss": 0.01986803412437439, + "step": 70950 + }, + { + "epoch": 10.072391767210787, + "grad_norm": 0.574719250202179, + "learning_rate": 8.99320085166785e-05, + "loss": 0.043755120038986205, + "step": 70960 + }, + { + "epoch": 10.073811213626685, + "grad_norm": 0.35062700510025024, + "learning_rate": 8.993058907026261e-05, + "loss": 0.016467268764972686, + "step": 70970 + }, + { + "epoch": 10.075230660042584, + "grad_norm": 3.5769553184509277, + "learning_rate": 8.99291696238467e-05, + "loss": 0.02966910004615784, + "step": 70980 + }, + { + "epoch": 10.076650106458482, + "grad_norm": 0.09693639725446701, + "learning_rate": 8.99277501774308e-05, + "loss": 0.021013087034225462, + "step": 70990 + }, + { + "epoch": 10.078069552874378, + "grad_norm": 0.028018489480018616, + "learning_rate": 8.99263307310149e-05, + "loss": 0.04916905760765076, + "step": 71000 + }, + { + "epoch": 10.078069552874378, + "eval_accuracy": 0.9842309404209322, + "eval_loss": 0.05193416774272919, + "eval_runtime": 32.7864, + "eval_samples_per_second": 479.681, + "eval_steps_per_second": 15.006, + "step": 71000 + }, + { + "epoch": 10.079488999290277, + "grad_norm": 11.105819702148438, + "learning_rate": 8.992491128459901e-05, + "loss": 0.0558125376701355, + "step": 71010 + }, + { + "epoch": 10.080908445706175, + "grad_norm": 5.16521692276001, + "learning_rate": 8.992349183818312e-05, + "loss": 0.027687478065490722, + "step": 71020 + }, + { + "epoch": 10.082327892122072, + "grad_norm": 1.2156544923782349, + "learning_rate": 8.992207239176722e-05, + "loss": 0.05835610628128052, + "step": 71030 + }, + { + "epoch": 10.08374733853797, + "grad_norm": 11.860342979431152, + "learning_rate": 8.992065294535132e-05, + "loss": 0.06005517244338989, + "step": 71040 + }, + { + "epoch": 10.085166784953868, + "grad_norm": 1.9857722520828247, + "learning_rate": 8.991923349893541e-05, + 
"loss": 0.03460575938224793, + "step": 71050 + }, + { + "epoch": 10.086586231369767, + "grad_norm": 12.068072319030762, + "learning_rate": 8.991781405251953e-05, + "loss": 0.0850683569908142, + "step": 71060 + }, + { + "epoch": 10.088005677785663, + "grad_norm": 0.45789551734924316, + "learning_rate": 8.991639460610362e-05, + "loss": 0.07147586941719056, + "step": 71070 + }, + { + "epoch": 10.089425124201561, + "grad_norm": 0.7118040323257446, + "learning_rate": 8.991497515968773e-05, + "loss": 0.07116876244544983, + "step": 71080 + }, + { + "epoch": 10.09084457061746, + "grad_norm": 0.2730395495891571, + "learning_rate": 8.991355571327182e-05, + "loss": 0.011208267509937286, + "step": 71090 + }, + { + "epoch": 10.092264017033356, + "grad_norm": 0.616715669631958, + "learning_rate": 8.991213626685593e-05, + "loss": 0.01879979819059372, + "step": 71100 + }, + { + "epoch": 10.093683463449254, + "grad_norm": 0.4260369539260864, + "learning_rate": 8.991071682044004e-05, + "loss": 0.02226797640323639, + "step": 71110 + }, + { + "epoch": 10.095102909865153, + "grad_norm": 1.1055042743682861, + "learning_rate": 8.990929737402414e-05, + "loss": 0.04241999387741089, + "step": 71120 + }, + { + "epoch": 10.096522356281051, + "grad_norm": 0.2439938485622406, + "learning_rate": 8.990787792760825e-05, + "loss": 0.014829432964324952, + "step": 71130 + }, + { + "epoch": 10.097941802696948, + "grad_norm": 2.2205474376678467, + "learning_rate": 8.990645848119233e-05, + "loss": 0.043992698192596436, + "step": 71140 + }, + { + "epoch": 10.099361249112846, + "grad_norm": 3.324392318725586, + "learning_rate": 8.990503903477644e-05, + "loss": 0.007843706011772155, + "step": 71150 + }, + { + "epoch": 10.100780695528744, + "grad_norm": 5.776381015777588, + "learning_rate": 8.990361958836054e-05, + "loss": 0.06609423756599427, + "step": 71160 + }, + { + "epoch": 10.10220014194464, + "grad_norm": 3.8756496906280518, + "learning_rate": 8.990220014194465e-05, + "loss": 0.025007185339927674, + 
"step": 71170 + }, + { + "epoch": 10.103619588360539, + "grad_norm": 9.75920295715332, + "learning_rate": 8.990078069552875e-05, + "loss": 0.04297915697097778, + "step": 71180 + }, + { + "epoch": 10.105039034776437, + "grad_norm": 6.752058029174805, + "learning_rate": 8.989936124911285e-05, + "loss": 0.07576794624328613, + "step": 71190 + }, + { + "epoch": 10.106458481192336, + "grad_norm": 0.22006545960903168, + "learning_rate": 8.989794180269696e-05, + "loss": 0.08119171261787414, + "step": 71200 + }, + { + "epoch": 10.107877927608232, + "grad_norm": 12.223731994628906, + "learning_rate": 8.989652235628105e-05, + "loss": 0.025416919589042665, + "step": 71210 + }, + { + "epoch": 10.10929737402413, + "grad_norm": 1.204849362373352, + "learning_rate": 8.989510290986516e-05, + "loss": 0.0057471167296171185, + "step": 71220 + }, + { + "epoch": 10.110716820440029, + "grad_norm": 5.5199408531188965, + "learning_rate": 8.989368346344926e-05, + "loss": 0.02445787936449051, + "step": 71230 + }, + { + "epoch": 10.112136266855925, + "grad_norm": 0.5732960104942322, + "learning_rate": 8.989226401703336e-05, + "loss": 0.06102269887924194, + "step": 71240 + }, + { + "epoch": 10.113555713271824, + "grad_norm": 0.24469658732414246, + "learning_rate": 8.989084457061746e-05, + "loss": 0.07163866758346557, + "step": 71250 + }, + { + "epoch": 10.114975159687722, + "grad_norm": 10.344715118408203, + "learning_rate": 8.988942512420157e-05, + "loss": 0.020954841375350954, + "step": 71260 + }, + { + "epoch": 10.11639460610362, + "grad_norm": 0.6934405565261841, + "learning_rate": 8.988800567778567e-05, + "loss": 0.052423101663589475, + "step": 71270 + }, + { + "epoch": 10.117814052519517, + "grad_norm": 3.2811639308929443, + "learning_rate": 8.988658623136978e-05, + "loss": 0.05588293671607971, + "step": 71280 + }, + { + "epoch": 10.119233498935415, + "grad_norm": 0.6066336631774902, + "learning_rate": 8.988516678495387e-05, + "loss": 0.03658683001995087, + "step": 71290 + }, + { + 
"epoch": 10.120652945351313, + "grad_norm": 13.91519546508789, + "learning_rate": 8.988374733853797e-05, + "loss": 0.09927948117256165, + "step": 71300 + }, + { + "epoch": 10.12207239176721, + "grad_norm": 5.7359514236450195, + "learning_rate": 8.988232789212208e-05, + "loss": 0.05882952809333801, + "step": 71310 + }, + { + "epoch": 10.123491838183108, + "grad_norm": 9.058098793029785, + "learning_rate": 8.988090844570618e-05, + "loss": 0.0466866672039032, + "step": 71320 + }, + { + "epoch": 10.124911284599007, + "grad_norm": 0.7956753969192505, + "learning_rate": 8.987948899929029e-05, + "loss": 0.04202900826931, + "step": 71330 + }, + { + "epoch": 10.126330731014905, + "grad_norm": 1.190938949584961, + "learning_rate": 8.987806955287439e-05, + "loss": 0.05764939785003662, + "step": 71340 + }, + { + "epoch": 10.127750177430801, + "grad_norm": 0.05666103586554527, + "learning_rate": 8.987665010645849e-05, + "loss": 0.024899232387542724, + "step": 71350 + }, + { + "epoch": 10.1291696238467, + "grad_norm": 4.87168025970459, + "learning_rate": 8.987523066004258e-05, + "loss": 0.06777219772338867, + "step": 71360 + }, + { + "epoch": 10.130589070262598, + "grad_norm": 1.0274242162704468, + "learning_rate": 8.98738112136267e-05, + "loss": 0.01196812316775322, + "step": 71370 + }, + { + "epoch": 10.132008516678495, + "grad_norm": 0.10885701328516006, + "learning_rate": 8.987239176721079e-05, + "loss": 0.0149383544921875, + "step": 71380 + }, + { + "epoch": 10.133427963094393, + "grad_norm": 4.033825397491455, + "learning_rate": 8.98709723207949e-05, + "loss": 0.013677287101745605, + "step": 71390 + }, + { + "epoch": 10.134847409510291, + "grad_norm": 0.4323374032974243, + "learning_rate": 8.9869552874379e-05, + "loss": 0.03762426972389221, + "step": 71400 + }, + { + "epoch": 10.13626685592619, + "grad_norm": 0.06930834800004959, + "learning_rate": 8.98681334279631e-05, + "loss": 0.009782253205776215, + "step": 71410 + }, + { + "epoch": 10.137686302342086, + "grad_norm": 
0.7295514345169067, + "learning_rate": 8.986671398154721e-05, + "loss": 0.016273342072963715, + "step": 71420 + }, + { + "epoch": 10.139105748757984, + "grad_norm": 7.975281238555908, + "learning_rate": 8.98652945351313e-05, + "loss": 0.05889239311218262, + "step": 71430 + }, + { + "epoch": 10.140525195173883, + "grad_norm": 0.8045636415481567, + "learning_rate": 8.986387508871542e-05, + "loss": 0.04868484139442444, + "step": 71440 + }, + { + "epoch": 10.14194464158978, + "grad_norm": 8.876813888549805, + "learning_rate": 8.98624556422995e-05, + "loss": 0.03207354545593262, + "step": 71450 + }, + { + "epoch": 10.143364088005677, + "grad_norm": 8.953425407409668, + "learning_rate": 8.986103619588361e-05, + "loss": 0.05620843172073364, + "step": 71460 + }, + { + "epoch": 10.144783534421576, + "grad_norm": 8.15450382232666, + "learning_rate": 8.985961674946771e-05, + "loss": 0.0524405837059021, + "step": 71470 + }, + { + "epoch": 10.146202980837474, + "grad_norm": 8.996185302734375, + "learning_rate": 8.985819730305182e-05, + "loss": 0.011744706332683564, + "step": 71480 + }, + { + "epoch": 10.14762242725337, + "grad_norm": 0.6976650357246399, + "learning_rate": 8.985677785663592e-05, + "loss": 0.006831346452236176, + "step": 71490 + }, + { + "epoch": 10.149041873669269, + "grad_norm": 1.2598222494125366, + "learning_rate": 8.985535841022001e-05, + "loss": 0.018995559215545653, + "step": 71500 + }, + { + "epoch": 10.149041873669269, + "eval_accuracy": 0.9872194315508361, + "eval_loss": 0.042244430631399155, + "eval_runtime": 32.7379, + "eval_samples_per_second": 480.392, + "eval_steps_per_second": 15.028, + "step": 71500 + }, + { + "epoch": 10.150461320085167, + "grad_norm": 0.495564728975296, + "learning_rate": 8.985393896380412e-05, + "loss": 0.0047418631613254545, + "step": 71510 + }, + { + "epoch": 10.151880766501064, + "grad_norm": 3.843820095062256, + "learning_rate": 8.985251951738822e-05, + "loss": 0.032132938504219055, + "step": 71520 + }, + { + "epoch": 
10.153300212916962, + "grad_norm": 1.3583863973617554, + "learning_rate": 8.985110007097233e-05, + "loss": 0.013863271474838257, + "step": 71530 + }, + { + "epoch": 10.15471965933286, + "grad_norm": 9.500256538391113, + "learning_rate": 8.984968062455643e-05, + "loss": 0.024224182963371275, + "step": 71540 + }, + { + "epoch": 10.156139105748759, + "grad_norm": 5.837913990020752, + "learning_rate": 8.984826117814053e-05, + "loss": 0.015855035185813902, + "step": 71550 + }, + { + "epoch": 10.157558552164655, + "grad_norm": 2.1019768714904785, + "learning_rate": 8.984684173172462e-05, + "loss": 0.01872989535331726, + "step": 71560 + }, + { + "epoch": 10.158977998580554, + "grad_norm": 4.689863681793213, + "learning_rate": 8.984542228530874e-05, + "loss": 0.011310167610645294, + "step": 71570 + }, + { + "epoch": 10.160397444996452, + "grad_norm": 3.9470810890197754, + "learning_rate": 8.984400283889283e-05, + "loss": 0.019275748729705812, + "step": 71580 + }, + { + "epoch": 10.161816891412348, + "grad_norm": 6.351150989532471, + "learning_rate": 8.984258339247694e-05, + "loss": 0.016378867626190185, + "step": 71590 + }, + { + "epoch": 10.163236337828247, + "grad_norm": 6.803494930267334, + "learning_rate": 8.984116394606104e-05, + "loss": 0.03146334290504456, + "step": 71600 + }, + { + "epoch": 10.164655784244145, + "grad_norm": 10.133057594299316, + "learning_rate": 8.983974449964514e-05, + "loss": 0.06289324760437012, + "step": 71610 + }, + { + "epoch": 10.166075230660043, + "grad_norm": 0.8119639158248901, + "learning_rate": 8.983832505322925e-05, + "loss": 0.03100758194923401, + "step": 71620 + }, + { + "epoch": 10.16749467707594, + "grad_norm": 0.7174460291862488, + "learning_rate": 8.983690560681335e-05, + "loss": 0.013892364501953126, + "step": 71630 + }, + { + "epoch": 10.168914123491838, + "grad_norm": 4.801692962646484, + "learning_rate": 8.983548616039746e-05, + "loss": 0.012847928702831269, + "step": 71640 + }, + { + "epoch": 10.170333569907736, + 
"grad_norm": 0.08614791929721832, + "learning_rate": 8.983406671398154e-05, + "loss": 0.03195511400699615, + "step": 71650 + }, + { + "epoch": 10.171753016323633, + "grad_norm": 0.3636281490325928, + "learning_rate": 8.983264726756565e-05, + "loss": 0.014664243161678314, + "step": 71660 + }, + { + "epoch": 10.173172462739531, + "grad_norm": 0.8303320407867432, + "learning_rate": 8.983122782114975e-05, + "loss": 0.023724818229675294, + "step": 71670 + }, + { + "epoch": 10.17459190915543, + "grad_norm": 0.06572652608156204, + "learning_rate": 8.982980837473386e-05, + "loss": 0.027840864658355714, + "step": 71680 + }, + { + "epoch": 10.176011355571328, + "grad_norm": 0.05816236510872841, + "learning_rate": 8.982838892831796e-05, + "loss": 0.009276087582111358, + "step": 71690 + }, + { + "epoch": 10.177430801987224, + "grad_norm": 0.15733098983764648, + "learning_rate": 8.982711142654366e-05, + "loss": 0.04615362286567688, + "step": 71700 + }, + { + "epoch": 10.178850248403123, + "grad_norm": 0.4171859323978424, + "learning_rate": 8.982569198012775e-05, + "loss": 0.06214319467544556, + "step": 71710 + }, + { + "epoch": 10.180269694819021, + "grad_norm": 0.051216304302215576, + "learning_rate": 8.982427253371187e-05, + "loss": 0.04424688518047333, + "step": 71720 + }, + { + "epoch": 10.181689141234918, + "grad_norm": 1.8618922233581543, + "learning_rate": 8.982285308729595e-05, + "loss": 0.015009069442749023, + "step": 71730 + }, + { + "epoch": 10.183108587650816, + "grad_norm": 7.354833602905273, + "learning_rate": 8.982143364088006e-05, + "loss": 0.06012805700302124, + "step": 71740 + }, + { + "epoch": 10.184528034066714, + "grad_norm": 0.7485067248344421, + "learning_rate": 8.982001419446416e-05, + "loss": 0.04865095615386963, + "step": 71750 + }, + { + "epoch": 10.185947480482612, + "grad_norm": 0.6087238788604736, + "learning_rate": 8.981859474804827e-05, + "loss": 0.052143925428390504, + "step": 71760 + }, + { + "epoch": 10.187366926898509, + "grad_norm": 
2.8317716121673584, + "learning_rate": 8.981717530163238e-05, + "loss": 0.0342563271522522, + "step": 71770 + }, + { + "epoch": 10.188786373314407, + "grad_norm": 0.08310432732105255, + "learning_rate": 8.981575585521646e-05, + "loss": 0.04148037135601044, + "step": 71780 + }, + { + "epoch": 10.190205819730306, + "grad_norm": 4.928909778594971, + "learning_rate": 8.981433640880057e-05, + "loss": 0.04418375790119171, + "step": 71790 + }, + { + "epoch": 10.191625266146202, + "grad_norm": 0.2593403160572052, + "learning_rate": 8.981291696238467e-05, + "loss": 0.039079904556274414, + "step": 71800 + }, + { + "epoch": 10.1930447125621, + "grad_norm": 0.35453835129737854, + "learning_rate": 8.981149751596878e-05, + "loss": 0.02273627519607544, + "step": 71810 + }, + { + "epoch": 10.194464158977999, + "grad_norm": 14.605096817016602, + "learning_rate": 8.981007806955288e-05, + "loss": 0.07609878778457642, + "step": 71820 + }, + { + "epoch": 10.195883605393897, + "grad_norm": 0.33558687567710876, + "learning_rate": 8.980865862313698e-05, + "loss": 0.027788797020912172, + "step": 71830 + }, + { + "epoch": 10.197303051809794, + "grad_norm": 0.15005749464035034, + "learning_rate": 8.980723917672107e-05, + "loss": 0.017736873030662535, + "step": 71840 + }, + { + "epoch": 10.198722498225692, + "grad_norm": 11.601777076721191, + "learning_rate": 8.980581973030519e-05, + "loss": 0.051431238651275635, + "step": 71850 + }, + { + "epoch": 10.20014194464159, + "grad_norm": 0.3003895878791809, + "learning_rate": 8.98044002838893e-05, + "loss": 0.009282226115465165, + "step": 71860 + }, + { + "epoch": 10.201561391057487, + "grad_norm": 3.5037968158721924, + "learning_rate": 8.98029808374734e-05, + "loss": 0.029530704021453857, + "step": 71870 + }, + { + "epoch": 10.202980837473385, + "grad_norm": 11.290355682373047, + "learning_rate": 8.980156139105749e-05, + "loss": 0.02697465717792511, + "step": 71880 + }, + { + "epoch": 10.204400283889283, + "grad_norm": 6.993745803833008, + 
"learning_rate": 8.980014194464159e-05, + "loss": 0.021236954629421233, + "step": 71890 + }, + { + "epoch": 10.205819730305182, + "grad_norm": 2.8136534690856934, + "learning_rate": 8.97987224982257e-05, + "loss": 0.02050168514251709, + "step": 71900 + }, + { + "epoch": 10.207239176721078, + "grad_norm": 1.45379638671875, + "learning_rate": 8.97973030518098e-05, + "loss": 0.02213282585144043, + "step": 71910 + }, + { + "epoch": 10.208658623136976, + "grad_norm": 0.5076923370361328, + "learning_rate": 8.979588360539391e-05, + "loss": 0.030324968695640563, + "step": 71920 + }, + { + "epoch": 10.210078069552875, + "grad_norm": 7.311733245849609, + "learning_rate": 8.979446415897799e-05, + "loss": 0.03139882981777191, + "step": 71930 + }, + { + "epoch": 10.211497515968771, + "grad_norm": 0.35405999422073364, + "learning_rate": 8.97930447125621e-05, + "loss": 0.014943568408489228, + "step": 71940 + }, + { + "epoch": 10.21291696238467, + "grad_norm": 8.592523574829102, + "learning_rate": 8.979162526614621e-05, + "loss": 0.04479727745056152, + "step": 71950 + }, + { + "epoch": 10.214336408800568, + "grad_norm": 0.3105320334434509, + "learning_rate": 8.979020581973031e-05, + "loss": 0.041830265522003175, + "step": 71960 + }, + { + "epoch": 10.215755855216466, + "grad_norm": 0.29249778389930725, + "learning_rate": 8.978878637331442e-05, + "loss": 0.0695478618144989, + "step": 71970 + }, + { + "epoch": 10.217175301632363, + "grad_norm": 5.5391764640808105, + "learning_rate": 8.97873669268985e-05, + "loss": 0.06145778298377991, + "step": 71980 + }, + { + "epoch": 10.218594748048261, + "grad_norm": 1.5610414743423462, + "learning_rate": 8.978594748048262e-05, + "loss": 0.02754751741886139, + "step": 71990 + }, + { + "epoch": 10.22001419446416, + "grad_norm": 0.09087704867124557, + "learning_rate": 8.978452803406671e-05, + "loss": 0.034285178780555724, + "step": 72000 + }, + { + "epoch": 10.22001419446416, + "eval_accuracy": 0.9847396197621924, + "eval_loss": 
0.05170145630836487, + "eval_runtime": 32.8652, + "eval_samples_per_second": 478.531, + "eval_steps_per_second": 14.97, + "step": 72000 + }, + { + "epoch": 10.221433640880056, + "grad_norm": 0.021180758252739906, + "learning_rate": 8.978310858765082e-05, + "loss": 0.057250940799713136, + "step": 72010 + }, + { + "epoch": 10.222853087295954, + "grad_norm": 6.0435075759887695, + "learning_rate": 8.978168914123492e-05, + "loss": 0.04329615831375122, + "step": 72020 + }, + { + "epoch": 10.224272533711853, + "grad_norm": 0.007519877981394529, + "learning_rate": 8.978026969481903e-05, + "loss": 0.023879942297935487, + "step": 72030 + }, + { + "epoch": 10.22569198012775, + "grad_norm": 4.61924934387207, + "learning_rate": 8.977885024840312e-05, + "loss": 0.05088481307029724, + "step": 72040 + }, + { + "epoch": 10.227111426543647, + "grad_norm": 4.466940402984619, + "learning_rate": 8.977743080198723e-05, + "loss": 0.015369561314582825, + "step": 72050 + }, + { + "epoch": 10.228530872959546, + "grad_norm": 0.04262328892946243, + "learning_rate": 8.977601135557134e-05, + "loss": 0.031910020112991336, + "step": 72060 + }, + { + "epoch": 10.229950319375444, + "grad_norm": 1.2850419282913208, + "learning_rate": 8.977459190915544e-05, + "loss": 0.04279256463050842, + "step": 72070 + }, + { + "epoch": 10.231369765791342, + "grad_norm": 1.3325414657592773, + "learning_rate": 8.977317246273955e-05, + "loss": 0.020931917428970336, + "step": 72080 + }, + { + "epoch": 10.232789212207239, + "grad_norm": 1.0753977298736572, + "learning_rate": 8.977175301632363e-05, + "loss": 0.008612241595983505, + "step": 72090 + }, + { + "epoch": 10.234208658623137, + "grad_norm": 0.07489815354347229, + "learning_rate": 8.977033356990774e-05, + "loss": 0.034203407168388364, + "step": 72100 + }, + { + "epoch": 10.235628105039035, + "grad_norm": 3.5092709064483643, + "learning_rate": 8.976891412349184e-05, + "loss": 0.02857227623462677, + "step": 72110 + }, + { + "epoch": 10.237047551454932, + 
"grad_norm": 4.87595796585083, + "learning_rate": 8.976749467707595e-05, + "loss": 0.03527629375457764, + "step": 72120 + }, + { + "epoch": 10.23846699787083, + "grad_norm": 2.1418490409851074, + "learning_rate": 8.976607523066005e-05, + "loss": 0.05000826716423035, + "step": 72130 + }, + { + "epoch": 10.239886444286729, + "grad_norm": 0.8071398138999939, + "learning_rate": 8.976465578424414e-05, + "loss": 0.009712295234203338, + "step": 72140 + }, + { + "epoch": 10.241305890702627, + "grad_norm": 0.32705071568489075, + "learning_rate": 8.976323633782826e-05, + "loss": 0.05061535239219665, + "step": 72150 + }, + { + "epoch": 10.242725337118523, + "grad_norm": 0.4660518765449524, + "learning_rate": 8.976181689141235e-05, + "loss": 0.05338441133499146, + "step": 72160 + }, + { + "epoch": 10.244144783534422, + "grad_norm": 0.027236519381403923, + "learning_rate": 8.976039744499646e-05, + "loss": 0.022108878195285796, + "step": 72170 + }, + { + "epoch": 10.24556422995032, + "grad_norm": 0.14898504316806793, + "learning_rate": 8.975897799858056e-05, + "loss": 0.025085175037384035, + "step": 72180 + }, + { + "epoch": 10.246983676366217, + "grad_norm": 2.2338013648986816, + "learning_rate": 8.975755855216466e-05, + "loss": 0.01661747694015503, + "step": 72190 + }, + { + "epoch": 10.248403122782115, + "grad_norm": 9.279471397399902, + "learning_rate": 8.975613910574876e-05, + "loss": 0.06610844731330871, + "step": 72200 + }, + { + "epoch": 10.249822569198013, + "grad_norm": 0.07761373370885849, + "learning_rate": 8.975471965933287e-05, + "loss": 0.02264375686645508, + "step": 72210 + }, + { + "epoch": 10.251242015613911, + "grad_norm": 8.15989875793457, + "learning_rate": 8.975330021291696e-05, + "loss": 0.048688432574272154, + "step": 72220 + }, + { + "epoch": 10.252661462029808, + "grad_norm": 6.95366096496582, + "learning_rate": 8.975188076650108e-05, + "loss": 0.032186472415924074, + "step": 72230 + }, + { + "epoch": 10.254080908445706, + "grad_norm": 
14.51134204864502, + "learning_rate": 8.975046132008517e-05, + "loss": 0.022816789150238038, + "step": 72240 + }, + { + "epoch": 10.255500354861605, + "grad_norm": 0.10701218247413635, + "learning_rate": 8.974904187366927e-05, + "loss": 0.03494252264499664, + "step": 72250 + }, + { + "epoch": 10.256919801277501, + "grad_norm": 3.7930524349212646, + "learning_rate": 8.974762242725338e-05, + "loss": 0.024190935492515563, + "step": 72260 + }, + { + "epoch": 10.2583392476934, + "grad_norm": 1.5398367643356323, + "learning_rate": 8.974620298083748e-05, + "loss": 0.01877433657646179, + "step": 72270 + }, + { + "epoch": 10.259758694109298, + "grad_norm": 7.618338584899902, + "learning_rate": 8.974478353442159e-05, + "loss": 0.0600216805934906, + "step": 72280 + }, + { + "epoch": 10.261178140525196, + "grad_norm": 9.979314804077148, + "learning_rate": 8.974336408800567e-05, + "loss": 0.03386954665184021, + "step": 72290 + }, + { + "epoch": 10.262597586941093, + "grad_norm": 11.369985580444336, + "learning_rate": 8.974194464158978e-05, + "loss": 0.06406527757644653, + "step": 72300 + }, + { + "epoch": 10.264017033356991, + "grad_norm": 3.3756444454193115, + "learning_rate": 8.974052519517388e-05, + "loss": 0.03174246549606323, + "step": 72310 + }, + { + "epoch": 10.26543647977289, + "grad_norm": 0.29984939098358154, + "learning_rate": 8.973910574875799e-05, + "loss": 0.026856064796447754, + "step": 72320 + }, + { + "epoch": 10.266855926188786, + "grad_norm": 3.937253952026367, + "learning_rate": 8.973768630234209e-05, + "loss": 0.02418387681245804, + "step": 72330 + }, + { + "epoch": 10.268275372604684, + "grad_norm": 0.4848698675632477, + "learning_rate": 8.973626685592619e-05, + "loss": 0.04492365121841431, + "step": 72340 + }, + { + "epoch": 10.269694819020582, + "grad_norm": 0.23468013107776642, + "learning_rate": 8.97348474095103e-05, + "loss": 0.06487563252449036, + "step": 72350 + }, + { + "epoch": 10.27111426543648, + "grad_norm": 13.240750312805176, + 
"learning_rate": 8.97334279630944e-05, + "loss": 0.0500341534614563, + "step": 72360 + }, + { + "epoch": 10.272533711852377, + "grad_norm": 5.729434013366699, + "learning_rate": 8.97320085166785e-05, + "loss": 0.05774502754211426, + "step": 72370 + }, + { + "epoch": 10.273953158268275, + "grad_norm": 2.5826942920684814, + "learning_rate": 8.97305890702626e-05, + "loss": 0.027796417474746704, + "step": 72380 + }, + { + "epoch": 10.275372604684174, + "grad_norm": 0.07748646289110184, + "learning_rate": 8.972916962384671e-05, + "loss": 0.033330485224723816, + "step": 72390 + }, + { + "epoch": 10.27679205110007, + "grad_norm": 0.12528225779533386, + "learning_rate": 8.97277501774308e-05, + "loss": 0.020797872543334962, + "step": 72400 + }, + { + "epoch": 10.278211497515969, + "grad_norm": 0.2537434995174408, + "learning_rate": 8.972633073101491e-05, + "loss": 0.04087269008159637, + "step": 72410 + }, + { + "epoch": 10.279630943931867, + "grad_norm": 2.3561251163482666, + "learning_rate": 8.9724911284599e-05, + "loss": 0.04362359642982483, + "step": 72420 + }, + { + "epoch": 10.281050390347765, + "grad_norm": 0.16754914820194244, + "learning_rate": 8.972349183818312e-05, + "loss": 0.006162405386567116, + "step": 72430 + }, + { + "epoch": 10.282469836763662, + "grad_norm": 0.5426493883132935, + "learning_rate": 8.972207239176722e-05, + "loss": 0.04110488295555115, + "step": 72440 + }, + { + "epoch": 10.28388928317956, + "grad_norm": 3.838188409805298, + "learning_rate": 8.972065294535131e-05, + "loss": 0.06003108024597168, + "step": 72450 + }, + { + "epoch": 10.285308729595458, + "grad_norm": 0.3094750940799713, + "learning_rate": 8.971923349893542e-05, + "loss": 0.036252951622009276, + "step": 72460 + }, + { + "epoch": 10.286728176011355, + "grad_norm": 1.1412874460220337, + "learning_rate": 8.971781405251952e-05, + "loss": 0.02480601370334625, + "step": 72470 + }, + { + "epoch": 10.288147622427253, + "grad_norm": 3.882920265197754, + "learning_rate": 
8.971639460610363e-05, + "loss": 0.056429213285446166, + "step": 72480 + }, + { + "epoch": 10.289567068843152, + "grad_norm": 0.4824908673763275, + "learning_rate": 8.971497515968773e-05, + "loss": 0.04092736542224884, + "step": 72490 + }, + { + "epoch": 10.29098651525905, + "grad_norm": 0.3222559988498688, + "learning_rate": 8.971355571327183e-05, + "loss": 0.09585509300231934, + "step": 72500 + }, + { + "epoch": 10.29098651525905, + "eval_accuracy": 0.9814332040440008, + "eval_loss": 0.06251820921897888, + "eval_runtime": 33.2932, + "eval_samples_per_second": 472.378, + "eval_steps_per_second": 14.778, + "step": 72500 + }, + { + "epoch": 10.292405961674946, + "grad_norm": 11.763492584228516, + "learning_rate": 8.971213626685592e-05, + "loss": 0.01904451847076416, + "step": 72510 + }, + { + "epoch": 10.293825408090845, + "grad_norm": 0.5626010298728943, + "learning_rate": 8.971071682044003e-05, + "loss": 0.01258438527584076, + "step": 72520 + }, + { + "epoch": 10.295244854506743, + "grad_norm": 5.586578845977783, + "learning_rate": 8.970929737402413e-05, + "loss": 0.028937163949012756, + "step": 72530 + }, + { + "epoch": 10.29666430092264, + "grad_norm": 4.926784515380859, + "learning_rate": 8.970787792760824e-05, + "loss": 0.05953345894813537, + "step": 72540 + }, + { + "epoch": 10.298083747338538, + "grad_norm": 0.07617692649364471, + "learning_rate": 8.970645848119234e-05, + "loss": 0.024326160550117493, + "step": 72550 + }, + { + "epoch": 10.299503193754436, + "grad_norm": 0.09429468214511871, + "learning_rate": 8.970503903477644e-05, + "loss": 0.016668303310871123, + "step": 72560 + }, + { + "epoch": 10.300922640170334, + "grad_norm": 0.03485213965177536, + "learning_rate": 8.970361958836055e-05, + "loss": 0.02714379131793976, + "step": 72570 + }, + { + "epoch": 10.302342086586231, + "grad_norm": 0.046538855880498886, + "learning_rate": 8.970220014194465e-05, + "loss": 0.03082246482372284, + "step": 72580 + }, + { + "epoch": 10.30376153300213, + "grad_norm": 
0.529966413974762, + "learning_rate": 8.970078069552876e-05, + "loss": 0.010735002905130386, + "step": 72590 + }, + { + "epoch": 10.305180979418028, + "grad_norm": 1.1207096576690674, + "learning_rate": 8.969936124911284e-05, + "loss": 0.007675926387310028, + "step": 72600 + }, + { + "epoch": 10.306600425833924, + "grad_norm": 0.2528717517852783, + "learning_rate": 8.969794180269695e-05, + "loss": 0.027586179971694946, + "step": 72610 + }, + { + "epoch": 10.308019872249822, + "grad_norm": 2.0045723915100098, + "learning_rate": 8.969652235628105e-05, + "loss": 0.025220289826393127, + "step": 72620 + }, + { + "epoch": 10.30943931866572, + "grad_norm": 1.069236397743225, + "learning_rate": 8.969510290986516e-05, + "loss": 0.02478688657283783, + "step": 72630 + }, + { + "epoch": 10.310858765081619, + "grad_norm": 3.645983934402466, + "learning_rate": 8.969368346344926e-05, + "loss": 0.02384749799966812, + "step": 72640 + }, + { + "epoch": 10.312278211497516, + "grad_norm": 0.055776119232177734, + "learning_rate": 8.969226401703335e-05, + "loss": 0.025250345468521118, + "step": 72650 + }, + { + "epoch": 10.313697657913414, + "grad_norm": 1.7379151582717896, + "learning_rate": 8.969084457061747e-05, + "loss": 0.039242887496948244, + "step": 72660 + }, + { + "epoch": 10.315117104329312, + "grad_norm": 7.7128472328186035, + "learning_rate": 8.968942512420156e-05, + "loss": 0.04973468780517578, + "step": 72670 + }, + { + "epoch": 10.316536550745209, + "grad_norm": 0.4700365662574768, + "learning_rate": 8.968800567778567e-05, + "loss": 0.035573115944862364, + "step": 72680 + }, + { + "epoch": 10.317955997161107, + "grad_norm": 6.4476318359375, + "learning_rate": 8.968658623136977e-05, + "loss": 0.019644205272197724, + "step": 72690 + }, + { + "epoch": 10.319375443577005, + "grad_norm": 5.906562805175781, + "learning_rate": 8.968516678495387e-05, + "loss": 0.025435513257980345, + "step": 72700 + }, + { + "epoch": 10.320794889992904, + "grad_norm": 0.21683073043823242, + 
"learning_rate": 8.968374733853797e-05, + "loss": 0.06322475075721741, + "step": 72710 + }, + { + "epoch": 10.3222143364088, + "grad_norm": 1.5359755754470825, + "learning_rate": 8.968232789212208e-05, + "loss": 0.011539919674396515, + "step": 72720 + }, + { + "epoch": 10.323633782824698, + "grad_norm": 0.12440875917673111, + "learning_rate": 8.968090844570617e-05, + "loss": 0.043030500411987305, + "step": 72730 + }, + { + "epoch": 10.325053229240597, + "grad_norm": 4.936128616333008, + "learning_rate": 8.967948899929029e-05, + "loss": 0.018066060543060303, + "step": 72740 + }, + { + "epoch": 10.326472675656493, + "grad_norm": 0.04510660469532013, + "learning_rate": 8.967806955287438e-05, + "loss": 0.019599223136901857, + "step": 72750 + }, + { + "epoch": 10.327892122072392, + "grad_norm": 0.1886427402496338, + "learning_rate": 8.967665010645848e-05, + "loss": 0.04775834977626801, + "step": 72760 + }, + { + "epoch": 10.32931156848829, + "grad_norm": 0.006249363534152508, + "learning_rate": 8.967523066004259e-05, + "loss": 0.025642585754394532, + "step": 72770 + }, + { + "epoch": 10.330731014904188, + "grad_norm": 0.16167527437210083, + "learning_rate": 8.967381121362669e-05, + "loss": 0.024762631952762605, + "step": 72780 + }, + { + "epoch": 10.332150461320085, + "grad_norm": 1.3200987577438354, + "learning_rate": 8.96723917672108e-05, + "loss": 0.0683577299118042, + "step": 72790 + }, + { + "epoch": 10.333569907735983, + "grad_norm": 0.19871366024017334, + "learning_rate": 8.96709723207949e-05, + "loss": 0.026114186644554137, + "step": 72800 + }, + { + "epoch": 10.334989354151881, + "grad_norm": 3.256074905395508, + "learning_rate": 8.9669552874379e-05, + "loss": 0.020235490798950196, + "step": 72810 + }, + { + "epoch": 10.336408800567778, + "grad_norm": 4.438711643218994, + "learning_rate": 8.966813342796309e-05, + "loss": 0.025841870903968812, + "step": 72820 + }, + { + "epoch": 10.337828246983676, + "grad_norm": 6.780463695526123, + "learning_rate": 
8.96667139815472e-05, + "loss": 0.06629498600959778, + "step": 72830 + }, + { + "epoch": 10.339247693399575, + "grad_norm": 5.759703159332275, + "learning_rate": 8.96652945351313e-05, + "loss": 0.05249155759811401, + "step": 72840 + }, + { + "epoch": 10.340667139815473, + "grad_norm": 6.208014011383057, + "learning_rate": 8.966387508871541e-05, + "loss": 0.029995641112327574, + "step": 72850 + }, + { + "epoch": 10.34208658623137, + "grad_norm": 0.3385080099105835, + "learning_rate": 8.966245564229951e-05, + "loss": 0.027232617139816284, + "step": 72860 + }, + { + "epoch": 10.343506032647268, + "grad_norm": 0.02547670714557171, + "learning_rate": 8.96610361958836e-05, + "loss": 0.006107653677463532, + "step": 72870 + }, + { + "epoch": 10.344925479063166, + "grad_norm": 1.217483401298523, + "learning_rate": 8.965961674946772e-05, + "loss": 0.02872964143753052, + "step": 72880 + }, + { + "epoch": 10.346344925479062, + "grad_norm": 3.2974629402160645, + "learning_rate": 8.965819730305181e-05, + "loss": 0.030625206232070924, + "step": 72890 + }, + { + "epoch": 10.34776437189496, + "grad_norm": 1.686771273612976, + "learning_rate": 8.965677785663592e-05, + "loss": 0.03720858991146088, + "step": 72900 + }, + { + "epoch": 10.349183818310859, + "grad_norm": 0.4160674214363098, + "learning_rate": 8.965535841022001e-05, + "loss": 0.03108443021774292, + "step": 72910 + }, + { + "epoch": 10.350603264726757, + "grad_norm": 0.33811572194099426, + "learning_rate": 8.965393896380412e-05, + "loss": 0.07100291848182679, + "step": 72920 + }, + { + "epoch": 10.352022711142654, + "grad_norm": 3.747161388397217, + "learning_rate": 8.965251951738822e-05, + "loss": 0.024768516421318054, + "step": 72930 + }, + { + "epoch": 10.353442157558552, + "grad_norm": 11.699270248413086, + "learning_rate": 8.965110007097233e-05, + "loss": 0.03598522841930389, + "step": 72940 + }, + { + "epoch": 10.35486160397445, + "grad_norm": 0.11568176746368408, + "learning_rate": 8.964968062455643e-05, + "loss": 
0.030876615643501283, + "step": 72950 + }, + { + "epoch": 10.356281050390347, + "grad_norm": 0.1701243370771408, + "learning_rate": 8.964826117814052e-05, + "loss": 0.03053068518638611, + "step": 72960 + }, + { + "epoch": 10.357700496806245, + "grad_norm": 0.4134175777435303, + "learning_rate": 8.964684173172463e-05, + "loss": 0.022406129539012908, + "step": 72970 + }, + { + "epoch": 10.359119943222144, + "grad_norm": 9.19906997680664, + "learning_rate": 8.964542228530873e-05, + "loss": 0.05493233203887939, + "step": 72980 + }, + { + "epoch": 10.360539389638042, + "grad_norm": 11.903061866760254, + "learning_rate": 8.964400283889284e-05, + "loss": 0.07393259406089783, + "step": 72990 + }, + { + "epoch": 10.361958836053939, + "grad_norm": 3.575417995452881, + "learning_rate": 8.964258339247694e-05, + "loss": 0.06495519280433655, + "step": 73000 + }, + { + "epoch": 10.361958836053939, + "eval_accuracy": 0.9803522604438227, + "eval_loss": 0.06612089276313782, + "eval_runtime": 32.161, + "eval_samples_per_second": 489.009, + "eval_steps_per_second": 15.298, + "step": 73000 + }, + { + "epoch": 10.363378282469837, + "grad_norm": 0.9812063574790955, + "learning_rate": 8.964116394606104e-05, + "loss": 0.039226147532463077, + "step": 73010 + }, + { + "epoch": 10.364797728885735, + "grad_norm": 0.3453628718852997, + "learning_rate": 8.963974449964513e-05, + "loss": 0.03408626914024353, + "step": 73020 + }, + { + "epoch": 10.366217175301632, + "grad_norm": 5.1603899002075195, + "learning_rate": 8.963832505322924e-05, + "loss": 0.07247533798217773, + "step": 73030 + }, + { + "epoch": 10.36763662171753, + "grad_norm": 4.564572811126709, + "learning_rate": 8.963690560681334e-05, + "loss": 0.02123674303293228, + "step": 73040 + }, + { + "epoch": 10.369056068133428, + "grad_norm": 2.140882968902588, + "learning_rate": 8.963548616039745e-05, + "loss": 0.02011290192604065, + "step": 73050 + }, + { + "epoch": 10.370475514549327, + "grad_norm": 0.5259304046630859, + "learning_rate": 
8.963406671398155e-05, + "loss": 0.03429713249206543, + "step": 73060 + }, + { + "epoch": 10.371894960965223, + "grad_norm": 14.053521156311035, + "learning_rate": 8.963264726756565e-05, + "loss": 0.0851962685585022, + "step": 73070 + }, + { + "epoch": 10.373314407381121, + "grad_norm": 0.11945252865552902, + "learning_rate": 8.963122782114976e-05, + "loss": 0.02100173830986023, + "step": 73080 + }, + { + "epoch": 10.37473385379702, + "grad_norm": 1.8677468299865723, + "learning_rate": 8.962980837473386e-05, + "loss": 0.042025390267372134, + "step": 73090 + }, + { + "epoch": 10.376153300212916, + "grad_norm": 7.363965034484863, + "learning_rate": 8.962838892831797e-05, + "loss": 0.024972128868103027, + "step": 73100 + }, + { + "epoch": 10.377572746628815, + "grad_norm": 7.26507568359375, + "learning_rate": 8.962696948190206e-05, + "loss": 0.055237317085266115, + "step": 73110 + }, + { + "epoch": 10.378992193044713, + "grad_norm": 0.9234290719032288, + "learning_rate": 8.962555003548616e-05, + "loss": 0.049257388710975646, + "step": 73120 + }, + { + "epoch": 10.380411639460611, + "grad_norm": 7.842380523681641, + "learning_rate": 8.962413058907026e-05, + "loss": 0.04204971194267273, + "step": 73130 + }, + { + "epoch": 10.381831085876508, + "grad_norm": 0.5620039701461792, + "learning_rate": 8.962271114265437e-05, + "loss": 0.006001041084527969, + "step": 73140 + }, + { + "epoch": 10.383250532292406, + "grad_norm": 3.0934367179870605, + "learning_rate": 8.962129169623847e-05, + "loss": 0.02211282551288605, + "step": 73150 + }, + { + "epoch": 10.384669978708304, + "grad_norm": 0.1924249231815338, + "learning_rate": 8.961987224982258e-05, + "loss": 0.012744960188865662, + "step": 73160 + }, + { + "epoch": 10.3860894251242, + "grad_norm": 3.350724697113037, + "learning_rate": 8.961845280340668e-05, + "loss": 0.022318266332149506, + "step": 73170 + }, + { + "epoch": 10.3875088715401, + "grad_norm": 3.635645866394043, + "learning_rate": 8.961703335699077e-05, + "loss": 
0.029640212655067444, + "step": 73180 + }, + { + "epoch": 10.388928317955997, + "grad_norm": 0.46407830715179443, + "learning_rate": 8.961561391057488e-05, + "loss": 0.024921415746212004, + "step": 73190 + }, + { + "epoch": 10.390347764371896, + "grad_norm": 10.320816993713379, + "learning_rate": 8.961419446415898e-05, + "loss": 0.0269631564617157, + "step": 73200 + }, + { + "epoch": 10.391767210787792, + "grad_norm": 2.758819818496704, + "learning_rate": 8.961277501774309e-05, + "loss": 0.04611527621746063, + "step": 73210 + }, + { + "epoch": 10.39318665720369, + "grad_norm": 4.595924377441406, + "learning_rate": 8.961135557132718e-05, + "loss": 0.05312431454658508, + "step": 73220 + }, + { + "epoch": 10.394606103619589, + "grad_norm": 5.965487957000732, + "learning_rate": 8.960993612491129e-05, + "loss": 0.05250157713890076, + "step": 73230 + }, + { + "epoch": 10.396025550035485, + "grad_norm": 2.658355951309204, + "learning_rate": 8.960851667849538e-05, + "loss": 0.03350549340248108, + "step": 73240 + }, + { + "epoch": 10.397444996451384, + "grad_norm": 0.023311669006943703, + "learning_rate": 8.96070972320795e-05, + "loss": 0.01648566424846649, + "step": 73250 + }, + { + "epoch": 10.398864442867282, + "grad_norm": 10.702779769897461, + "learning_rate": 8.96056777856636e-05, + "loss": 0.04944937825202942, + "step": 73260 + }, + { + "epoch": 10.40028388928318, + "grad_norm": 0.25531113147735596, + "learning_rate": 8.960425833924769e-05, + "loss": 0.03268125057220459, + "step": 73270 + }, + { + "epoch": 10.401703335699077, + "grad_norm": 0.44327834248542786, + "learning_rate": 8.96028388928318e-05, + "loss": 0.025926288962364197, + "step": 73280 + }, + { + "epoch": 10.403122782114975, + "grad_norm": 0.04333629086613655, + "learning_rate": 8.96014194464159e-05, + "loss": 0.02328411638736725, + "step": 73290 + }, + { + "epoch": 10.404542228530874, + "grad_norm": 0.1707431674003601, + "learning_rate": 8.960000000000001e-05, + "loss": 0.008144380897283554, + "step": 
73300 + }, + { + "epoch": 10.40596167494677, + "grad_norm": 2.7800562381744385, + "learning_rate": 8.959858055358411e-05, + "loss": 0.030040925741195677, + "step": 73310 + }, + { + "epoch": 10.407381121362668, + "grad_norm": 6.127976417541504, + "learning_rate": 8.95971611071682e-05, + "loss": 0.034268587827682495, + "step": 73320 + }, + { + "epoch": 10.408800567778567, + "grad_norm": 0.9095988869667053, + "learning_rate": 8.95957416607523e-05, + "loss": 0.036425772309303286, + "step": 73330 + }, + { + "epoch": 10.410220014194465, + "grad_norm": 9.289791107177734, + "learning_rate": 8.959432221433641e-05, + "loss": 0.07397414445877075, + "step": 73340 + }, + { + "epoch": 10.411639460610361, + "grad_norm": 0.14718365669250488, + "learning_rate": 8.959290276792052e-05, + "loss": 0.06551390886306763, + "step": 73350 + }, + { + "epoch": 10.41305890702626, + "grad_norm": 0.5513947010040283, + "learning_rate": 8.959148332150462e-05, + "loss": 0.018647877871990202, + "step": 73360 + }, + { + "epoch": 10.414478353442158, + "grad_norm": 10.3717622756958, + "learning_rate": 8.959006387508872e-05, + "loss": 0.0698345422744751, + "step": 73370 + }, + { + "epoch": 10.415897799858055, + "grad_norm": 1.1711900234222412, + "learning_rate": 8.958864442867282e-05, + "loss": 0.01738281548023224, + "step": 73380 + }, + { + "epoch": 10.417317246273953, + "grad_norm": 0.29801109433174133, + "learning_rate": 8.958722498225693e-05, + "loss": 0.02697826027870178, + "step": 73390 + }, + { + "epoch": 10.418736692689851, + "grad_norm": 1.2740803956985474, + "learning_rate": 8.958580553584102e-05, + "loss": 0.015290048718452454, + "step": 73400 + }, + { + "epoch": 10.42015613910575, + "grad_norm": 0.2341892421245575, + "learning_rate": 8.958438608942513e-05, + "loss": 0.07527807354927063, + "step": 73410 + }, + { + "epoch": 10.421575585521646, + "grad_norm": 0.5258547067642212, + "learning_rate": 8.958296664300922e-05, + "loss": 0.048480254411697385, + "step": 73420 + }, + { + "epoch": 
10.422995031937544, + "grad_norm": 1.7956501245498657, + "learning_rate": 8.958154719659333e-05, + "loss": 0.02461201250553131, + "step": 73430 + }, + { + "epoch": 10.424414478353443, + "grad_norm": 4.295741081237793, + "learning_rate": 8.958012775017744e-05, + "loss": 0.02888575792312622, + "step": 73440 + }, + { + "epoch": 10.42583392476934, + "grad_norm": 0.2158520519733429, + "learning_rate": 8.957870830376154e-05, + "loss": 0.009636881202459336, + "step": 73450 + }, + { + "epoch": 10.427253371185238, + "grad_norm": 0.5940867066383362, + "learning_rate": 8.957728885734565e-05, + "loss": 0.02917068898677826, + "step": 73460 + }, + { + "epoch": 10.428672817601136, + "grad_norm": 3.378797769546509, + "learning_rate": 8.957586941092975e-05, + "loss": 0.01072699874639511, + "step": 73470 + }, + { + "epoch": 10.430092264017034, + "grad_norm": 9.421317100524902, + "learning_rate": 8.957444996451384e-05, + "loss": 0.01690336912870407, + "step": 73480 + }, + { + "epoch": 10.43151171043293, + "grad_norm": 3.34399676322937, + "learning_rate": 8.957303051809794e-05, + "loss": 0.011198329925537109, + "step": 73490 + }, + { + "epoch": 10.432931156848829, + "grad_norm": 2.572981119155884, + "learning_rate": 8.957161107168205e-05, + "loss": 0.00565105676651001, + "step": 73500 + }, + { + "epoch": 10.432931156848829, + "eval_accuracy": 0.9844216951739048, + "eval_loss": 0.05566272512078285, + "eval_runtime": 33.2646, + "eval_samples_per_second": 472.785, + "eval_steps_per_second": 14.791, + "step": 73500 + }, + { + "epoch": 10.434350603264727, + "grad_norm": 0.12054192274808884, + "learning_rate": 8.957019162526615e-05, + "loss": 0.05910096168518066, + "step": 73510 + }, + { + "epoch": 10.435770049680624, + "grad_norm": 0.16370511054992676, + "learning_rate": 8.956877217885026e-05, + "loss": 0.032081523537635805, + "step": 73520 + }, + { + "epoch": 10.437189496096522, + "grad_norm": 0.29961004853248596, + "learning_rate": 8.956735273243436e-05, + "loss": 0.007783429324626922, + 
"step": 73530 + }, + { + "epoch": 10.43860894251242, + "grad_norm": 2.4290831089019775, + "learning_rate": 8.956593328601845e-05, + "loss": 0.023958057165145874, + "step": 73540 + }, + { + "epoch": 10.440028388928319, + "grad_norm": 6.39838171005249, + "learning_rate": 8.956451383960257e-05, + "loss": 0.045057627558708194, + "step": 73550 + }, + { + "epoch": 10.441447835344215, + "grad_norm": 1.2620060443878174, + "learning_rate": 8.956309439318666e-05, + "loss": 0.010841131955385209, + "step": 73560 + }, + { + "epoch": 10.442867281760114, + "grad_norm": 0.03583799675107002, + "learning_rate": 8.956167494677077e-05, + "loss": 0.026120901107788086, + "step": 73570 + }, + { + "epoch": 10.444286728176012, + "grad_norm": 9.077432632446289, + "learning_rate": 8.956025550035486e-05, + "loss": 0.05460309386253357, + "step": 73580 + }, + { + "epoch": 10.445706174591908, + "grad_norm": 1.1113617420196533, + "learning_rate": 8.955883605393897e-05, + "loss": 0.04970341622829437, + "step": 73590 + }, + { + "epoch": 10.447125621007807, + "grad_norm": 2.5755014419555664, + "learning_rate": 8.955741660752307e-05, + "loss": 0.050418293476104735, + "step": 73600 + }, + { + "epoch": 10.448545067423705, + "grad_norm": 1.6536363363265991, + "learning_rate": 8.955599716110718e-05, + "loss": 0.03822358548641205, + "step": 73610 + }, + { + "epoch": 10.449964513839603, + "grad_norm": 15.947848320007324, + "learning_rate": 8.955457771469127e-05, + "loss": 0.051361745595932005, + "step": 73620 + }, + { + "epoch": 10.4513839602555, + "grad_norm": 5.572516441345215, + "learning_rate": 8.955315826827537e-05, + "loss": 0.024033012986183166, + "step": 73630 + }, + { + "epoch": 10.452803406671398, + "grad_norm": 4.491860866546631, + "learning_rate": 8.955173882185948e-05, + "loss": 0.05285232663154602, + "step": 73640 + }, + { + "epoch": 10.454222853087296, + "grad_norm": 2.209595203399658, + "learning_rate": 8.955031937544358e-05, + "loss": 0.010080764442682267, + "step": 73650 + }, + { + 
"epoch": 10.455642299503193, + "grad_norm": 7.413250923156738, + "learning_rate": 8.954889992902769e-05, + "loss": 0.05623313784599304, + "step": 73660 + }, + { + "epoch": 10.457061745919091, + "grad_norm": 5.551499366760254, + "learning_rate": 8.954748048261179e-05, + "loss": 0.022087934613227844, + "step": 73670 + }, + { + "epoch": 10.45848119233499, + "grad_norm": 0.6603949666023254, + "learning_rate": 8.954606103619589e-05, + "loss": 0.06099636554718017, + "step": 73680 + }, + { + "epoch": 10.459900638750888, + "grad_norm": 1.2046256065368652, + "learning_rate": 8.954464158977998e-05, + "loss": 0.04031319320201874, + "step": 73690 + }, + { + "epoch": 10.461320085166784, + "grad_norm": 2.295785427093506, + "learning_rate": 8.95432221433641e-05, + "loss": 0.0229206383228302, + "step": 73700 + }, + { + "epoch": 10.462739531582683, + "grad_norm": 1.1245261430740356, + "learning_rate": 8.954180269694819e-05, + "loss": 0.03432927429676056, + "step": 73710 + }, + { + "epoch": 10.464158977998581, + "grad_norm": 0.25341543555259705, + "learning_rate": 8.95403832505323e-05, + "loss": 0.011335819959640503, + "step": 73720 + }, + { + "epoch": 10.465578424414478, + "grad_norm": 1.2831389904022217, + "learning_rate": 8.95389638041164e-05, + "loss": 0.02653484046459198, + "step": 73730 + }, + { + "epoch": 10.466997870830376, + "grad_norm": 3.338085412979126, + "learning_rate": 8.95375443577005e-05, + "loss": 0.0467991441488266, + "step": 73740 + }, + { + "epoch": 10.468417317246274, + "grad_norm": 10.528369903564453, + "learning_rate": 8.953612491128461e-05, + "loss": 0.10263023376464844, + "step": 73750 + }, + { + "epoch": 10.469836763662173, + "grad_norm": 0.6761602759361267, + "learning_rate": 8.95347054648687e-05, + "loss": 0.010650408267974854, + "step": 73760 + }, + { + "epoch": 10.471256210078069, + "grad_norm": 0.20647557079792023, + "learning_rate": 8.953328601845282e-05, + "loss": 0.019610205292701723, + "step": 73770 + }, + { + "epoch": 10.472675656493967, + 
"grad_norm": 1.0002400875091553, + "learning_rate": 8.953186657203691e-05, + "loss": 0.018295831978321075, + "step": 73780 + }, + { + "epoch": 10.474095102909866, + "grad_norm": 0.8303980231285095, + "learning_rate": 8.953044712562101e-05, + "loss": 0.0178376168012619, + "step": 73790 + }, + { + "epoch": 10.475514549325762, + "grad_norm": 6.3407487869262695, + "learning_rate": 8.952902767920511e-05, + "loss": 0.06772493124008179, + "step": 73800 + }, + { + "epoch": 10.47693399574166, + "grad_norm": 4.017003536224365, + "learning_rate": 8.952760823278922e-05, + "loss": 0.04538638293743134, + "step": 73810 + }, + { + "epoch": 10.478353442157559, + "grad_norm": 8.43114185333252, + "learning_rate": 8.952618878637332e-05, + "loss": 0.03319251537322998, + "step": 73820 + }, + { + "epoch": 10.479772888573457, + "grad_norm": 2.7893521785736084, + "learning_rate": 8.952476933995743e-05, + "loss": 0.028853052854537965, + "step": 73830 + }, + { + "epoch": 10.481192334989354, + "grad_norm": 9.185164451599121, + "learning_rate": 8.952334989354153e-05, + "loss": 0.06574128866195679, + "step": 73840 + }, + { + "epoch": 10.482611781405252, + "grad_norm": 0.7617444396018982, + "learning_rate": 8.952193044712562e-05, + "loss": 0.04429103434085846, + "step": 73850 + }, + { + "epoch": 10.48403122782115, + "grad_norm": 2.2630093097686768, + "learning_rate": 8.952051100070973e-05, + "loss": 0.02493145763874054, + "step": 73860 + }, + { + "epoch": 10.485450674237047, + "grad_norm": 4.28364896774292, + "learning_rate": 8.951909155429383e-05, + "loss": 0.09974995851516724, + "step": 73870 + }, + { + "epoch": 10.486870120652945, + "grad_norm": 2.674414873123169, + "learning_rate": 8.951767210787794e-05, + "loss": 0.0344027042388916, + "step": 73880 + }, + { + "epoch": 10.488289567068843, + "grad_norm": 6.765301704406738, + "learning_rate": 8.951625266146203e-05, + "loss": 0.025649476051330566, + "step": 73890 + }, + { + "epoch": 10.489709013484742, + "grad_norm": 0.6529340744018555, + 
"learning_rate": 8.951483321504614e-05, + "loss": 0.00923079326748848, + "step": 73900 + }, + { + "epoch": 10.491128459900638, + "grad_norm": 0.058388613164424896, + "learning_rate": 8.951341376863023e-05, + "loss": 0.021965248882770537, + "step": 73910 + }, + { + "epoch": 10.492547906316537, + "grad_norm": 3.1133265495300293, + "learning_rate": 8.951199432221434e-05, + "loss": 0.014603239297866822, + "step": 73920 + }, + { + "epoch": 10.493967352732435, + "grad_norm": 4.840022087097168, + "learning_rate": 8.951057487579844e-05, + "loss": 0.04172016680240631, + "step": 73930 + }, + { + "epoch": 10.495386799148331, + "grad_norm": 3.6252007484436035, + "learning_rate": 8.950915542938254e-05, + "loss": 0.0229099839925766, + "step": 73940 + }, + { + "epoch": 10.49680624556423, + "grad_norm": 0.875785768032074, + "learning_rate": 8.950773598296665e-05, + "loss": 0.0125528022646904, + "step": 73950 + }, + { + "epoch": 10.498225691980128, + "grad_norm": 2.1686460971832275, + "learning_rate": 8.950631653655075e-05, + "loss": 0.01959478259086609, + "step": 73960 + }, + { + "epoch": 10.499645138396026, + "grad_norm": 2.5842530727386475, + "learning_rate": 8.950489709013486e-05, + "loss": 0.01581704616546631, + "step": 73970 + }, + { + "epoch": 10.501064584811923, + "grad_norm": 0.9878895878791809, + "learning_rate": 8.950347764371896e-05, + "loss": 0.029901912808418273, + "step": 73980 + }, + { + "epoch": 10.502484031227821, + "grad_norm": 0.5379632115364075, + "learning_rate": 8.950205819730305e-05, + "loss": 0.035832139849662784, + "step": 73990 + }, + { + "epoch": 10.50390347764372, + "grad_norm": 0.46223098039627075, + "learning_rate": 8.950063875088715e-05, + "loss": 0.02706916332244873, + "step": 74000 + }, + { + "epoch": 10.50390347764372, + "eval_accuracy": 0.9834679214090418, + "eval_loss": 0.05921155586838722, + "eval_runtime": 32.9761, + "eval_samples_per_second": 476.921, + "eval_steps_per_second": 14.92, + "step": 74000 + }, + { + "epoch": 10.505322924059616, + 
"grad_norm": 0.26483798027038574, + "learning_rate": 8.949921930447126e-05, + "loss": 0.01467936635017395, + "step": 74010 + }, + { + "epoch": 10.506742370475514, + "grad_norm": 0.147200345993042, + "learning_rate": 8.949779985805536e-05, + "loss": 0.012765195965766907, + "step": 74020 + }, + { + "epoch": 10.508161816891413, + "grad_norm": 1.744972586631775, + "learning_rate": 8.949638041163947e-05, + "loss": 0.06021806001663208, + "step": 74030 + }, + { + "epoch": 10.509581263307311, + "grad_norm": 8.56665325164795, + "learning_rate": 8.949496096522357e-05, + "loss": 0.05351743698120117, + "step": 74040 + }, + { + "epoch": 10.511000709723207, + "grad_norm": 0.1041315421462059, + "learning_rate": 8.949354151880767e-05, + "loss": 0.04573552906513214, + "step": 74050 + }, + { + "epoch": 10.512420156139106, + "grad_norm": 5.440675258636475, + "learning_rate": 8.949212207239178e-05, + "loss": 0.04139101505279541, + "step": 74060 + }, + { + "epoch": 10.513839602555004, + "grad_norm": 2.8024065494537354, + "learning_rate": 8.949070262597587e-05, + "loss": 0.06140284538269043, + "step": 74070 + }, + { + "epoch": 10.5152590489709, + "grad_norm": 10.5914945602417, + "learning_rate": 8.948928317955998e-05, + "loss": 0.07495509386062622, + "step": 74080 + }, + { + "epoch": 10.516678495386799, + "grad_norm": 1.458267331123352, + "learning_rate": 8.948786373314407e-05, + "loss": 0.0535413384437561, + "step": 74090 + }, + { + "epoch": 10.518097941802697, + "grad_norm": 2.3449082374572754, + "learning_rate": 8.948644428672818e-05, + "loss": 0.04176829755306244, + "step": 74100 + }, + { + "epoch": 10.519517388218595, + "grad_norm": 0.42052292823791504, + "learning_rate": 8.948502484031228e-05, + "loss": 0.011891970038414001, + "step": 74110 + }, + { + "epoch": 10.520936834634492, + "grad_norm": 7.065118312835693, + "learning_rate": 8.948360539389639e-05, + "loss": 0.052879738807678225, + "step": 74120 + }, + { + "epoch": 10.52235628105039, + "grad_norm": 0.9390953183174133, + 
"learning_rate": 8.948218594748048e-05, + "loss": 0.04085269868373871, + "step": 74130 + }, + { + "epoch": 10.523775727466289, + "grad_norm": 0.5752598643302917, + "learning_rate": 8.94807665010646e-05, + "loss": 0.025446805357933044, + "step": 74140 + }, + { + "epoch": 10.525195173882185, + "grad_norm": 5.54420804977417, + "learning_rate": 8.947934705464869e-05, + "loss": 0.07761828303337097, + "step": 74150 + }, + { + "epoch": 10.526614620298083, + "grad_norm": 8.482158660888672, + "learning_rate": 8.947792760823279e-05, + "loss": 0.015085341036319732, + "step": 74160 + }, + { + "epoch": 10.528034066713982, + "grad_norm": 0.253826767206192, + "learning_rate": 8.94765081618169e-05, + "loss": 0.05122783184051514, + "step": 74170 + }, + { + "epoch": 10.52945351312988, + "grad_norm": 10.227400779724121, + "learning_rate": 8.9475088715401e-05, + "loss": 0.06506451368331909, + "step": 74180 + }, + { + "epoch": 10.530872959545777, + "grad_norm": 6.364181995391846, + "learning_rate": 8.947366926898511e-05, + "loss": 0.012909649312496186, + "step": 74190 + }, + { + "epoch": 10.532292405961675, + "grad_norm": 1.6474579572677612, + "learning_rate": 8.94722498225692e-05, + "loss": 0.058185654878616336, + "step": 74200 + }, + { + "epoch": 10.533711852377573, + "grad_norm": 0.05176066979765892, + "learning_rate": 8.94708303761533e-05, + "loss": 0.024706798791885375, + "step": 74210 + }, + { + "epoch": 10.53513129879347, + "grad_norm": 0.15353232622146606, + "learning_rate": 8.94694109297374e-05, + "loss": 0.010836786031723023, + "step": 74220 + }, + { + "epoch": 10.536550745209368, + "grad_norm": 1.6580612659454346, + "learning_rate": 8.946799148332151e-05, + "loss": 0.018438754975795744, + "step": 74230 + }, + { + "epoch": 10.537970191625266, + "grad_norm": 0.47383174300193787, + "learning_rate": 8.946657203690561e-05, + "loss": 0.019079934060573577, + "step": 74240 + }, + { + "epoch": 10.539389638041165, + "grad_norm": 0.06103862076997757, + "learning_rate": 
8.946515259048971e-05, + "loss": 0.017451618611812592, + "step": 74250 + }, + { + "epoch": 10.540809084457061, + "grad_norm": 0.028250914067029953, + "learning_rate": 8.946373314407382e-05, + "loss": 0.018591858446598053, + "step": 74260 + }, + { + "epoch": 10.54222853087296, + "grad_norm": 0.04944806545972824, + "learning_rate": 8.946231369765792e-05, + "loss": 0.026098889112472535, + "step": 74270 + }, + { + "epoch": 10.543647977288858, + "grad_norm": 0.3458721935749054, + "learning_rate": 8.946089425124203e-05, + "loss": 0.046861696243286136, + "step": 74280 + }, + { + "epoch": 10.545067423704754, + "grad_norm": 1.2632274627685547, + "learning_rate": 8.945947480482612e-05, + "loss": 0.017438746988773346, + "step": 74290 + }, + { + "epoch": 10.546486870120653, + "grad_norm": 1.1462650299072266, + "learning_rate": 8.945805535841022e-05, + "loss": 0.018086281418800355, + "step": 74300 + }, + { + "epoch": 10.547906316536551, + "grad_norm": 2.7778356075286865, + "learning_rate": 8.945663591199432e-05, + "loss": 0.04267987906932831, + "step": 74310 + }, + { + "epoch": 10.54932576295245, + "grad_norm": 1.8293638229370117, + "learning_rate": 8.945521646557843e-05, + "loss": 0.059062355756759645, + "step": 74320 + }, + { + "epoch": 10.550745209368346, + "grad_norm": 7.1552019119262695, + "learning_rate": 8.945379701916253e-05, + "loss": 0.04298066794872284, + "step": 74330 + }, + { + "epoch": 10.552164655784244, + "grad_norm": 6.903208255767822, + "learning_rate": 8.945237757274664e-05, + "loss": 0.020708955824375153, + "step": 74340 + }, + { + "epoch": 10.553584102200142, + "grad_norm": 0.5696495175361633, + "learning_rate": 8.945095812633074e-05, + "loss": 0.027511507272720337, + "step": 74350 + }, + { + "epoch": 10.555003548616039, + "grad_norm": 1.9177380800247192, + "learning_rate": 8.944953867991483e-05, + "loss": 0.016340428590774538, + "step": 74360 + }, + { + "epoch": 10.556422995031937, + "grad_norm": 0.17848406732082367, + "learning_rate": 
8.944811923349894e-05, + "loss": 0.04354447424411774, + "step": 74370 + }, + { + "epoch": 10.557842441447836, + "grad_norm": 5.381374835968018, + "learning_rate": 8.944669978708304e-05, + "loss": 0.03182124495506287, + "step": 74380 + }, + { + "epoch": 10.559261887863734, + "grad_norm": 4.705573558807373, + "learning_rate": 8.944528034066715e-05, + "loss": 0.029999750852584838, + "step": 74390 + }, + { + "epoch": 10.56068133427963, + "grad_norm": 0.5373235940933228, + "learning_rate": 8.944386089425124e-05, + "loss": 0.03394618630409241, + "step": 74400 + }, + { + "epoch": 10.562100780695529, + "grad_norm": 0.5422061681747437, + "learning_rate": 8.944244144783535e-05, + "loss": 0.01418820470571518, + "step": 74410 + }, + { + "epoch": 10.563520227111427, + "grad_norm": 5.353031635284424, + "learning_rate": 8.944102200141944e-05, + "loss": 0.017679476737976076, + "step": 74420 + }, + { + "epoch": 10.564939673527324, + "grad_norm": 0.4547473192214966, + "learning_rate": 8.943960255500356e-05, + "loss": 0.029423204064369202, + "step": 74430 + }, + { + "epoch": 10.566359119943222, + "grad_norm": 6.3978166580200195, + "learning_rate": 8.943818310858765e-05, + "loss": 0.07245333194732666, + "step": 74440 + }, + { + "epoch": 10.56777856635912, + "grad_norm": 0.7407926321029663, + "learning_rate": 8.943676366217175e-05, + "loss": 0.03220914006233215, + "step": 74450 + }, + { + "epoch": 10.569198012775018, + "grad_norm": 6.225142478942871, + "learning_rate": 8.943548616039745e-05, + "loss": 0.021983492374420165, + "step": 74460 + }, + { + "epoch": 10.570617459190915, + "grad_norm": 0.22022587060928345, + "learning_rate": 8.943406671398156e-05, + "loss": 0.017795734107494354, + "step": 74470 + }, + { + "epoch": 10.572036905606813, + "grad_norm": 5.597069263458252, + "learning_rate": 8.943264726756566e-05, + "loss": 0.01740722209215164, + "step": 74480 + }, + { + "epoch": 10.573456352022712, + "grad_norm": 1.1836518049240112, + "learning_rate": 8.943122782114975e-05, + "loss": 
0.03079001307487488, + "step": 74490 + }, + { + "epoch": 10.574875798438608, + "grad_norm": 15.553241729736328, + "learning_rate": 8.942980837473386e-05, + "loss": 0.07134093046188354, + "step": 74500 + }, + { + "epoch": 10.574875798438608, + "eval_accuracy": 0.9783175430787817, + "eval_loss": 0.07355938851833344, + "eval_runtime": 33.2908, + "eval_samples_per_second": 472.412, + "eval_steps_per_second": 14.779, + "step": 74500 + }, + { + "epoch": 10.576295244854506, + "grad_norm": 13.643026351928711, + "learning_rate": 8.942838892831796e-05, + "loss": 0.09327298998832703, + "step": 74510 + }, + { + "epoch": 10.577714691270405, + "grad_norm": 9.596179008483887, + "learning_rate": 8.942696948190207e-05, + "loss": 0.04536471366882324, + "step": 74520 + }, + { + "epoch": 10.579134137686303, + "grad_norm": 1.495818853378296, + "learning_rate": 8.942555003548616e-05, + "loss": 0.019459769129753113, + "step": 74530 + }, + { + "epoch": 10.5805535841022, + "grad_norm": 7.002575397491455, + "learning_rate": 8.942413058907027e-05, + "loss": 0.02139394134283066, + "step": 74540 + }, + { + "epoch": 10.581973030518098, + "grad_norm": 0.691582977771759, + "learning_rate": 8.942271114265437e-05, + "loss": 0.04519861042499542, + "step": 74550 + }, + { + "epoch": 10.583392476933996, + "grad_norm": 4.451111316680908, + "learning_rate": 8.942129169623848e-05, + "loss": 0.05654475688934326, + "step": 74560 + }, + { + "epoch": 10.584811923349893, + "grad_norm": 1.640137791633606, + "learning_rate": 8.941987224982257e-05, + "loss": 0.06334338784217834, + "step": 74570 + }, + { + "epoch": 10.586231369765791, + "grad_norm": 0.7545045614242554, + "learning_rate": 8.941845280340667e-05, + "loss": 0.02559314966201782, + "step": 74580 + }, + { + "epoch": 10.58765081618169, + "grad_norm": 2.474801540374756, + "learning_rate": 8.941703335699078e-05, + "loss": 0.048035275936126706, + "step": 74590 + }, + { + "epoch": 10.589070262597588, + "grad_norm": 0.6320196986198425, + "learning_rate": 
8.941561391057488e-05, + "loss": 0.01585846096277237, + "step": 74600 + }, + { + "epoch": 10.590489709013484, + "grad_norm": 1.918588399887085, + "learning_rate": 8.941419446415899e-05, + "loss": 0.020170879364013673, + "step": 74610 + }, + { + "epoch": 10.591909155429382, + "grad_norm": 1.1940841674804688, + "learning_rate": 8.941277501774309e-05, + "loss": 0.06623907089233398, + "step": 74620 + }, + { + "epoch": 10.59332860184528, + "grad_norm": 8.052373886108398, + "learning_rate": 8.941135557132718e-05, + "loss": 0.03799133002758026, + "step": 74630 + }, + { + "epoch": 10.594748048261177, + "grad_norm": 0.05093760788440704, + "learning_rate": 8.940993612491128e-05, + "loss": 0.058805429935455324, + "step": 74640 + }, + { + "epoch": 10.596167494677076, + "grad_norm": 1.0790239572525024, + "learning_rate": 8.94085166784954e-05, + "loss": 0.02620888352394104, + "step": 74650 + }, + { + "epoch": 10.597586941092974, + "grad_norm": 0.16304579377174377, + "learning_rate": 8.940709723207949e-05, + "loss": 0.0440018504858017, + "step": 74660 + }, + { + "epoch": 10.599006387508872, + "grad_norm": 9.762938499450684, + "learning_rate": 8.94056777856636e-05, + "loss": 0.01490759253501892, + "step": 74670 + }, + { + "epoch": 10.600425833924769, + "grad_norm": 6.83378267288208, + "learning_rate": 8.94042583392477e-05, + "loss": 0.022882431745529175, + "step": 74680 + }, + { + "epoch": 10.601845280340667, + "grad_norm": 5.488767147064209, + "learning_rate": 8.94028388928318e-05, + "loss": 0.01500108242034912, + "step": 74690 + }, + { + "epoch": 10.603264726756565, + "grad_norm": 3.6321401596069336, + "learning_rate": 8.940141944641591e-05, + "loss": 0.017778295278549194, + "step": 74700 + }, + { + "epoch": 10.604684173172462, + "grad_norm": 0.013210363686084747, + "learning_rate": 8.94e-05, + "loss": 0.007808870077133179, + "step": 74710 + }, + { + "epoch": 10.60610361958836, + "grad_norm": 1.6923620700836182, + "learning_rate": 8.939858055358412e-05, + "loss": 
0.03966827690601349, + "step": 74720 + }, + { + "epoch": 10.607523066004259, + "grad_norm": 1.4342869520187378, + "learning_rate": 8.93971611071682e-05, + "loss": 0.013448211550712585, + "step": 74730 + }, + { + "epoch": 10.608942512420157, + "grad_norm": 0.07803019136190414, + "learning_rate": 8.939574166075231e-05, + "loss": 0.03317614197731018, + "step": 74740 + }, + { + "epoch": 10.610361958836053, + "grad_norm": 0.46355992555618286, + "learning_rate": 8.939432221433641e-05, + "loss": 0.01559600830078125, + "step": 74750 + }, + { + "epoch": 10.611781405251952, + "grad_norm": 18.4334659576416, + "learning_rate": 8.939290276792052e-05, + "loss": 0.06782117486000061, + "step": 74760 + }, + { + "epoch": 10.61320085166785, + "grad_norm": 1.2076555490493774, + "learning_rate": 8.939148332150462e-05, + "loss": 0.020013023912906647, + "step": 74770 + }, + { + "epoch": 10.614620298083747, + "grad_norm": 0.34689363837242126, + "learning_rate": 8.939006387508871e-05, + "loss": 0.040333092212677, + "step": 74780 + }, + { + "epoch": 10.616039744499645, + "grad_norm": 3.217885971069336, + "learning_rate": 8.938864442867282e-05, + "loss": 0.038464948534965515, + "step": 74790 + }, + { + "epoch": 10.617459190915543, + "grad_norm": 0.19117842614650726, + "learning_rate": 8.938722498225692e-05, + "loss": 0.025781697034835814, + "step": 74800 + }, + { + "epoch": 10.618878637331441, + "grad_norm": 0.020094774663448334, + "learning_rate": 8.938580553584103e-05, + "loss": 0.035686278343200685, + "step": 74810 + }, + { + "epoch": 10.620298083747338, + "grad_norm": 0.07451222836971283, + "learning_rate": 8.938438608942513e-05, + "loss": 0.06001535654067993, + "step": 74820 + }, + { + "epoch": 10.621717530163236, + "grad_norm": 2.8348300457000732, + "learning_rate": 8.938296664300924e-05, + "loss": 0.014533805847167968, + "step": 74830 + }, + { + "epoch": 10.623136976579135, + "grad_norm": 1.1169466972351074, + "learning_rate": 8.938154719659332e-05, + "loss": 0.028077208995819093, + 
"step": 74840 + }, + { + "epoch": 10.624556422995031, + "grad_norm": 0.6225524544715881, + "learning_rate": 8.938012775017744e-05, + "loss": 0.012209897488355636, + "step": 74850 + }, + { + "epoch": 10.62597586941093, + "grad_norm": 0.010887812823057175, + "learning_rate": 8.937870830376153e-05, + "loss": 0.03707170486450195, + "step": 74860 + }, + { + "epoch": 10.627395315826828, + "grad_norm": 0.1151101142168045, + "learning_rate": 8.937728885734564e-05, + "loss": 0.03600144684314728, + "step": 74870 + }, + { + "epoch": 10.628814762242726, + "grad_norm": 8.231071472167969, + "learning_rate": 8.937586941092974e-05, + "loss": 0.053849917650222776, + "step": 74880 + }, + { + "epoch": 10.630234208658623, + "grad_norm": 1.1308414936065674, + "learning_rate": 8.937444996451384e-05, + "loss": 0.02963399589061737, + "step": 74890 + }, + { + "epoch": 10.63165365507452, + "grad_norm": 0.0716499611735344, + "learning_rate": 8.937303051809795e-05, + "loss": 0.018058374524116516, + "step": 74900 + }, + { + "epoch": 10.63307310149042, + "grad_norm": 3.5925307273864746, + "learning_rate": 8.937161107168205e-05, + "loss": 0.02541220486164093, + "step": 74910 + }, + { + "epoch": 10.634492547906316, + "grad_norm": 0.12991972267627716, + "learning_rate": 8.937019162526616e-05, + "loss": 0.03352646231651306, + "step": 74920 + }, + { + "epoch": 10.635911994322214, + "grad_norm": 9.830734252929688, + "learning_rate": 8.936877217885026e-05, + "loss": 0.06148951053619385, + "step": 74930 + }, + { + "epoch": 10.637331440738112, + "grad_norm": 1.4155148267745972, + "learning_rate": 8.936735273243435e-05, + "loss": 0.015182647109031677, + "step": 74940 + }, + { + "epoch": 10.63875088715401, + "grad_norm": 0.11837710440158844, + "learning_rate": 8.936593328601845e-05, + "loss": 0.016064786911010744, + "step": 74950 + }, + { + "epoch": 10.640170333569907, + "grad_norm": 1.7777584791183472, + "learning_rate": 8.936451383960256e-05, + "loss": 0.040622872114181516, + "step": 74960 + }, + { + 
"epoch": 10.641589779985805, + "grad_norm": 0.8232346773147583, + "learning_rate": 8.936309439318666e-05, + "loss": 0.02240632474422455, + "step": 74970 + }, + { + "epoch": 10.643009226401704, + "grad_norm": 0.6091140508651733, + "learning_rate": 8.936167494677077e-05, + "loss": 0.01667594611644745, + "step": 74980 + }, + { + "epoch": 10.6444286728176, + "grad_norm": 0.2595471441745758, + "learning_rate": 8.936025550035487e-05, + "loss": 0.022316190600395202, + "step": 74990 + }, + { + "epoch": 10.645848119233499, + "grad_norm": 0.0824456438422203, + "learning_rate": 8.935883605393896e-05, + "loss": 0.04005849361419678, + "step": 75000 + }, + { + "epoch": 10.645848119233499, + "eval_accuracy": 0.9822598079735487, + "eval_loss": 0.05868366360664368, + "eval_runtime": 35.5358, + "eval_samples_per_second": 442.568, + "eval_steps_per_second": 13.845, + "step": 75000 + }, + { + "epoch": 10.647267565649397, + "grad_norm": 0.06293433904647827, + "learning_rate": 8.935741660752307e-05, + "loss": 0.01167212575674057, + "step": 75010 + }, + { + "epoch": 10.648687012065295, + "grad_norm": 3.401526927947998, + "learning_rate": 8.935599716110717e-05, + "loss": 0.02366384267807007, + "step": 75020 + }, + { + "epoch": 10.650106458481192, + "grad_norm": 1.3368604183197021, + "learning_rate": 8.935457771469128e-05, + "loss": 0.030054858326911925, + "step": 75030 + }, + { + "epoch": 10.65152590489709, + "grad_norm": 0.2567216157913208, + "learning_rate": 8.935315826827537e-05, + "loss": 0.009073252975940704, + "step": 75040 + }, + { + "epoch": 10.652945351312988, + "grad_norm": 6.668602466583252, + "learning_rate": 8.935173882185948e-05, + "loss": 0.02904718816280365, + "step": 75050 + }, + { + "epoch": 10.654364797728885, + "grad_norm": 0.33078089356422424, + "learning_rate": 8.935031937544358e-05, + "loss": 0.006366993486881256, + "step": 75060 + }, + { + "epoch": 10.655784244144783, + "grad_norm": 0.9511492252349854, + "learning_rate": 8.934889992902769e-05, + "loss": 
0.05331156253814697, + "step": 75070 + }, + { + "epoch": 10.657203690560682, + "grad_norm": 9.841053009033203, + "learning_rate": 8.934748048261178e-05, + "loss": 0.04374881982803345, + "step": 75080 + }, + { + "epoch": 10.65862313697658, + "grad_norm": 1.8360716104507446, + "learning_rate": 8.934606103619588e-05, + "loss": 0.05705385804176331, + "step": 75090 + }, + { + "epoch": 10.660042583392476, + "grad_norm": 6.725709915161133, + "learning_rate": 8.934464158977999e-05, + "loss": 0.023094192147254944, + "step": 75100 + }, + { + "epoch": 10.661462029808375, + "grad_norm": 0.10598300397396088, + "learning_rate": 8.934322214336409e-05, + "loss": 0.011065666377544404, + "step": 75110 + }, + { + "epoch": 10.662881476224273, + "grad_norm": 0.052498895674943924, + "learning_rate": 8.93418026969482e-05, + "loss": 0.00460980124771595, + "step": 75120 + }, + { + "epoch": 10.66430092264017, + "grad_norm": 0.47180619835853577, + "learning_rate": 8.93403832505323e-05, + "loss": 0.02264900803565979, + "step": 75130 + }, + { + "epoch": 10.665720369056068, + "grad_norm": 1.3694490194320679, + "learning_rate": 8.93389638041164e-05, + "loss": 0.05580574870109558, + "step": 75140 + }, + { + "epoch": 10.667139815471966, + "grad_norm": 1.9044922590255737, + "learning_rate": 8.933754435770049e-05, + "loss": 0.012723922729492188, + "step": 75150 + }, + { + "epoch": 10.668559261887864, + "grad_norm": 0.10306615382432938, + "learning_rate": 8.93361249112846e-05, + "loss": 0.03403681516647339, + "step": 75160 + }, + { + "epoch": 10.669978708303761, + "grad_norm": 0.0405077300965786, + "learning_rate": 8.93347054648687e-05, + "loss": 0.047677081823348996, + "step": 75170 + }, + { + "epoch": 10.67139815471966, + "grad_norm": 1.0609707832336426, + "learning_rate": 8.933328601845281e-05, + "loss": 0.015549467504024505, + "step": 75180 + }, + { + "epoch": 10.672817601135558, + "grad_norm": 3.3458545207977295, + "learning_rate": 8.933186657203691e-05, + "loss": 0.016075128316879274, + "step": 
75190 + }, + { + "epoch": 10.674237047551454, + "grad_norm": 3.9271702766418457, + "learning_rate": 8.9330447125621e-05, + "loss": 0.044463536143302916, + "step": 75200 + }, + { + "epoch": 10.675656493967352, + "grad_norm": 0.6383125185966492, + "learning_rate": 8.932902767920512e-05, + "loss": 0.059473490715026854, + "step": 75210 + }, + { + "epoch": 10.67707594038325, + "grad_norm": 0.0646110400557518, + "learning_rate": 8.932760823278921e-05, + "loss": 0.005915617942810059, + "step": 75220 + }, + { + "epoch": 10.678495386799149, + "grad_norm": 1.6763479709625244, + "learning_rate": 8.932618878637333e-05, + "loss": 0.08584545850753784, + "step": 75230 + }, + { + "epoch": 10.679914833215046, + "grad_norm": 2.803025484085083, + "learning_rate": 8.932476933995742e-05, + "loss": 0.012907765805721283, + "step": 75240 + }, + { + "epoch": 10.681334279630944, + "grad_norm": 4.714158535003662, + "learning_rate": 8.932334989354152e-05, + "loss": 0.014616544544696807, + "step": 75250 + }, + { + "epoch": 10.682753726046842, + "grad_norm": 6.278298854827881, + "learning_rate": 8.932193044712562e-05, + "loss": 0.08344311714172363, + "step": 75260 + }, + { + "epoch": 10.684173172462739, + "grad_norm": 0.2862883508205414, + "learning_rate": 8.932051100070973e-05, + "loss": 0.0512550950050354, + "step": 75270 + }, + { + "epoch": 10.685592618878637, + "grad_norm": 1.354404091835022, + "learning_rate": 8.931909155429383e-05, + "loss": 0.0363610714673996, + "step": 75280 + }, + { + "epoch": 10.687012065294535, + "grad_norm": 2.309035062789917, + "learning_rate": 8.931767210787794e-05, + "loss": 0.033329719305038454, + "step": 75290 + }, + { + "epoch": 10.688431511710434, + "grad_norm": 0.763126015663147, + "learning_rate": 8.931625266146203e-05, + "loss": 0.00558549165725708, + "step": 75300 + }, + { + "epoch": 10.68985095812633, + "grad_norm": 2.8885669708251953, + "learning_rate": 8.931483321504613e-05, + "loss": 0.022483193874359132, + "step": 75310 + }, + { + "epoch": 
10.691270404542228, + "grad_norm": 1.614470362663269, + "learning_rate": 8.931341376863024e-05, + "loss": 0.046516886353492735, + "step": 75320 + }, + { + "epoch": 10.692689850958127, + "grad_norm": 0.4518176019191742, + "learning_rate": 8.931199432221434e-05, + "loss": 0.016953733563423157, + "step": 75330 + }, + { + "epoch": 10.694109297374023, + "grad_norm": 0.07498191297054291, + "learning_rate": 8.931057487579845e-05, + "loss": 0.0493558257818222, + "step": 75340 + }, + { + "epoch": 10.695528743789922, + "grad_norm": 1.2771762609481812, + "learning_rate": 8.930915542938253e-05, + "loss": 0.033792906999588014, + "step": 75350 + }, + { + "epoch": 10.69694819020582, + "grad_norm": 5.1456403732299805, + "learning_rate": 8.930773598296665e-05, + "loss": 0.0450606644153595, + "step": 75360 + }, + { + "epoch": 10.698367636621718, + "grad_norm": 16.016571044921875, + "learning_rate": 8.930631653655074e-05, + "loss": 0.060096734762191774, + "step": 75370 + }, + { + "epoch": 10.699787083037615, + "grad_norm": 0.5739765167236328, + "learning_rate": 8.930489709013485e-05, + "loss": 0.09587665796279907, + "step": 75380 + }, + { + "epoch": 10.701206529453513, + "grad_norm": 2.421649217605591, + "learning_rate": 8.930347764371895e-05, + "loss": 0.01479882448911667, + "step": 75390 + }, + { + "epoch": 10.702625975869411, + "grad_norm": 0.6501554846763611, + "learning_rate": 8.930205819730305e-05, + "loss": 0.02887186110019684, + "step": 75400 + }, + { + "epoch": 10.704045422285308, + "grad_norm": 0.08935364335775375, + "learning_rate": 8.930063875088716e-05, + "loss": 0.018288043141365052, + "step": 75410 + }, + { + "epoch": 10.705464868701206, + "grad_norm": 9.896994590759277, + "learning_rate": 8.929921930447126e-05, + "loss": 0.01711161434650421, + "step": 75420 + }, + { + "epoch": 10.706884315117104, + "grad_norm": 4.586911678314209, + "learning_rate": 8.929779985805537e-05, + "loss": 0.004791490733623505, + "step": 75430 + }, + { + "epoch": 10.708303761533003, + 
"grad_norm": 2.6947715282440186, + "learning_rate": 8.929638041163947e-05, + "loss": 0.04513532817363739, + "step": 75440 + }, + { + "epoch": 10.7097232079489, + "grad_norm": 0.541801393032074, + "learning_rate": 8.929496096522356e-05, + "loss": 0.03101123869419098, + "step": 75450 + }, + { + "epoch": 10.711142654364798, + "grad_norm": 1.2876341342926025, + "learning_rate": 8.929354151880766e-05, + "loss": 0.01341709792613983, + "step": 75460 + }, + { + "epoch": 10.712562100780696, + "grad_norm": 4.885845184326172, + "learning_rate": 8.929212207239177e-05, + "loss": 0.007390654087066651, + "step": 75470 + }, + { + "epoch": 10.713981547196592, + "grad_norm": 0.015270525589585304, + "learning_rate": 8.929070262597587e-05, + "loss": 0.015379874408245087, + "step": 75480 + }, + { + "epoch": 10.71540099361249, + "grad_norm": 3.687673568725586, + "learning_rate": 8.928928317955998e-05, + "loss": 0.005444001033902168, + "step": 75490 + }, + { + "epoch": 10.716820440028389, + "grad_norm": 10.40943717956543, + "learning_rate": 8.928786373314408e-05, + "loss": 0.035025835037231445, + "step": 75500 + }, + { + "epoch": 10.716820440028389, + "eval_accuracy": 0.9825777325618363, + "eval_loss": 0.05798032879829407, + "eval_runtime": 35.3118, + "eval_samples_per_second": 445.375, + "eval_steps_per_second": 13.933, + "step": 75500 + }, + { + "epoch": 10.718239886444287, + "grad_norm": 0.14924176037311554, + "learning_rate": 8.928644428672817e-05, + "loss": 0.02595347762107849, + "step": 75510 + }, + { + "epoch": 10.719659332860184, + "grad_norm": 7.112300872802734, + "learning_rate": 8.928502484031229e-05, + "loss": 0.04068517088890076, + "step": 75520 + }, + { + "epoch": 10.721078779276082, + "grad_norm": 0.46678414940834045, + "learning_rate": 8.928360539389638e-05, + "loss": 0.06199964880943298, + "step": 75530 + }, + { + "epoch": 10.72249822569198, + "grad_norm": 7.660800457000732, + "learning_rate": 8.92821859474805e-05, + "loss": 0.06672753095626831, + "step": 75540 + }, + { 
+ "epoch": 10.723917672107877, + "grad_norm": 0.13635525107383728, + "learning_rate": 8.928076650106459e-05, + "loss": 0.0058893729001283646, + "step": 75550 + }, + { + "epoch": 10.725337118523775, + "grad_norm": 0.050406236201524734, + "learning_rate": 8.927934705464869e-05, + "loss": 0.02629355192184448, + "step": 75560 + }, + { + "epoch": 10.726756564939674, + "grad_norm": 2.4663329124450684, + "learning_rate": 8.927792760823279e-05, + "loss": 0.02295355051755905, + "step": 75570 + }, + { + "epoch": 10.728176011355572, + "grad_norm": 1.0391991138458252, + "learning_rate": 8.92765081618169e-05, + "loss": 0.023616223037242888, + "step": 75580 + }, + { + "epoch": 10.729595457771469, + "grad_norm": 13.274736404418945, + "learning_rate": 8.927508871540101e-05, + "loss": 0.07250704765319824, + "step": 75590 + }, + { + "epoch": 10.731014904187367, + "grad_norm": 1.5516211986541748, + "learning_rate": 8.92736692689851e-05, + "loss": 0.027104687690734864, + "step": 75600 + }, + { + "epoch": 10.732434350603265, + "grad_norm": 0.705222487449646, + "learning_rate": 8.92722498225692e-05, + "loss": 0.04327774345874787, + "step": 75610 + }, + { + "epoch": 10.733853797019162, + "grad_norm": 0.7016850709915161, + "learning_rate": 8.92708303761533e-05, + "loss": 0.023375515639781953, + "step": 75620 + }, + { + "epoch": 10.73527324343506, + "grad_norm": 7.4501776695251465, + "learning_rate": 8.926941092973741e-05, + "loss": 0.06602050065994262, + "step": 75630 + }, + { + "epoch": 10.736692689850958, + "grad_norm": 0.02406780607998371, + "learning_rate": 8.926799148332151e-05, + "loss": 0.052145916223526004, + "step": 75640 + }, + { + "epoch": 10.738112136266857, + "grad_norm": 7.1097331047058105, + "learning_rate": 8.926657203690562e-05, + "loss": 0.04959434568881989, + "step": 75650 + }, + { + "epoch": 10.739531582682753, + "grad_norm": 0.3886570632457733, + "learning_rate": 8.92651525904897e-05, + "loss": 0.01884925365447998, + "step": 75660 + }, + { + "epoch": 
10.740951029098651, + "grad_norm": 13.658404350280762, + "learning_rate": 8.926373314407381e-05, + "loss": 0.03599739670753479, + "step": 75670 + }, + { + "epoch": 10.74237047551455, + "grad_norm": 4.648027420043945, + "learning_rate": 8.926231369765792e-05, + "loss": 0.041656050086021426, + "step": 75680 + }, + { + "epoch": 10.743789921930446, + "grad_norm": 0.9928642511367798, + "learning_rate": 8.926089425124202e-05, + "loss": 0.03774539828300476, + "step": 75690 + }, + { + "epoch": 10.745209368346345, + "grad_norm": 0.8404362797737122, + "learning_rate": 8.925947480482613e-05, + "loss": 0.04949051737785339, + "step": 75700 + }, + { + "epoch": 10.746628814762243, + "grad_norm": 0.06839687377214432, + "learning_rate": 8.925805535841022e-05, + "loss": 0.017173881828784942, + "step": 75710 + }, + { + "epoch": 10.748048261178141, + "grad_norm": 0.04905456304550171, + "learning_rate": 8.925663591199433e-05, + "loss": 0.07567955255508423, + "step": 75720 + }, + { + "epoch": 10.749467707594038, + "grad_norm": 0.1964964121580124, + "learning_rate": 8.925521646557842e-05, + "loss": 0.019594097137451173, + "step": 75730 + }, + { + "epoch": 10.750887154009936, + "grad_norm": 2.3201723098754883, + "learning_rate": 8.925379701916254e-05, + "loss": 0.02814412713050842, + "step": 75740 + }, + { + "epoch": 10.752306600425834, + "grad_norm": 9.624185562133789, + "learning_rate": 8.925237757274663e-05, + "loss": 0.02252240777015686, + "step": 75750 + }, + { + "epoch": 10.75372604684173, + "grad_norm": 0.9482161402702332, + "learning_rate": 8.925095812633073e-05, + "loss": 0.011075331270694733, + "step": 75760 + }, + { + "epoch": 10.75514549325763, + "grad_norm": 4.261680603027344, + "learning_rate": 8.924953867991484e-05, + "loss": 0.012068639695644378, + "step": 75770 + }, + { + "epoch": 10.756564939673527, + "grad_norm": 2.8619332313537598, + "learning_rate": 8.924811923349894e-05, + "loss": 0.04735492467880249, + "step": 75780 + }, + { + "epoch": 10.757984386089426, + 
"grad_norm": 0.07442318648099899, + "learning_rate": 8.924669978708305e-05, + "loss": 0.01848388910293579, + "step": 75790 + }, + { + "epoch": 10.759403832505322, + "grad_norm": 0.33886000514030457, + "learning_rate": 8.924528034066715e-05, + "loss": 0.03830481171607971, + "step": 75800 + }, + { + "epoch": 10.76082327892122, + "grad_norm": 4.740257263183594, + "learning_rate": 8.924386089425124e-05, + "loss": 0.01993533968925476, + "step": 75810 + }, + { + "epoch": 10.762242725337119, + "grad_norm": 4.805813789367676, + "learning_rate": 8.924244144783534e-05, + "loss": 0.016506880521774292, + "step": 75820 + }, + { + "epoch": 10.763662171753015, + "grad_norm": 0.7994409203529358, + "learning_rate": 8.924102200141945e-05, + "loss": 0.020026545226573943, + "step": 75830 + }, + { + "epoch": 10.765081618168914, + "grad_norm": 0.1343299001455307, + "learning_rate": 8.923960255500355e-05, + "loss": 0.01588977873325348, + "step": 75840 + }, + { + "epoch": 10.766501064584812, + "grad_norm": 0.33635425567626953, + "learning_rate": 8.923818310858766e-05, + "loss": 0.015015700459480285, + "step": 75850 + }, + { + "epoch": 10.76792051100071, + "grad_norm": 3.677292585372925, + "learning_rate": 8.923676366217176e-05, + "loss": 0.020980848371982573, + "step": 75860 + }, + { + "epoch": 10.769339957416607, + "grad_norm": 1.9598472118377686, + "learning_rate": 8.923534421575586e-05, + "loss": 0.03935782611370087, + "step": 75870 + }, + { + "epoch": 10.770759403832505, + "grad_norm": 6.468889236450195, + "learning_rate": 8.923392476933997e-05, + "loss": 0.038150209188461306, + "step": 75880 + }, + { + "epoch": 10.772178850248403, + "grad_norm": 0.1789119988679886, + "learning_rate": 8.923250532292406e-05, + "loss": 0.03437398672103882, + "step": 75890 + }, + { + "epoch": 10.7735982966643, + "grad_norm": 4.859720230102539, + "learning_rate": 8.923108587650818e-05, + "loss": 0.04559687972068786, + "step": 75900 + }, + { + "epoch": 10.775017743080198, + "grad_norm": 0.1075957715511322, 
+ "learning_rate": 8.922966643009227e-05, + "loss": 0.022505611181259155, + "step": 75910 + }, + { + "epoch": 10.776437189496097, + "grad_norm": 0.7738917469978333, + "learning_rate": 8.922824698367637e-05, + "loss": 0.02040761262178421, + "step": 75920 + }, + { + "epoch": 10.777856635911995, + "grad_norm": 0.00945677887648344, + "learning_rate": 8.922682753726047e-05, + "loss": 0.07506464719772339, + "step": 75930 + }, + { + "epoch": 10.779276082327891, + "grad_norm": 0.06521529704332352, + "learning_rate": 8.922540809084458e-05, + "loss": 0.017863285541534425, + "step": 75940 + }, + { + "epoch": 10.78069552874379, + "grad_norm": 0.05482591688632965, + "learning_rate": 8.922398864442868e-05, + "loss": 0.009694677591323853, + "step": 75950 + }, + { + "epoch": 10.782114975159688, + "grad_norm": 0.3654983341693878, + "learning_rate": 8.922256919801279e-05, + "loss": 0.05228215456008911, + "step": 75960 + }, + { + "epoch": 10.783534421575585, + "grad_norm": 9.811040878295898, + "learning_rate": 8.922114975159688e-05, + "loss": 0.04277914464473724, + "step": 75970 + }, + { + "epoch": 10.784953867991483, + "grad_norm": 8.080113410949707, + "learning_rate": 8.921973030518098e-05, + "loss": 0.06942117214202881, + "step": 75980 + }, + { + "epoch": 10.786373314407381, + "grad_norm": 3.861632823944092, + "learning_rate": 8.921831085876509e-05, + "loss": 0.04788309037685394, + "step": 75990 + }, + { + "epoch": 10.78779276082328, + "grad_norm": 2.738668203353882, + "learning_rate": 8.921689141234919e-05, + "loss": 0.012685440480709076, + "step": 76000 + }, + { + "epoch": 10.78779276082328, + "eval_accuracy": 0.9816875437146309, + "eval_loss": 0.062259022146463394, + "eval_runtime": 33.1136, + "eval_samples_per_second": 474.94, + "eval_steps_per_second": 14.858, + "step": 76000 + }, + { + "epoch": 10.789212207239176, + "grad_norm": 5.251237869262695, + "learning_rate": 8.92154719659333e-05, + "loss": 0.051242268085479735, + "step": 76010 + }, + { + "epoch": 10.790631653655074, 
+ "grad_norm": 1.256506085395813, + "learning_rate": 8.921405251951738e-05, + "loss": 0.02849005162715912, + "step": 76020 + }, + { + "epoch": 10.792051100070973, + "grad_norm": 8.649620056152344, + "learning_rate": 8.92126330731015e-05, + "loss": 0.021818137168884276, + "step": 76030 + }, + { + "epoch": 10.79347054648687, + "grad_norm": 9.754542350769043, + "learning_rate": 8.921121362668559e-05, + "loss": 0.02822069525718689, + "step": 76040 + }, + { + "epoch": 10.794889992902768, + "grad_norm": 1.492602825164795, + "learning_rate": 8.92097941802697e-05, + "loss": 0.007633008062839508, + "step": 76050 + }, + { + "epoch": 10.796309439318666, + "grad_norm": 0.14724503457546234, + "learning_rate": 8.92083747338538e-05, + "loss": 0.03960501551628113, + "step": 76060 + }, + { + "epoch": 10.797728885734564, + "grad_norm": 10.349737167358398, + "learning_rate": 8.92069552874379e-05, + "loss": 0.034658610820770264, + "step": 76070 + }, + { + "epoch": 10.79914833215046, + "grad_norm": 0.13395637273788452, + "learning_rate": 8.920553584102201e-05, + "loss": 0.021459685266017915, + "step": 76080 + }, + { + "epoch": 10.800567778566359, + "grad_norm": 0.9659720063209534, + "learning_rate": 8.92041163946061e-05, + "loss": 0.0231198787689209, + "step": 76090 + }, + { + "epoch": 10.801987224982257, + "grad_norm": 10.902729034423828, + "learning_rate": 8.920269694819022e-05, + "loss": 0.03658841252326965, + "step": 76100 + }, + { + "epoch": 10.803406671398154, + "grad_norm": 0.12141856551170349, + "learning_rate": 8.920127750177431e-05, + "loss": 0.030691704154014586, + "step": 76110 + }, + { + "epoch": 10.804826117814052, + "grad_norm": 0.9635015726089478, + "learning_rate": 8.919985805535841e-05, + "loss": 0.026519355177879334, + "step": 76120 + }, + { + "epoch": 10.80624556422995, + "grad_norm": 0.17087894678115845, + "learning_rate": 8.919843860894251e-05, + "loss": 0.008873078972101212, + "step": 76130 + }, + { + "epoch": 10.807665010645849, + "grad_norm": 
1.8956565856933594, + "learning_rate": 8.919701916252662e-05, + "loss": 0.0233942449092865, + "step": 76140 + }, + { + "epoch": 10.809084457061745, + "grad_norm": 0.9512972235679626, + "learning_rate": 8.919559971611072e-05, + "loss": 0.05020730495452881, + "step": 76150 + }, + { + "epoch": 10.810503903477644, + "grad_norm": 2.1291935443878174, + "learning_rate": 8.919418026969483e-05, + "loss": 0.06957405805587769, + "step": 76160 + }, + { + "epoch": 10.811923349893542, + "grad_norm": 5.251128196716309, + "learning_rate": 8.919276082327893e-05, + "loss": 0.030325111746788026, + "step": 76170 + }, + { + "epoch": 10.813342796309438, + "grad_norm": 0.18351991474628448, + "learning_rate": 8.919134137686302e-05, + "loss": 0.05944993495941162, + "step": 76180 + }, + { + "epoch": 10.814762242725337, + "grad_norm": 3.2230257987976074, + "learning_rate": 8.918992193044713e-05, + "loss": 0.027699339389801025, + "step": 76190 + }, + { + "epoch": 10.816181689141235, + "grad_norm": 9.277687072753906, + "learning_rate": 8.918850248403123e-05, + "loss": 0.030685320496559143, + "step": 76200 + }, + { + "epoch": 10.817601135557133, + "grad_norm": 8.766088485717773, + "learning_rate": 8.918708303761534e-05, + "loss": 0.018734395503997803, + "step": 76210 + }, + { + "epoch": 10.81902058197303, + "grad_norm": 4.725594997406006, + "learning_rate": 8.918566359119944e-05, + "loss": 0.04566576480865479, + "step": 76220 + }, + { + "epoch": 10.820440028388928, + "grad_norm": 0.19386504590511322, + "learning_rate": 8.918424414478354e-05, + "loss": 0.012554574012756347, + "step": 76230 + }, + { + "epoch": 10.821859474804826, + "grad_norm": 0.6063259243965149, + "learning_rate": 8.918282469836763e-05, + "loss": 0.053751158714294436, + "step": 76240 + }, + { + "epoch": 10.823278921220723, + "grad_norm": 0.2790958285331726, + "learning_rate": 8.918140525195175e-05, + "loss": 0.08109488487243652, + "step": 76250 + }, + { + "epoch": 10.824698367636621, + "grad_norm": 2.9445583820343018, + 
"learning_rate": 8.917998580553584e-05, + "loss": 0.046107107400894166, + "step": 76260 + }, + { + "epoch": 10.82611781405252, + "grad_norm": 1.1821379661560059, + "learning_rate": 8.917856635911995e-05, + "loss": 0.044123581051826476, + "step": 76270 + }, + { + "epoch": 10.827537260468418, + "grad_norm": 0.30668017268180847, + "learning_rate": 8.917714691270405e-05, + "loss": 0.028360003232955934, + "step": 76280 + }, + { + "epoch": 10.828956706884314, + "grad_norm": 0.2103549838066101, + "learning_rate": 8.917572746628815e-05, + "loss": 0.05053573250770569, + "step": 76290 + }, + { + "epoch": 10.830376153300213, + "grad_norm": 0.880430281162262, + "learning_rate": 8.917430801987226e-05, + "loss": 0.01300375759601593, + "step": 76300 + }, + { + "epoch": 10.831795599716111, + "grad_norm": 11.212428092956543, + "learning_rate": 8.917288857345636e-05, + "loss": 0.07399303913116455, + "step": 76310 + }, + { + "epoch": 10.833215046132008, + "grad_norm": 0.44175541400909424, + "learning_rate": 8.917146912704047e-05, + "loss": 0.05799729228019714, + "step": 76320 + }, + { + "epoch": 10.834634492547906, + "grad_norm": 4.154860973358154, + "learning_rate": 8.917004968062455e-05, + "loss": 0.03559871315956116, + "step": 76330 + }, + { + "epoch": 10.836053938963804, + "grad_norm": 0.07235050946474075, + "learning_rate": 8.916863023420866e-05, + "loss": 0.039110025763511656, + "step": 76340 + }, + { + "epoch": 10.837473385379703, + "grad_norm": 0.8463187217712402, + "learning_rate": 8.916721078779276e-05, + "loss": 0.05355388522148132, + "step": 76350 + }, + { + "epoch": 10.838892831795599, + "grad_norm": 2.4119479656219482, + "learning_rate": 8.916579134137687e-05, + "loss": 0.038125574588775635, + "step": 76360 + }, + { + "epoch": 10.840312278211497, + "grad_norm": 0.2792767286300659, + "learning_rate": 8.916437189496097e-05, + "loss": 0.03929415941238403, + "step": 76370 + }, + { + "epoch": 10.841731724627396, + "grad_norm": 2.039494752883911, + "learning_rate": 
8.916295244854507e-05, + "loss": 0.017711035907268524, + "step": 76380 + }, + { + "epoch": 10.843151171043292, + "grad_norm": 2.766622543334961, + "learning_rate": 8.916153300212918e-05, + "loss": 0.02152646780014038, + "step": 76390 + }, + { + "epoch": 10.84457061745919, + "grad_norm": 0.2604852318763733, + "learning_rate": 8.916011355571327e-05, + "loss": 0.05204763412475586, + "step": 76400 + }, + { + "epoch": 10.845990063875089, + "grad_norm": 0.023158498108386993, + "learning_rate": 8.915869410929739e-05, + "loss": 0.015346193313598632, + "step": 76410 + }, + { + "epoch": 10.847409510290987, + "grad_norm": 0.13105705380439758, + "learning_rate": 8.915727466288148e-05, + "loss": 0.022280707955360413, + "step": 76420 + }, + { + "epoch": 10.848828956706884, + "grad_norm": 5.168460845947266, + "learning_rate": 8.915585521646558e-05, + "loss": 0.016544350981712343, + "step": 76430 + }, + { + "epoch": 10.850248403122782, + "grad_norm": 0.03163724020123482, + "learning_rate": 8.915443577004968e-05, + "loss": 0.029596129059791566, + "step": 76440 + }, + { + "epoch": 10.85166784953868, + "grad_norm": 10.255916595458984, + "learning_rate": 8.915301632363379e-05, + "loss": 0.01805151104927063, + "step": 76450 + }, + { + "epoch": 10.853087295954577, + "grad_norm": 0.2659781277179718, + "learning_rate": 8.915159687721789e-05, + "loss": 0.03853174448013306, + "step": 76460 + }, + { + "epoch": 10.854506742370475, + "grad_norm": 12.415229797363281, + "learning_rate": 8.9150177430802e-05, + "loss": 0.0705374002456665, + "step": 76470 + }, + { + "epoch": 10.855926188786373, + "grad_norm": 6.229217529296875, + "learning_rate": 8.91487579843861e-05, + "loss": 0.04124915301799774, + "step": 76480 + }, + { + "epoch": 10.857345635202272, + "grad_norm": 0.02586524747312069, + "learning_rate": 8.914733853797019e-05, + "loss": 0.015211585164070129, + "step": 76490 + }, + { + "epoch": 10.858765081618168, + "grad_norm": 4.167519569396973, + "learning_rate": 8.91459190915543e-05, + 
"loss": 0.05534020662307739, + "step": 76500 + }, + { + "epoch": 10.858765081618168, + "eval_accuracy": 0.9865835823742608, + "eval_loss": 0.040406033396720886, + "eval_runtime": 33.5985, + "eval_samples_per_second": 468.086, + "eval_steps_per_second": 14.644, + "step": 76500 + }, + { + "epoch": 10.860184528034067, + "grad_norm": 0.2459697723388672, + "learning_rate": 8.91444996451384e-05, + "loss": 0.02314354032278061, + "step": 76510 + }, + { + "epoch": 10.861603974449965, + "grad_norm": 0.3980192244052887, + "learning_rate": 8.914308019872251e-05, + "loss": 0.020578236877918245, + "step": 76520 + }, + { + "epoch": 10.863023420865863, + "grad_norm": 1.053245186805725, + "learning_rate": 8.91416607523066e-05, + "loss": 0.05024528503417969, + "step": 76530 + }, + { + "epoch": 10.86444286728176, + "grad_norm": 13.911527633666992, + "learning_rate": 8.91402413058907e-05, + "loss": 0.08677828907966614, + "step": 76540 + }, + { + "epoch": 10.865862313697658, + "grad_norm": 0.045699913054704666, + "learning_rate": 8.91388218594748e-05, + "loss": 0.021171575784683226, + "step": 76550 + }, + { + "epoch": 10.867281760113556, + "grad_norm": 0.06769008189439774, + "learning_rate": 8.913740241305891e-05, + "loss": 0.05880331993103027, + "step": 76560 + }, + { + "epoch": 10.868701206529453, + "grad_norm": 15.066309928894043, + "learning_rate": 8.913598296664301e-05, + "loss": 0.09242464900016785, + "step": 76570 + }, + { + "epoch": 10.870120652945351, + "grad_norm": 10.06954288482666, + "learning_rate": 8.913456352022712e-05, + "loss": 0.06237143278121948, + "step": 76580 + }, + { + "epoch": 10.87154009936125, + "grad_norm": 5.549213886260986, + "learning_rate": 8.913314407381122e-05, + "loss": 0.01747702956199646, + "step": 76590 + }, + { + "epoch": 10.872959545777148, + "grad_norm": 0.04882671684026718, + "learning_rate": 8.913172462739532e-05, + "loss": 0.029430165886878967, + "step": 76600 + }, + { + "epoch": 10.874378992193044, + "grad_norm": 3.903329372406006, + 
"learning_rate": 8.913030518097943e-05, + "loss": 0.01609521061182022, + "step": 76610 + }, + { + "epoch": 10.875798438608943, + "grad_norm": 4.08885383605957, + "learning_rate": 8.912888573456352e-05, + "loss": 0.014848698675632478, + "step": 76620 + }, + { + "epoch": 10.87721788502484, + "grad_norm": 5.423572540283203, + "learning_rate": 8.912746628814764e-05, + "loss": 0.0083135724067688, + "step": 76630 + }, + { + "epoch": 10.878637331440737, + "grad_norm": 11.061261177062988, + "learning_rate": 8.912604684173172e-05, + "loss": 0.07412885427474976, + "step": 76640 + }, + { + "epoch": 10.880056777856636, + "grad_norm": 0.16122563183307648, + "learning_rate": 8.912462739531583e-05, + "loss": 0.021539703011512756, + "step": 76650 + }, + { + "epoch": 10.881476224272534, + "grad_norm": 0.2158479243516922, + "learning_rate": 8.912320794889993e-05, + "loss": 0.026331749558448792, + "step": 76660 + }, + { + "epoch": 10.882895670688432, + "grad_norm": 1.3243156671524048, + "learning_rate": 8.912178850248404e-05, + "loss": 0.02839702069759369, + "step": 76670 + }, + { + "epoch": 10.884315117104329, + "grad_norm": 8.35107707977295, + "learning_rate": 8.912036905606814e-05, + "loss": 0.05367217659950256, + "step": 76680 + }, + { + "epoch": 10.885734563520227, + "grad_norm": 2.349330425262451, + "learning_rate": 8.911894960965223e-05, + "loss": 0.02370249629020691, + "step": 76690 + }, + { + "epoch": 10.887154009936125, + "grad_norm": 0.0545620433986187, + "learning_rate": 8.911753016323634e-05, + "loss": 0.021818794310092926, + "step": 76700 + }, + { + "epoch": 10.888573456352022, + "grad_norm": 0.1560916304588318, + "learning_rate": 8.911611071682044e-05, + "loss": 0.014035370945930482, + "step": 76710 + }, + { + "epoch": 10.88999290276792, + "grad_norm": 0.5020081400871277, + "learning_rate": 8.911469127040455e-05, + "loss": 0.03221254944801331, + "step": 76720 + }, + { + "epoch": 10.891412349183819, + "grad_norm": 0.08128255605697632, + "learning_rate": 
8.911327182398865e-05, + "loss": 0.028797352313995363, + "step": 76730 + }, + { + "epoch": 10.892831795599717, + "grad_norm": 14.98328685760498, + "learning_rate": 8.911185237757275e-05, + "loss": 0.0881187915802002, + "step": 76740 + }, + { + "epoch": 10.894251242015613, + "grad_norm": 9.377324104309082, + "learning_rate": 8.911043293115685e-05, + "loss": 0.043282487988471986, + "step": 76750 + }, + { + "epoch": 10.895670688431512, + "grad_norm": 7.479063510894775, + "learning_rate": 8.910901348474096e-05, + "loss": 0.05147362947463989, + "step": 76760 + }, + { + "epoch": 10.89709013484741, + "grad_norm": 1.4757670164108276, + "learning_rate": 8.910759403832505e-05, + "loss": 0.021717017889022826, + "step": 76770 + }, + { + "epoch": 10.898509581263307, + "grad_norm": 2.818730354309082, + "learning_rate": 8.910617459190916e-05, + "loss": 0.005148597434163094, + "step": 76780 + }, + { + "epoch": 10.899929027679205, + "grad_norm": 0.34069201350212097, + "learning_rate": 8.910475514549326e-05, + "loss": 0.02241356670856476, + "step": 76790 + }, + { + "epoch": 10.901348474095103, + "grad_norm": 0.8318196535110474, + "learning_rate": 8.910333569907736e-05, + "loss": 0.022158035635948183, + "step": 76800 + }, + { + "epoch": 10.902767920511002, + "grad_norm": 0.2929559350013733, + "learning_rate": 8.910191625266147e-05, + "loss": 0.026474547386169434, + "step": 76810 + }, + { + "epoch": 10.904187366926898, + "grad_norm": 9.473808288574219, + "learning_rate": 8.910049680624557e-05, + "loss": 0.017038679122924803, + "step": 76820 + }, + { + "epoch": 10.905606813342796, + "grad_norm": 12.349149703979492, + "learning_rate": 8.909907735982968e-05, + "loss": 0.01714346408843994, + "step": 76830 + }, + { + "epoch": 10.907026259758695, + "grad_norm": 0.05128764733672142, + "learning_rate": 8.909765791341376e-05, + "loss": 0.012290577590465545, + "step": 76840 + }, + { + "epoch": 10.908445706174591, + "grad_norm": 1.4182794094085693, + "learning_rate": 8.909623846699787e-05, + 
"loss": 0.045222300291061404, + "step": 76850 + }, + { + "epoch": 10.90986515259049, + "grad_norm": 6.337584972381592, + "learning_rate": 8.909481902058197e-05, + "loss": 0.033546441793441774, + "step": 76860 + }, + { + "epoch": 10.911284599006388, + "grad_norm": 0.1292036771774292, + "learning_rate": 8.909339957416608e-05, + "loss": 0.022637271881103517, + "step": 76870 + }, + { + "epoch": 10.912704045422286, + "grad_norm": 3.266705274581909, + "learning_rate": 8.909198012775018e-05, + "loss": 0.028116098046302794, + "step": 76880 + }, + { + "epoch": 10.914123491838183, + "grad_norm": 12.7665376663208, + "learning_rate": 8.909056068133428e-05, + "loss": 0.06570132970809936, + "step": 76890 + }, + { + "epoch": 10.915542938254081, + "grad_norm": 3.6279942989349365, + "learning_rate": 8.908914123491839e-05, + "loss": 0.04003820419311523, + "step": 76900 + }, + { + "epoch": 10.91696238466998, + "grad_norm": 0.5584992170333862, + "learning_rate": 8.908772178850248e-05, + "loss": 0.05849987268447876, + "step": 76910 + }, + { + "epoch": 10.918381831085876, + "grad_norm": 6.1313629150390625, + "learning_rate": 8.90863023420866e-05, + "loss": 0.024083325266838075, + "step": 76920 + }, + { + "epoch": 10.919801277501774, + "grad_norm": 7.750735282897949, + "learning_rate": 8.908488289567069e-05, + "loss": 0.024571770429611207, + "step": 76930 + }, + { + "epoch": 10.921220723917672, + "grad_norm": 0.34011775255203247, + "learning_rate": 8.90834634492548e-05, + "loss": 0.04700807929039001, + "step": 76940 + }, + { + "epoch": 10.92264017033357, + "grad_norm": 0.03488544002175331, + "learning_rate": 8.908204400283889e-05, + "loss": 0.037614253163337705, + "step": 76950 + }, + { + "epoch": 10.924059616749467, + "grad_norm": 8.300019264221191, + "learning_rate": 8.9080624556423e-05, + "loss": 0.0336763858795166, + "step": 76960 + }, + { + "epoch": 10.925479063165366, + "grad_norm": 3.3698670864105225, + "learning_rate": 8.90792051100071e-05, + "loss": 0.04678107500076294, + 
"step": 76970 + }, + { + "epoch": 10.926898509581264, + "grad_norm": 0.13484854996204376, + "learning_rate": 8.90777856635912e-05, + "loss": 0.03645238280296326, + "step": 76980 + }, + { + "epoch": 10.92831795599716, + "grad_norm": 2.7829763889312744, + "learning_rate": 8.907636621717532e-05, + "loss": 0.03553824722766876, + "step": 76990 + }, + { + "epoch": 10.929737402413059, + "grad_norm": 0.27800244092941284, + "learning_rate": 8.90749467707594e-05, + "loss": 0.04834374189376831, + "step": 77000 + }, + { + "epoch": 10.929737402413059, + "eval_accuracy": 0.9841673555032746, + "eval_loss": 0.04912128299474716, + "eval_runtime": 32.9222, + "eval_samples_per_second": 477.702, + "eval_steps_per_second": 14.944, + "step": 77000 + }, + { + "epoch": 10.931156848828957, + "grad_norm": 2.2467801570892334, + "learning_rate": 8.907352732434351e-05, + "loss": 0.051036447286605835, + "step": 77010 + }, + { + "epoch": 10.932576295244855, + "grad_norm": 0.80274498462677, + "learning_rate": 8.907210787792761e-05, + "loss": 0.020682474970817565, + "step": 77020 + }, + { + "epoch": 10.933995741660752, + "grad_norm": 5.00384521484375, + "learning_rate": 8.907068843151172e-05, + "loss": 0.020825859904289246, + "step": 77030 + }, + { + "epoch": 10.93541518807665, + "grad_norm": 1.326248049736023, + "learning_rate": 8.906926898509582e-05, + "loss": 0.0339616596698761, + "step": 77040 + }, + { + "epoch": 10.936834634492548, + "grad_norm": 0.44627857208251953, + "learning_rate": 8.906784953867992e-05, + "loss": 0.01936686933040619, + "step": 77050 + }, + { + "epoch": 10.938254080908445, + "grad_norm": 2.1883907318115234, + "learning_rate": 8.906643009226401e-05, + "loss": 0.023100431263446807, + "step": 77060 + }, + { + "epoch": 10.939673527324343, + "grad_norm": 4.381327152252197, + "learning_rate": 8.906501064584812e-05, + "loss": 0.051467007398605345, + "step": 77070 + }, + { + "epoch": 10.941092973740242, + "grad_norm": 0.08656803518533707, + "learning_rate": 8.906359119943223e-05, 
+ "loss": 0.07422645092010498, + "step": 77080 + }, + { + "epoch": 10.94251242015614, + "grad_norm": 1.2520685195922852, + "learning_rate": 8.906217175301633e-05, + "loss": 0.023004311323165893, + "step": 77090 + }, + { + "epoch": 10.943931866572036, + "grad_norm": 0.6970699429512024, + "learning_rate": 8.906075230660043e-05, + "loss": 0.04289089739322662, + "step": 77100 + }, + { + "epoch": 10.945351312987935, + "grad_norm": 1.45811927318573, + "learning_rate": 8.905933286018453e-05, + "loss": 0.050782245397567746, + "step": 77110 + }, + { + "epoch": 10.946770759403833, + "grad_norm": 0.21888741850852966, + "learning_rate": 8.905791341376864e-05, + "loss": 0.005652286112308502, + "step": 77120 + }, + { + "epoch": 10.94819020581973, + "grad_norm": 1.4828088283538818, + "learning_rate": 8.905663591199432e-05, + "loss": 0.037698855996131896, + "step": 77130 + }, + { + "epoch": 10.949609652235628, + "grad_norm": 0.18355588614940643, + "learning_rate": 8.905521646557843e-05, + "loss": 0.02868366539478302, + "step": 77140 + }, + { + "epoch": 10.951029098651526, + "grad_norm": 3.372123956680298, + "learning_rate": 8.905379701916253e-05, + "loss": 0.03374863862991333, + "step": 77150 + }, + { + "epoch": 10.952448545067424, + "grad_norm": 4.836116790771484, + "learning_rate": 8.905237757274664e-05, + "loss": 0.05168530941009521, + "step": 77160 + }, + { + "epoch": 10.953867991483321, + "grad_norm": 5.375553131103516, + "learning_rate": 8.905095812633073e-05, + "loss": 0.11784226894378662, + "step": 77170 + }, + { + "epoch": 10.95528743789922, + "grad_norm": 0.08931706100702286, + "learning_rate": 8.904953867991484e-05, + "loss": 0.0237982839345932, + "step": 77180 + }, + { + "epoch": 10.956706884315118, + "grad_norm": 0.5286836624145508, + "learning_rate": 8.904811923349893e-05, + "loss": 0.031697696447372435, + "step": 77190 + }, + { + "epoch": 10.958126330731014, + "grad_norm": 0.030751101672649384, + "learning_rate": 8.904669978708304e-05, + "loss": 
0.038225984573364256, + "step": 77200 + }, + { + "epoch": 10.959545777146912, + "grad_norm": 0.2199520617723465, + "learning_rate": 8.904528034066714e-05, + "loss": 0.014718365669250489, + "step": 77210 + }, + { + "epoch": 10.96096522356281, + "grad_norm": 0.08655641973018646, + "learning_rate": 8.904386089425124e-05, + "loss": 0.03180850744247436, + "step": 77220 + }, + { + "epoch": 10.962384669978709, + "grad_norm": 7.156330585479736, + "learning_rate": 8.904244144783535e-05, + "loss": 0.02334403395652771, + "step": 77230 + }, + { + "epoch": 10.963804116394606, + "grad_norm": 9.984768867492676, + "learning_rate": 8.904102200141945e-05, + "loss": 0.040103816986083986, + "step": 77240 + }, + { + "epoch": 10.965223562810504, + "grad_norm": 7.9748640060424805, + "learning_rate": 8.903960255500356e-05, + "loss": 0.04069978296756745, + "step": 77250 + }, + { + "epoch": 10.966643009226402, + "grad_norm": 7.155030727386475, + "learning_rate": 8.903818310858766e-05, + "loss": 0.022116178274154664, + "step": 77260 + }, + { + "epoch": 10.968062455642299, + "grad_norm": 0.7471809387207031, + "learning_rate": 8.903676366217177e-05, + "loss": 0.01585453748703003, + "step": 77270 + }, + { + "epoch": 10.969481902058197, + "grad_norm": 7.733248233795166, + "learning_rate": 8.903534421575585e-05, + "loss": 0.06696531176567078, + "step": 77280 + }, + { + "epoch": 10.970901348474095, + "grad_norm": 0.08299662172794342, + "learning_rate": 8.903392476933996e-05, + "loss": 0.012213122844696046, + "step": 77290 + }, + { + "epoch": 10.972320794889994, + "grad_norm": 4.243612766265869, + "learning_rate": 8.903250532292406e-05, + "loss": 0.018444839119911193, + "step": 77300 + }, + { + "epoch": 10.97374024130589, + "grad_norm": 4.067358016967773, + "learning_rate": 8.903108587650817e-05, + "loss": 0.0302010178565979, + "step": 77310 + }, + { + "epoch": 10.975159687721789, + "grad_norm": 3.4205238819122314, + "learning_rate": 8.902966643009227e-05, + "loss": 0.011153788864612579, + "step": 
77320 + }, + { + "epoch": 10.976579134137687, + "grad_norm": 0.927170991897583, + "learning_rate": 8.902824698367636e-05, + "loss": 0.025216352939605714, + "step": 77330 + }, + { + "epoch": 10.977998580553583, + "grad_norm": 1.6161059141159058, + "learning_rate": 8.902682753726048e-05, + "loss": 0.05900951623916626, + "step": 77340 + }, + { + "epoch": 10.979418026969482, + "grad_norm": 0.03316309675574303, + "learning_rate": 8.902540809084457e-05, + "loss": 0.08623931407928467, + "step": 77350 + }, + { + "epoch": 10.98083747338538, + "grad_norm": 0.952528178691864, + "learning_rate": 8.902398864442868e-05, + "loss": 0.0170826256275177, + "step": 77360 + }, + { + "epoch": 10.982256919801278, + "grad_norm": 2.0128047466278076, + "learning_rate": 8.902256919801278e-05, + "loss": 0.03770047724246979, + "step": 77370 + }, + { + "epoch": 10.983676366217175, + "grad_norm": 0.32875731587409973, + "learning_rate": 8.902114975159688e-05, + "loss": 0.03144612908363342, + "step": 77380 + }, + { + "epoch": 10.985095812633073, + "grad_norm": 1.2215029001235962, + "learning_rate": 8.901973030518098e-05, + "loss": 0.04773979187011719, + "step": 77390 + }, + { + "epoch": 10.986515259048971, + "grad_norm": 6.523492336273193, + "learning_rate": 8.901831085876509e-05, + "loss": 0.035741502046585084, + "step": 77400 + }, + { + "epoch": 10.987934705464868, + "grad_norm": 11.67943000793457, + "learning_rate": 8.901689141234918e-05, + "loss": 0.022857260704040528, + "step": 77410 + }, + { + "epoch": 10.989354151880766, + "grad_norm": 0.12591129541397095, + "learning_rate": 8.90154719659333e-05, + "loss": 0.03355528712272644, + "step": 77420 + }, + { + "epoch": 10.990773598296665, + "grad_norm": 7.616695880889893, + "learning_rate": 8.901405251951739e-05, + "loss": 0.08523765802383423, + "step": 77430 + }, + { + "epoch": 10.992193044712563, + "grad_norm": 19.298355102539062, + "learning_rate": 8.901263307310149e-05, + "loss": 0.06812095642089844, + "step": 77440 + }, + { + "epoch": 
10.99361249112846, + "grad_norm": 7.188718795776367, + "learning_rate": 8.90112136266856e-05, + "loss": 0.05898793935775757, + "step": 77450 + }, + { + "epoch": 10.995031937544358, + "grad_norm": 0.2391853779554367, + "learning_rate": 8.90097941802697e-05, + "loss": 0.012519893050193787, + "step": 77460 + }, + { + "epoch": 10.996451383960256, + "grad_norm": 0.12945179641246796, + "learning_rate": 8.900837473385381e-05, + "loss": 0.014145855605602265, + "step": 77470 + }, + { + "epoch": 10.997870830376153, + "grad_norm": 0.19085891544818878, + "learning_rate": 8.90069552874379e-05, + "loss": 0.015647728741168977, + "step": 77480 + }, + { + "epoch": 10.99929027679205, + "grad_norm": 0.04224420338869095, + "learning_rate": 8.9005535841022e-05, + "loss": 0.01942497193813324, + "step": 77490 + }, + { + "epoch": 11.00070972320795, + "grad_norm": 0.024194825440645218, + "learning_rate": 8.90041163946061e-05, + "loss": 0.01477721482515335, + "step": 77500 + }, + { + "epoch": 11.00070972320795, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.04882017523050308, + "eval_runtime": 32.218, + "eval_samples_per_second": 488.143, + "eval_steps_per_second": 15.271, + "step": 77500 + }, + { + "epoch": 11.002129169623847, + "grad_norm": 1.208798885345459, + "learning_rate": 8.900269694819021e-05, + "loss": 0.016632518172264098, + "step": 77510 + }, + { + "epoch": 11.003548616039744, + "grad_norm": 5.06233024597168, + "learning_rate": 8.900127750177431e-05, + "loss": 0.03450791537761688, + "step": 77520 + }, + { + "epoch": 11.004968062455642, + "grad_norm": 5.087305545806885, + "learning_rate": 8.899985805535841e-05, + "loss": 0.04986914396286011, + "step": 77530 + }, + { + "epoch": 11.00638750887154, + "grad_norm": 10.168726921081543, + "learning_rate": 8.899843860894252e-05, + "loss": 0.040247094631195066, + "step": 77540 + }, + { + "epoch": 11.007806955287437, + "grad_norm": 0.219258114695549, + "learning_rate": 8.899701916252662e-05, + "loss": 0.006717376410961151, + 
"step": 77550 + }, + { + "epoch": 11.009226401703335, + "grad_norm": 0.42845240235328674, + "learning_rate": 8.899559971611073e-05, + "loss": 0.022232869267463685, + "step": 77560 + }, + { + "epoch": 11.010645848119234, + "grad_norm": 1.3312067985534668, + "learning_rate": 8.899418026969482e-05, + "loss": 0.020168834924697877, + "step": 77570 + }, + { + "epoch": 11.012065294535132, + "grad_norm": 0.6087434887886047, + "learning_rate": 8.899276082327892e-05, + "loss": 0.012365716695785522, + "step": 77580 + }, + { + "epoch": 11.013484740951029, + "grad_norm": 1.740855097770691, + "learning_rate": 8.899134137686302e-05, + "loss": 0.00474800281226635, + "step": 77590 + }, + { + "epoch": 11.014904187366927, + "grad_norm": 2.4797630310058594, + "learning_rate": 8.898992193044713e-05, + "loss": 0.021309559047222138, + "step": 77600 + }, + { + "epoch": 11.016323633782825, + "grad_norm": 0.2835226058959961, + "learning_rate": 8.898850248403123e-05, + "loss": 0.004374232143163681, + "step": 77610 + }, + { + "epoch": 11.017743080198722, + "grad_norm": 0.2921813130378723, + "learning_rate": 8.898708303761534e-05, + "loss": 0.01224185824394226, + "step": 77620 + }, + { + "epoch": 11.01916252661462, + "grad_norm": 0.12308903783559799, + "learning_rate": 8.898566359119944e-05, + "loss": 0.014115402102470398, + "step": 77630 + }, + { + "epoch": 11.020581973030518, + "grad_norm": 1.0105446577072144, + "learning_rate": 8.898424414478353e-05, + "loss": 0.02689531445503235, + "step": 77640 + }, + { + "epoch": 11.022001419446417, + "grad_norm": 1.195181965827942, + "learning_rate": 8.898282469836764e-05, + "loss": 0.041376516222953796, + "step": 77650 + }, + { + "epoch": 11.023420865862313, + "grad_norm": 2.656174659729004, + "learning_rate": 8.898140525195174e-05, + "loss": 0.025141558051109313, + "step": 77660 + }, + { + "epoch": 11.024840312278211, + "grad_norm": 0.2554796040058136, + "learning_rate": 8.897998580553585e-05, + "loss": 0.030996525287628175, + "step": 77670 + }, + { + 
"epoch": 11.02625975869411, + "grad_norm": 2.055675506591797, + "learning_rate": 8.897856635911995e-05, + "loss": 0.00680890753865242, + "step": 77680 + }, + { + "epoch": 11.027679205110006, + "grad_norm": 5.830803871154785, + "learning_rate": 8.897714691270405e-05, + "loss": 0.09050383567810058, + "step": 77690 + }, + { + "epoch": 11.029098651525905, + "grad_norm": 2.9373791217803955, + "learning_rate": 8.897572746628814e-05, + "loss": 0.031220877170562746, + "step": 77700 + }, + { + "epoch": 11.030518097941803, + "grad_norm": 0.3388267159461975, + "learning_rate": 8.897430801987225e-05, + "loss": 0.026630723476409913, + "step": 77710 + }, + { + "epoch": 11.031937544357701, + "grad_norm": 0.21325798332691193, + "learning_rate": 8.897288857345635e-05, + "loss": 0.062087488174438474, + "step": 77720 + }, + { + "epoch": 11.033356990773598, + "grad_norm": 0.05454428121447563, + "learning_rate": 8.897146912704046e-05, + "loss": 0.08088703751564026, + "step": 77730 + }, + { + "epoch": 11.034776437189496, + "grad_norm": 2.0780720710754395, + "learning_rate": 8.897004968062456e-05, + "loss": 0.029281583428382874, + "step": 77740 + }, + { + "epoch": 11.036195883605394, + "grad_norm": 0.6183001399040222, + "learning_rate": 8.896863023420866e-05, + "loss": 0.027968209981918336, + "step": 77750 + }, + { + "epoch": 11.037615330021291, + "grad_norm": 1.7325291633605957, + "learning_rate": 8.896721078779277e-05, + "loss": 0.04854607284069061, + "step": 77760 + }, + { + "epoch": 11.03903477643719, + "grad_norm": 3.145956516265869, + "learning_rate": 8.896579134137687e-05, + "loss": 0.0039848946034908295, + "step": 77770 + }, + { + "epoch": 11.040454222853088, + "grad_norm": 0.12728923559188843, + "learning_rate": 8.896437189496098e-05, + "loss": 0.019792550802230836, + "step": 77780 + }, + { + "epoch": 11.041873669268986, + "grad_norm": 1.0950226783752441, + "learning_rate": 8.896295244854506e-05, + "loss": 0.033737349510192874, + "step": 77790 + }, + { + "epoch": 
11.043293115684882, + "grad_norm": 3.4214136600494385, + "learning_rate": 8.896153300212917e-05, + "loss": 0.009793543070554734, + "step": 77800 + }, + { + "epoch": 11.04471256210078, + "grad_norm": 0.49779754877090454, + "learning_rate": 8.896011355571327e-05, + "loss": 0.009086959809064866, + "step": 77810 + }, + { + "epoch": 11.046132008516679, + "grad_norm": 0.01435944065451622, + "learning_rate": 8.895869410929738e-05, + "loss": 0.02067076861858368, + "step": 77820 + }, + { + "epoch": 11.047551454932576, + "grad_norm": 5.94788122177124, + "learning_rate": 8.895727466288149e-05, + "loss": 0.011896288394927979, + "step": 77830 + }, + { + "epoch": 11.048970901348474, + "grad_norm": 0.29182007908821106, + "learning_rate": 8.895585521646558e-05, + "loss": 0.026392003893852232, + "step": 77840 + }, + { + "epoch": 11.050390347764372, + "grad_norm": 0.21065226197242737, + "learning_rate": 8.895443577004969e-05, + "loss": 0.028645521402359007, + "step": 77850 + }, + { + "epoch": 11.05180979418027, + "grad_norm": 0.05954265221953392, + "learning_rate": 8.895301632363378e-05, + "loss": 0.010045936703681946, + "step": 77860 + }, + { + "epoch": 11.053229240596167, + "grad_norm": 0.0884803906083107, + "learning_rate": 8.89515968772179e-05, + "loss": 0.011941583454608917, + "step": 77870 + }, + { + "epoch": 11.054648687012065, + "grad_norm": 0.1329718977212906, + "learning_rate": 8.895017743080199e-05, + "loss": 0.010423028469085693, + "step": 77880 + }, + { + "epoch": 11.056068133427964, + "grad_norm": 0.2441541701555252, + "learning_rate": 8.894875798438609e-05, + "loss": 0.002589988708496094, + "step": 77890 + }, + { + "epoch": 11.05748757984386, + "grad_norm": 0.7030231356620789, + "learning_rate": 8.894733853797019e-05, + "loss": 0.005089676007628441, + "step": 77900 + }, + { + "epoch": 11.058907026259758, + "grad_norm": 0.0806090235710144, + "learning_rate": 8.89459190915543e-05, + "loss": 0.007579062879085541, + "step": 77910 + }, + { + "epoch": 11.060326472675657, + 
"grad_norm": 3.0792415142059326, + "learning_rate": 8.894449964513841e-05, + "loss": 0.004580720514059067, + "step": 77920 + }, + { + "epoch": 11.061745919091555, + "grad_norm": 0.10120502859354019, + "learning_rate": 8.89430801987225e-05, + "loss": 0.007818933576345444, + "step": 77930 + }, + { + "epoch": 11.063165365507452, + "grad_norm": 0.13891956210136414, + "learning_rate": 8.89416607523066e-05, + "loss": 0.020057034492492676, + "step": 77940 + }, + { + "epoch": 11.06458481192335, + "grad_norm": 0.9457455277442932, + "learning_rate": 8.89402413058907e-05, + "loss": 0.03999852836132049, + "step": 77950 + }, + { + "epoch": 11.066004258339248, + "grad_norm": 0.043018776923418045, + "learning_rate": 8.893882185947481e-05, + "loss": 0.02448986768722534, + "step": 77960 + }, + { + "epoch": 11.067423704755145, + "grad_norm": 0.0917859748005867, + "learning_rate": 8.893740241305891e-05, + "loss": 0.04967025220394135, + "step": 77970 + }, + { + "epoch": 11.068843151171043, + "grad_norm": 0.42049574851989746, + "learning_rate": 8.893598296664302e-05, + "loss": 0.02361696809530258, + "step": 77980 + }, + { + "epoch": 11.070262597586941, + "grad_norm": 0.10781079530715942, + "learning_rate": 8.893456352022712e-05, + "loss": 0.04162313342094422, + "step": 77990 + }, + { + "epoch": 11.07168204400284, + "grad_norm": 4.091378688812256, + "learning_rate": 8.893314407381121e-05, + "loss": 0.006732091307640076, + "step": 78000 + }, + { + "epoch": 11.07168204400284, + "eval_accuracy": 0.9848032046798499, + "eval_loss": 0.05146849900484085, + "eval_runtime": 34.0197, + "eval_samples_per_second": 462.291, + "eval_steps_per_second": 14.462, + "step": 78000 + }, + { + "epoch": 11.073101490418736, + "grad_norm": 0.03186691179871559, + "learning_rate": 8.893172462739533e-05, + "loss": 0.020208078622817992, + "step": 78010 + }, + { + "epoch": 11.074520936834634, + "grad_norm": 0.0380316786468029, + "learning_rate": 8.893030518097942e-05, + "loss": 0.01988564133644104, + "step": 78020 + 
}, + { + "epoch": 11.075940383250533, + "grad_norm": 0.009216410107910633, + "learning_rate": 8.892888573456353e-05, + "loss": 0.03737284243106842, + "step": 78030 + }, + { + "epoch": 11.07735982966643, + "grad_norm": 0.7872573733329773, + "learning_rate": 8.892746628814763e-05, + "loss": 0.025534018874168396, + "step": 78040 + }, + { + "epoch": 11.078779276082328, + "grad_norm": 0.015327519737184048, + "learning_rate": 8.892604684173173e-05, + "loss": 0.0050436832010746, + "step": 78050 + }, + { + "epoch": 11.080198722498226, + "grad_norm": 10.065001487731934, + "learning_rate": 8.892462739531583e-05, + "loss": 0.022689974308013915, + "step": 78060 + }, + { + "epoch": 11.081618168914124, + "grad_norm": 14.562976837158203, + "learning_rate": 8.892320794889994e-05, + "loss": 0.024745500087738036, + "step": 78070 + }, + { + "epoch": 11.08303761533002, + "grad_norm": 5.8700737953186035, + "learning_rate": 8.892178850248403e-05, + "loss": 0.03570305109024048, + "step": 78080 + }, + { + "epoch": 11.084457061745919, + "grad_norm": 0.8238467574119568, + "learning_rate": 8.892036905606814e-05, + "loss": 0.025160768628120424, + "step": 78090 + }, + { + "epoch": 11.085876508161817, + "grad_norm": 3.493910312652588, + "learning_rate": 8.891894960965224e-05, + "loss": 0.08694150447845458, + "step": 78100 + }, + { + "epoch": 11.087295954577714, + "grad_norm": 3.4150233268737793, + "learning_rate": 8.891753016323634e-05, + "loss": 0.020310258865356444, + "step": 78110 + }, + { + "epoch": 11.088715400993612, + "grad_norm": 14.740667343139648, + "learning_rate": 8.891611071682045e-05, + "loss": 0.030592340230941772, + "step": 78120 + }, + { + "epoch": 11.09013484740951, + "grad_norm": 7.057147026062012, + "learning_rate": 8.891469127040455e-05, + "loss": 0.026824843883514405, + "step": 78130 + }, + { + "epoch": 11.091554293825409, + "grad_norm": 0.21818548440933228, + "learning_rate": 8.891327182398866e-05, + "loss": 0.05211725831031799, + "step": 78140 + }, + { + "epoch": 
11.092973740241305, + "grad_norm": 7.997917175292969, + "learning_rate": 8.891185237757274e-05, + "loss": 0.04818741977214813, + "step": 78150 + }, + { + "epoch": 11.094393186657204, + "grad_norm": 0.17914900183677673, + "learning_rate": 8.891043293115685e-05, + "loss": 0.018161210417747497, + "step": 78160 + }, + { + "epoch": 11.095812633073102, + "grad_norm": 0.2530238926410675, + "learning_rate": 8.890901348474095e-05, + "loss": 0.008942224830389024, + "step": 78170 + }, + { + "epoch": 11.097232079488998, + "grad_norm": 0.2684612274169922, + "learning_rate": 8.890759403832506e-05, + "loss": 0.016461795568466185, + "step": 78180 + }, + { + "epoch": 11.098651525904897, + "grad_norm": 7.170384883880615, + "learning_rate": 8.890617459190916e-05, + "loss": 0.031466320157051086, + "step": 78190 + }, + { + "epoch": 11.100070972320795, + "grad_norm": 0.10846813768148422, + "learning_rate": 8.890475514549326e-05, + "loss": 0.04786921739578247, + "step": 78200 + }, + { + "epoch": 11.101490418736693, + "grad_norm": 3.167357921600342, + "learning_rate": 8.890333569907737e-05, + "loss": 0.023188070952892305, + "step": 78210 + }, + { + "epoch": 11.10290986515259, + "grad_norm": 0.05991131067276001, + "learning_rate": 8.890191625266147e-05, + "loss": 0.047985118627548215, + "step": 78220 + }, + { + "epoch": 11.104329311568488, + "grad_norm": 9.863299369812012, + "learning_rate": 8.890049680624558e-05, + "loss": 0.0429348349571228, + "step": 78230 + }, + { + "epoch": 11.105748757984387, + "grad_norm": 12.251611709594727, + "learning_rate": 8.889907735982967e-05, + "loss": 0.035148638486862185, + "step": 78240 + }, + { + "epoch": 11.107168204400283, + "grad_norm": 0.5501163601875305, + "learning_rate": 8.889765791341377e-05, + "loss": 0.005860616266727447, + "step": 78250 + }, + { + "epoch": 11.108587650816181, + "grad_norm": 7.741093158721924, + "learning_rate": 8.889623846699787e-05, + "loss": 0.01935284584760666, + "step": 78260 + }, + { + "epoch": 11.11000709723208, + 
"grad_norm": 0.04390472546219826, + "learning_rate": 8.889481902058198e-05, + "loss": 0.03044103682041168, + "step": 78270 + }, + { + "epoch": 11.111426543647978, + "grad_norm": 0.007192371413111687, + "learning_rate": 8.889339957416608e-05, + "loss": 0.018576528131961822, + "step": 78280 + }, + { + "epoch": 11.112845990063875, + "grad_norm": 0.30019840598106384, + "learning_rate": 8.889198012775019e-05, + "loss": 0.05336242914199829, + "step": 78290 + }, + { + "epoch": 11.114265436479773, + "grad_norm": 0.11804159730672836, + "learning_rate": 8.889056068133428e-05, + "loss": 0.054386216402053836, + "step": 78300 + }, + { + "epoch": 11.115684882895671, + "grad_norm": 1.0424158573150635, + "learning_rate": 8.888914123491838e-05, + "loss": 0.019771167635917665, + "step": 78310 + }, + { + "epoch": 11.117104329311568, + "grad_norm": 3.4843974113464355, + "learning_rate": 8.888772178850249e-05, + "loss": 0.014606226980686188, + "step": 78320 + }, + { + "epoch": 11.118523775727466, + "grad_norm": 0.32944443821907043, + "learning_rate": 8.888630234208659e-05, + "loss": 0.048432594537734984, + "step": 78330 + }, + { + "epoch": 11.119943222143364, + "grad_norm": 10.374595642089844, + "learning_rate": 8.88848828956707e-05, + "loss": 0.055147755146026614, + "step": 78340 + }, + { + "epoch": 11.121362668559263, + "grad_norm": 4.440117835998535, + "learning_rate": 8.88834634492548e-05, + "loss": 0.0403089314699173, + "step": 78350 + }, + { + "epoch": 11.12278211497516, + "grad_norm": 0.6370695233345032, + "learning_rate": 8.88820440028389e-05, + "loss": 0.021577592194080352, + "step": 78360 + }, + { + "epoch": 11.124201561391057, + "grad_norm": 0.009489811025559902, + "learning_rate": 8.8880624556423e-05, + "loss": 0.0039813734591007234, + "step": 78370 + }, + { + "epoch": 11.125621007806956, + "grad_norm": 10.565815925598145, + "learning_rate": 8.88792051100071e-05, + "loss": 0.03302794098854065, + "step": 78380 + }, + { + "epoch": 11.127040454222852, + "grad_norm": 
0.8729338645935059, + "learning_rate": 8.88777856635912e-05, + "loss": 0.004103229567408562, + "step": 78390 + }, + { + "epoch": 11.12845990063875, + "grad_norm": 0.12913577258586884, + "learning_rate": 8.887636621717531e-05, + "loss": 0.023288239538669587, + "step": 78400 + }, + { + "epoch": 11.129879347054649, + "grad_norm": 0.22171258926391602, + "learning_rate": 8.887494677075941e-05, + "loss": 0.013404671847820283, + "step": 78410 + }, + { + "epoch": 11.131298793470547, + "grad_norm": 0.15066424012184143, + "learning_rate": 8.887352732434351e-05, + "loss": 0.009888184815645218, + "step": 78420 + }, + { + "epoch": 11.132718239886444, + "grad_norm": 0.3783220052719116, + "learning_rate": 8.887210787792762e-05, + "loss": 0.02776823043823242, + "step": 78430 + }, + { + "epoch": 11.134137686302342, + "grad_norm": 1.576729416847229, + "learning_rate": 8.887068843151172e-05, + "loss": 0.023171845078468322, + "step": 78440 + }, + { + "epoch": 11.13555713271824, + "grad_norm": 0.04876582324504852, + "learning_rate": 8.886926898509583e-05, + "loss": 0.05620192885398865, + "step": 78450 + }, + { + "epoch": 11.136976579134137, + "grad_norm": 4.442808628082275, + "learning_rate": 8.886784953867991e-05, + "loss": 0.02469184249639511, + "step": 78460 + }, + { + "epoch": 11.138396025550035, + "grad_norm": 1.914604663848877, + "learning_rate": 8.886643009226402e-05, + "loss": 0.008845466375350951, + "step": 78470 + }, + { + "epoch": 11.139815471965933, + "grad_norm": 0.0457342192530632, + "learning_rate": 8.886501064584812e-05, + "loss": 0.039513444900512694, + "step": 78480 + }, + { + "epoch": 11.141234918381832, + "grad_norm": 0.19912536442279816, + "learning_rate": 8.886359119943223e-05, + "loss": 0.08067465424537659, + "step": 78490 + }, + { + "epoch": 11.142654364797728, + "grad_norm": 6.148006439208984, + "learning_rate": 8.886217175301633e-05, + "loss": 0.036643752455711366, + "step": 78500 + }, + { + "epoch": 11.142654364797728, + "eval_accuracy": 0.9822598079735487, + 
"eval_loss": 0.057572051882743835, + "eval_runtime": 33.7716, + "eval_samples_per_second": 465.687, + "eval_steps_per_second": 14.568, + "step": 78500 + }, + { + "epoch": 11.144073811213627, + "grad_norm": 1.614670991897583, + "learning_rate": 8.886075230660042e-05, + "loss": 0.01014809012413025, + "step": 78510 + }, + { + "epoch": 11.145493257629525, + "grad_norm": 0.3039005696773529, + "learning_rate": 8.885933286018454e-05, + "loss": 0.03332527279853821, + "step": 78520 + }, + { + "epoch": 11.146912704045421, + "grad_norm": 10.968552589416504, + "learning_rate": 8.885791341376863e-05, + "loss": 0.06424198150634766, + "step": 78530 + }, + { + "epoch": 11.14833215046132, + "grad_norm": 0.9706463813781738, + "learning_rate": 8.885649396735274e-05, + "loss": 0.02765073478221893, + "step": 78540 + }, + { + "epoch": 11.149751596877218, + "grad_norm": 1.7395691871643066, + "learning_rate": 8.885507452093684e-05, + "loss": 0.023248912394046785, + "step": 78550 + }, + { + "epoch": 11.151171043293116, + "grad_norm": 0.9122004508972168, + "learning_rate": 8.885365507452094e-05, + "loss": 0.012182090431451797, + "step": 78560 + }, + { + "epoch": 11.152590489709013, + "grad_norm": 2.4857709407806396, + "learning_rate": 8.885223562810504e-05, + "loss": 0.052464467287063596, + "step": 78570 + }, + { + "epoch": 11.154009936124911, + "grad_norm": 0.9637348055839539, + "learning_rate": 8.885081618168915e-05, + "loss": 0.04379349946975708, + "step": 78580 + }, + { + "epoch": 11.15542938254081, + "grad_norm": 13.401801109313965, + "learning_rate": 8.884939673527324e-05, + "loss": 0.08828679919242859, + "step": 78590 + }, + { + "epoch": 11.156848828956706, + "grad_norm": 8.565984725952148, + "learning_rate": 8.884797728885736e-05, + "loss": 0.059217429161071776, + "step": 78600 + }, + { + "epoch": 11.158268275372604, + "grad_norm": 1.7600605487823486, + "learning_rate": 8.884655784244145e-05, + "loss": 0.010065761208534241, + "step": 78610 + }, + { + "epoch": 11.159687721788503, + 
"grad_norm": 0.9549081325531006, + "learning_rate": 8.884513839602555e-05, + "loss": 0.019970996677875517, + "step": 78620 + }, + { + "epoch": 11.161107168204401, + "grad_norm": 0.03332928195595741, + "learning_rate": 8.884371894960966e-05, + "loss": 0.009172002971172332, + "step": 78630 + }, + { + "epoch": 11.162526614620297, + "grad_norm": 0.23457317054271698, + "learning_rate": 8.884229950319376e-05, + "loss": 0.03987969756126404, + "step": 78640 + }, + { + "epoch": 11.163946061036196, + "grad_norm": 0.09130463004112244, + "learning_rate": 8.884088005677787e-05, + "loss": 0.005275613814592362, + "step": 78650 + }, + { + "epoch": 11.165365507452094, + "grad_norm": 0.07525503635406494, + "learning_rate": 8.883946061036195e-05, + "loss": 0.03904627561569214, + "step": 78660 + }, + { + "epoch": 11.16678495386799, + "grad_norm": 1.4086534976959229, + "learning_rate": 8.883804116394606e-05, + "loss": 0.01731160581111908, + "step": 78670 + }, + { + "epoch": 11.168204400283889, + "grad_norm": 4.701272487640381, + "learning_rate": 8.883662171753016e-05, + "loss": 0.022493749856948853, + "step": 78680 + }, + { + "epoch": 11.169623846699787, + "grad_norm": 0.021906519308686256, + "learning_rate": 8.883520227111427e-05, + "loss": 0.0666272759437561, + "step": 78690 + }, + { + "epoch": 11.171043293115686, + "grad_norm": 3.216141700744629, + "learning_rate": 8.883378282469837e-05, + "loss": 0.0164662629365921, + "step": 78700 + }, + { + "epoch": 11.172462739531582, + "grad_norm": 10.842391014099121, + "learning_rate": 8.883236337828248e-05, + "loss": 0.030035099387168883, + "step": 78710 + }, + { + "epoch": 11.17388218594748, + "grad_norm": 7.16567325592041, + "learning_rate": 8.883094393186658e-05, + "loss": 0.013143278658390045, + "step": 78720 + }, + { + "epoch": 11.175301632363379, + "grad_norm": 0.0488220676779747, + "learning_rate": 8.882952448545068e-05, + "loss": 0.049586433172225955, + "step": 78730 + }, + { + "epoch": 11.176721078779275, + "grad_norm": 
0.45880597829818726, + "learning_rate": 8.882810503903479e-05, + "loss": 0.004569170251488686, + "step": 78740 + }, + { + "epoch": 11.178140525195174, + "grad_norm": 13.114766120910645, + "learning_rate": 8.882668559261888e-05, + "loss": 0.02692474126815796, + "step": 78750 + }, + { + "epoch": 11.179559971611072, + "grad_norm": 8.099347114562988, + "learning_rate": 8.8825266146203e-05, + "loss": 0.042691168189048764, + "step": 78760 + }, + { + "epoch": 11.18097941802697, + "grad_norm": 0.07002486288547516, + "learning_rate": 8.882384669978708e-05, + "loss": 0.00397140234708786, + "step": 78770 + }, + { + "epoch": 11.182398864442867, + "grad_norm": 0.8963044881820679, + "learning_rate": 8.882242725337119e-05, + "loss": 0.034300926327705386, + "step": 78780 + }, + { + "epoch": 11.183818310858765, + "grad_norm": 0.19894537329673767, + "learning_rate": 8.882100780695529e-05, + "loss": 0.010814374685287476, + "step": 78790 + }, + { + "epoch": 11.185237757274663, + "grad_norm": 7.048020839691162, + "learning_rate": 8.88195883605394e-05, + "loss": 0.03981360495090484, + "step": 78800 + }, + { + "epoch": 11.18665720369056, + "grad_norm": 7.870205879211426, + "learning_rate": 8.88181689141235e-05, + "loss": 0.020268261432647705, + "step": 78810 + }, + { + "epoch": 11.188076650106458, + "grad_norm": 7.8461594581604, + "learning_rate": 8.881674946770759e-05, + "loss": 0.06974129676818848, + "step": 78820 + }, + { + "epoch": 11.189496096522356, + "grad_norm": 2.0994327068328857, + "learning_rate": 8.88153300212917e-05, + "loss": 0.008805645257234573, + "step": 78830 + }, + { + "epoch": 11.190915542938255, + "grad_norm": 7.246284484863281, + "learning_rate": 8.88139105748758e-05, + "loss": 0.041193538904190065, + "step": 78840 + }, + { + "epoch": 11.192334989354151, + "grad_norm": 3.468059778213501, + "learning_rate": 8.881249112845991e-05, + "loss": 0.029273611307144166, + "step": 78850 + }, + { + "epoch": 11.19375443577005, + "grad_norm": 11.098454475402832, + 
"learning_rate": 8.881107168204401e-05, + "loss": 0.039662298560142514, + "step": 78860 + }, + { + "epoch": 11.195173882185948, + "grad_norm": 11.667226791381836, + "learning_rate": 8.88096522356281e-05, + "loss": 0.043052345514297485, + "step": 78870 + }, + { + "epoch": 11.196593328601844, + "grad_norm": 0.6782079339027405, + "learning_rate": 8.88082327892122e-05, + "loss": 0.03640368580818176, + "step": 78880 + }, + { + "epoch": 11.198012775017743, + "grad_norm": 3.632553815841675, + "learning_rate": 8.880681334279631e-05, + "loss": 0.011744405329227447, + "step": 78890 + }, + { + "epoch": 11.199432221433641, + "grad_norm": 0.102350153028965, + "learning_rate": 8.880539389638041e-05, + "loss": 0.036199426651000975, + "step": 78900 + }, + { + "epoch": 11.20085166784954, + "grad_norm": 0.1365147978067398, + "learning_rate": 8.880397444996452e-05, + "loss": 0.024364030361175536, + "step": 78910 + }, + { + "epoch": 11.202271114265436, + "grad_norm": 0.05941876396536827, + "learning_rate": 8.880255500354862e-05, + "loss": 0.02454724460840225, + "step": 78920 + }, + { + "epoch": 11.203690560681334, + "grad_norm": 2.1816346645355225, + "learning_rate": 8.880113555713272e-05, + "loss": 0.017804595828056335, + "step": 78930 + }, + { + "epoch": 11.205110007097232, + "grad_norm": 0.7614143490791321, + "learning_rate": 8.879971611071683e-05, + "loss": 0.008576758205890656, + "step": 78940 + }, + { + "epoch": 11.206529453513129, + "grad_norm": 0.6037997007369995, + "learning_rate": 8.879829666430093e-05, + "loss": 0.044572693109512326, + "step": 78950 + }, + { + "epoch": 11.207948899929027, + "grad_norm": 0.44765669107437134, + "learning_rate": 8.879687721788504e-05, + "loss": 0.07759880423545837, + "step": 78960 + }, + { + "epoch": 11.209368346344926, + "grad_norm": 3.6659607887268066, + "learning_rate": 8.879545777146912e-05, + "loss": 0.045743897557258606, + "step": 78970 + }, + { + "epoch": 11.210787792760824, + "grad_norm": 12.800979614257812, + "learning_rate": 
8.879403832505323e-05, + "loss": 0.03383589088916779, + "step": 78980 + }, + { + "epoch": 11.21220723917672, + "grad_norm": 0.5182911157608032, + "learning_rate": 8.879261887863733e-05, + "loss": 0.020882833003997802, + "step": 78990 + }, + { + "epoch": 11.213626685592619, + "grad_norm": 3.3220176696777344, + "learning_rate": 8.879119943222144e-05, + "loss": 0.010641470551490784, + "step": 79000 + }, + { + "epoch": 11.213626685592619, + "eval_accuracy": 0.9826413174794939, + "eval_loss": 0.06146110221743584, + "eval_runtime": 34.5996, + "eval_samples_per_second": 454.542, + "eval_steps_per_second": 14.22, + "step": 79000 + }, + { + "epoch": 11.215046132008517, + "grad_norm": 0.5559946298599243, + "learning_rate": 8.878977998580554e-05, + "loss": 0.032634681463241576, + "step": 79010 + }, + { + "epoch": 11.216465578424414, + "grad_norm": 1.8470778465270996, + "learning_rate": 8.878836053938965e-05, + "loss": 0.04523923397064209, + "step": 79020 + }, + { + "epoch": 11.217885024840312, + "grad_norm": 3.1267638206481934, + "learning_rate": 8.878694109297375e-05, + "loss": 0.04637001752853394, + "step": 79030 + }, + { + "epoch": 11.21930447125621, + "grad_norm": 8.157069206237793, + "learning_rate": 8.878552164655784e-05, + "loss": 0.016615946590900422, + "step": 79040 + }, + { + "epoch": 11.220723917672109, + "grad_norm": 0.06618022173643112, + "learning_rate": 8.878410220014195e-05, + "loss": 0.07373695969581603, + "step": 79050 + }, + { + "epoch": 11.222143364088005, + "grad_norm": 0.6748453378677368, + "learning_rate": 8.878268275372605e-05, + "loss": 0.05090689063072205, + "step": 79060 + }, + { + "epoch": 11.223562810503903, + "grad_norm": 0.33752214908599854, + "learning_rate": 8.878126330731016e-05, + "loss": 0.016496339440345766, + "step": 79070 + }, + { + "epoch": 11.224982256919802, + "grad_norm": 1.0742039680480957, + "learning_rate": 8.877984386089425e-05, + "loss": 0.035834723711013795, + "step": 79080 + }, + { + "epoch": 11.2264017033357, + "grad_norm": 
1.6405409574508667, + "learning_rate": 8.877842441447836e-05, + "loss": 0.014355912804603577, + "step": 79090 + }, + { + "epoch": 11.227821149751597, + "grad_norm": 2.370652437210083, + "learning_rate": 8.877700496806245e-05, + "loss": 0.02448154389858246, + "step": 79100 + }, + { + "epoch": 11.229240596167495, + "grad_norm": 0.8242597579956055, + "learning_rate": 8.877558552164657e-05, + "loss": 0.04506869912147522, + "step": 79110 + }, + { + "epoch": 11.230660042583393, + "grad_norm": 2.0276906490325928, + "learning_rate": 8.877416607523066e-05, + "loss": 0.05213066339492798, + "step": 79120 + }, + { + "epoch": 11.23207948899929, + "grad_norm": 2.8450264930725098, + "learning_rate": 8.877274662881476e-05, + "loss": 0.020429591834545135, + "step": 79130 + }, + { + "epoch": 11.233498935415188, + "grad_norm": 0.10804232209920883, + "learning_rate": 8.877132718239887e-05, + "loss": 0.022500227391719817, + "step": 79140 + }, + { + "epoch": 11.234918381831086, + "grad_norm": 7.79222297668457, + "learning_rate": 8.876990773598297e-05, + "loss": 0.07455472350120544, + "step": 79150 + }, + { + "epoch": 11.236337828246985, + "grad_norm": 13.84041690826416, + "learning_rate": 8.876848828956708e-05, + "loss": 0.043375393748283385, + "step": 79160 + }, + { + "epoch": 11.237757274662881, + "grad_norm": 0.08527079224586487, + "learning_rate": 8.876706884315118e-05, + "loss": 0.04165436625480652, + "step": 79170 + }, + { + "epoch": 11.23917672107878, + "grad_norm": 0.046399861574172974, + "learning_rate": 8.876564939673527e-05, + "loss": 0.043243306875228885, + "step": 79180 + }, + { + "epoch": 11.240596167494678, + "grad_norm": 0.0947565957903862, + "learning_rate": 8.876422995031937e-05, + "loss": 0.011657755821943283, + "step": 79190 + }, + { + "epoch": 11.242015613910574, + "grad_norm": 0.4945715367794037, + "learning_rate": 8.876281050390348e-05, + "loss": 0.026895123720169067, + "step": 79200 + }, + { + "epoch": 11.243435060326473, + "grad_norm": 0.6884742975234985, + 
"learning_rate": 8.876139105748758e-05, + "loss": 0.019331425428390503, + "step": 79210 + }, + { + "epoch": 11.24485450674237, + "grad_norm": 3.9027488231658936, + "learning_rate": 8.875997161107169e-05, + "loss": 0.00956488996744156, + "step": 79220 + }, + { + "epoch": 11.24627395315827, + "grad_norm": 0.17311543226242065, + "learning_rate": 8.875855216465579e-05, + "loss": 0.03562336564064026, + "step": 79230 + }, + { + "epoch": 11.247693399574166, + "grad_norm": 0.2896651029586792, + "learning_rate": 8.875713271823989e-05, + "loss": 0.013322600722312927, + "step": 79240 + }, + { + "epoch": 11.249112845990064, + "grad_norm": 0.5805850625038147, + "learning_rate": 8.8755713271824e-05, + "loss": 0.017601439356803895, + "step": 79250 + }, + { + "epoch": 11.250532292405962, + "grad_norm": 0.29199790954589844, + "learning_rate": 8.87542938254081e-05, + "loss": 0.032927751541137695, + "step": 79260 + }, + { + "epoch": 11.251951738821859, + "grad_norm": 6.623978614807129, + "learning_rate": 8.87528743789922e-05, + "loss": 0.02440749704837799, + "step": 79270 + }, + { + "epoch": 11.253371185237757, + "grad_norm": 2.0030710697174072, + "learning_rate": 8.875145493257629e-05, + "loss": 0.010758772492408752, + "step": 79280 + }, + { + "epoch": 11.254790631653655, + "grad_norm": 8.13338565826416, + "learning_rate": 8.87500354861604e-05, + "loss": 0.01573694348335266, + "step": 79290 + }, + { + "epoch": 11.256210078069554, + "grad_norm": 0.5603359937667847, + "learning_rate": 8.87486160397445e-05, + "loss": 0.0071515366435050964, + "step": 79300 + }, + { + "epoch": 11.25762952448545, + "grad_norm": 0.025435080751776695, + "learning_rate": 8.874719659332861e-05, + "loss": 0.05067628026008606, + "step": 79310 + }, + { + "epoch": 11.259048970901349, + "grad_norm": 0.007434059400111437, + "learning_rate": 8.874577714691272e-05, + "loss": 0.009691541641950607, + "step": 79320 + }, + { + "epoch": 11.260468417317247, + "grad_norm": 0.15233097970485687, + "learning_rate": 
8.87443577004968e-05, + "loss": 0.008891449868679046, + "step": 79330 + }, + { + "epoch": 11.261887863733143, + "grad_norm": 0.013036555610597134, + "learning_rate": 8.874293825408091e-05, + "loss": 0.01533883363008499, + "step": 79340 + }, + { + "epoch": 11.263307310149042, + "grad_norm": 2.5614285469055176, + "learning_rate": 8.874151880766501e-05, + "loss": 0.007508398592472076, + "step": 79350 + }, + { + "epoch": 11.26472675656494, + "grad_norm": 0.06737040728330612, + "learning_rate": 8.874009936124912e-05, + "loss": 0.032192906737327574, + "step": 79360 + }, + { + "epoch": 11.266146202980838, + "grad_norm": 0.9063697457313538, + "learning_rate": 8.873867991483322e-05, + "loss": 0.01731267273426056, + "step": 79370 + }, + { + "epoch": 11.267565649396735, + "grad_norm": 0.378262996673584, + "learning_rate": 8.873726046841733e-05, + "loss": 0.001695651188492775, + "step": 79380 + }, + { + "epoch": 11.268985095812633, + "grad_norm": 11.412947654724121, + "learning_rate": 8.873584102200141e-05, + "loss": 0.054120153188705444, + "step": 79390 + }, + { + "epoch": 11.270404542228531, + "grad_norm": 1.0254113674163818, + "learning_rate": 8.873442157558552e-05, + "loss": 0.002478482201695442, + "step": 79400 + }, + { + "epoch": 11.271823988644428, + "grad_norm": 0.222798153758049, + "learning_rate": 8.873300212916964e-05, + "loss": 0.011906647682189941, + "step": 79410 + }, + { + "epoch": 11.273243435060326, + "grad_norm": 0.8719398379325867, + "learning_rate": 8.873158268275373e-05, + "loss": 0.0305912584066391, + "step": 79420 + }, + { + "epoch": 11.274662881476225, + "grad_norm": 0.04235215485095978, + "learning_rate": 8.873016323633784e-05, + "loss": 0.030136144161224364, + "step": 79430 + }, + { + "epoch": 11.276082327892123, + "grad_norm": 6.246129035949707, + "learning_rate": 8.872874378992193e-05, + "loss": 0.03775702714920044, + "step": 79440 + }, + { + "epoch": 11.27750177430802, + "grad_norm": 0.19702669978141785, + "learning_rate": 8.872732434350604e-05, + 
"loss": 0.04544393420219421, + "step": 79450 + }, + { + "epoch": 11.278921220723918, + "grad_norm": 1.5803076028823853, + "learning_rate": 8.872590489709014e-05, + "loss": 0.01723621040582657, + "step": 79460 + }, + { + "epoch": 11.280340667139816, + "grad_norm": 0.902417778968811, + "learning_rate": 8.872448545067425e-05, + "loss": 0.009972374886274338, + "step": 79470 + }, + { + "epoch": 11.281760113555713, + "grad_norm": 0.10034406930208206, + "learning_rate": 8.872306600425834e-05, + "loss": 0.02509826123714447, + "step": 79480 + }, + { + "epoch": 11.283179559971611, + "grad_norm": 1.11604905128479, + "learning_rate": 8.872164655784244e-05, + "loss": 0.020654731988906862, + "step": 79490 + }, + { + "epoch": 11.28459900638751, + "grad_norm": 0.7076891660690308, + "learning_rate": 8.872022711142655e-05, + "loss": 0.01148422509431839, + "step": 79500 + }, + { + "epoch": 11.28459900638751, + "eval_accuracy": 0.9832771666560692, + "eval_loss": 0.05689075216650963, + "eval_runtime": 33.9551, + "eval_samples_per_second": 463.171, + "eval_steps_per_second": 14.49, + "step": 79500 + }, + { + "epoch": 11.286018452803408, + "grad_norm": 1.7435786724090576, + "learning_rate": 8.871880766501065e-05, + "loss": 0.025782716274261475, + "step": 79510 + }, + { + "epoch": 11.287437899219304, + "grad_norm": 0.5907371044158936, + "learning_rate": 8.871738821859476e-05, + "loss": 0.019414816796779633, + "step": 79520 + }, + { + "epoch": 11.288857345635202, + "grad_norm": 0.14381571114063263, + "learning_rate": 8.871596877217886e-05, + "loss": 0.05670464038848877, + "step": 79530 + }, + { + "epoch": 11.2902767920511, + "grad_norm": 6.07039737701416, + "learning_rate": 8.871469127040454e-05, + "loss": 0.0341320812702179, + "step": 79540 + }, + { + "epoch": 11.291696238466997, + "grad_norm": 1.4728301763534546, + "learning_rate": 8.871327182398865e-05, + "loss": 0.019729208946228028, + "step": 79550 + }, + { + "epoch": 11.293115684882896, + "grad_norm": 8.61385726928711, + 
"learning_rate": 8.871185237757275e-05, + "loss": 0.03510661125183105, + "step": 79560 + }, + { + "epoch": 11.294535131298794, + "grad_norm": 0.5472472906112671, + "learning_rate": 8.871043293115685e-05, + "loss": 0.008604159206151962, + "step": 79570 + }, + { + "epoch": 11.295954577714692, + "grad_norm": 1.1503679752349854, + "learning_rate": 8.870901348474096e-05, + "loss": 0.03208954930305481, + "step": 79580 + }, + { + "epoch": 11.297374024130589, + "grad_norm": 0.0036283473018556833, + "learning_rate": 8.870759403832506e-05, + "loss": 0.002127677947282791, + "step": 79590 + }, + { + "epoch": 11.298793470546487, + "grad_norm": 2.7503178119659424, + "learning_rate": 8.870617459190917e-05, + "loss": 0.008547821640968322, + "step": 79600 + }, + { + "epoch": 11.300212916962385, + "grad_norm": 0.21989195048809052, + "learning_rate": 8.870475514549325e-05, + "loss": 0.013670270144939423, + "step": 79610 + }, + { + "epoch": 11.301632363378282, + "grad_norm": 0.0767766609787941, + "learning_rate": 8.870333569907736e-05, + "loss": 0.03806124329566955, + "step": 79620 + }, + { + "epoch": 11.30305180979418, + "grad_norm": 0.07631035149097443, + "learning_rate": 8.870191625266146e-05, + "loss": 0.023271280527114867, + "step": 79630 + }, + { + "epoch": 11.304471256210078, + "grad_norm": 4.039665699005127, + "learning_rate": 8.870049680624557e-05, + "loss": 0.03466680645942688, + "step": 79640 + }, + { + "epoch": 11.305890702625977, + "grad_norm": 0.5322751998901367, + "learning_rate": 8.869907735982967e-05, + "loss": 0.021226316690444946, + "step": 79650 + }, + { + "epoch": 11.307310149041873, + "grad_norm": 0.2919057011604309, + "learning_rate": 8.869765791341377e-05, + "loss": 0.02747492790222168, + "step": 79660 + }, + { + "epoch": 11.308729595457772, + "grad_norm": 1.9962308406829834, + "learning_rate": 8.869623846699788e-05, + "loss": 0.022367829084396364, + "step": 79670 + }, + { + "epoch": 11.31014904187367, + "grad_norm": 0.6660625338554382, + "learning_rate": 
8.869481902058197e-05, + "loss": 0.032745882868766785, + "step": 79680 + }, + { + "epoch": 11.311568488289566, + "grad_norm": 0.26197269558906555, + "learning_rate": 8.869339957416609e-05, + "loss": 0.04385312795639038, + "step": 79690 + }, + { + "epoch": 11.312987934705465, + "grad_norm": 4.215728282928467, + "learning_rate": 8.869198012775018e-05, + "loss": 0.020120292901992798, + "step": 79700 + }, + { + "epoch": 11.314407381121363, + "grad_norm": 8.843235969543457, + "learning_rate": 8.86905606813343e-05, + "loss": 0.019011446833610536, + "step": 79710 + }, + { + "epoch": 11.315826827537261, + "grad_norm": 17.59849739074707, + "learning_rate": 8.868914123491838e-05, + "loss": 0.030032917857170105, + "step": 79720 + }, + { + "epoch": 11.317246273953158, + "grad_norm": 0.15840287506580353, + "learning_rate": 8.868772178850249e-05, + "loss": 0.023678889870643614, + "step": 79730 + }, + { + "epoch": 11.318665720369056, + "grad_norm": 0.020314080640673637, + "learning_rate": 8.868630234208659e-05, + "loss": 0.026168987154960632, + "step": 79740 + }, + { + "epoch": 11.320085166784954, + "grad_norm": 0.6125651001930237, + "learning_rate": 8.86848828956707e-05, + "loss": 0.023059825599193572, + "step": 79750 + }, + { + "epoch": 11.321504613200851, + "grad_norm": 15.21810245513916, + "learning_rate": 8.86834634492548e-05, + "loss": 0.03762706518173218, + "step": 79760 + }, + { + "epoch": 11.32292405961675, + "grad_norm": 5.474844455718994, + "learning_rate": 8.868204400283889e-05, + "loss": 0.049691683053970336, + "step": 79770 + }, + { + "epoch": 11.324343506032648, + "grad_norm": 1.5685131549835205, + "learning_rate": 8.8680624556423e-05, + "loss": 0.012304575741291046, + "step": 79780 + }, + { + "epoch": 11.325762952448546, + "grad_norm": 1.084864616394043, + "learning_rate": 8.86792051100071e-05, + "loss": 0.031174218654632567, + "step": 79790 + }, + { + "epoch": 11.327182398864442, + "grad_norm": 8.637118339538574, + "learning_rate": 8.867778566359121e-05, + 
"loss": 0.01682147979736328, + "step": 79800 + }, + { + "epoch": 11.32860184528034, + "grad_norm": 0.6938183307647705, + "learning_rate": 8.867636621717531e-05, + "loss": 0.0065284594893455505, + "step": 79810 + }, + { + "epoch": 11.330021291696239, + "grad_norm": 0.4806843101978302, + "learning_rate": 8.86749467707594e-05, + "loss": 0.04211854636669159, + "step": 79820 + }, + { + "epoch": 11.331440738112136, + "grad_norm": 3.1211087703704834, + "learning_rate": 8.86735273243435e-05, + "loss": 0.017286308109760284, + "step": 79830 + }, + { + "epoch": 11.332860184528034, + "grad_norm": 0.42606401443481445, + "learning_rate": 8.867210787792761e-05, + "loss": 0.012424388527870178, + "step": 79840 + }, + { + "epoch": 11.334279630943932, + "grad_norm": 0.31800413131713867, + "learning_rate": 8.867068843151171e-05, + "loss": 0.015106824040412904, + "step": 79850 + }, + { + "epoch": 11.33569907735983, + "grad_norm": 8.675800323486328, + "learning_rate": 8.866926898509582e-05, + "loss": 0.06075004935264587, + "step": 79860 + }, + { + "epoch": 11.337118523775727, + "grad_norm": 0.16496433317661285, + "learning_rate": 8.866784953867992e-05, + "loss": 0.011707325279712678, + "step": 79870 + }, + { + "epoch": 11.338537970191625, + "grad_norm": 0.20603810250759125, + "learning_rate": 8.866643009226402e-05, + "loss": 0.03473392426967621, + "step": 79880 + }, + { + "epoch": 11.339957416607524, + "grad_norm": 0.07010184228420258, + "learning_rate": 8.866501064584813e-05, + "loss": 0.022533835470676424, + "step": 79890 + }, + { + "epoch": 11.34137686302342, + "grad_norm": 1.6609989404678345, + "learning_rate": 8.866359119943222e-05, + "loss": 0.05829617381095886, + "step": 79900 + }, + { + "epoch": 11.342796309439318, + "grad_norm": 3.047590970993042, + "learning_rate": 8.866217175301634e-05, + "loss": 0.04038935005664825, + "step": 79910 + }, + { + "epoch": 11.344215755855217, + "grad_norm": 0.25923168659210205, + "learning_rate": 8.866075230660042e-05, + "loss": 
0.047234049439430235, + "step": 79920 + }, + { + "epoch": 11.345635202271115, + "grad_norm": 3.83129620552063, + "learning_rate": 8.865933286018453e-05, + "loss": 0.03659171462059021, + "step": 79930 + }, + { + "epoch": 11.347054648687012, + "grad_norm": 6.185966968536377, + "learning_rate": 8.865791341376863e-05, + "loss": 0.03016130328178406, + "step": 79940 + }, + { + "epoch": 11.34847409510291, + "grad_norm": 0.1661355197429657, + "learning_rate": 8.865649396735274e-05, + "loss": 0.016263149678707123, + "step": 79950 + }, + { + "epoch": 11.349893541518808, + "grad_norm": 0.04041367396712303, + "learning_rate": 8.865507452093684e-05, + "loss": 0.022622223198413848, + "step": 79960 + }, + { + "epoch": 11.351312987934705, + "grad_norm": 13.005172729492188, + "learning_rate": 8.865365507452093e-05, + "loss": 0.03981154561042786, + "step": 79970 + }, + { + "epoch": 11.352732434350603, + "grad_norm": 0.0274747796356678, + "learning_rate": 8.865223562810504e-05, + "loss": 0.02526825964450836, + "step": 79980 + }, + { + "epoch": 11.354151880766501, + "grad_norm": 5.506161212921143, + "learning_rate": 8.865081618168914e-05, + "loss": 0.013444554805755616, + "step": 79990 + }, + { + "epoch": 11.3555713271824, + "grad_norm": 2.2358124256134033, + "learning_rate": 8.864939673527325e-05, + "loss": 0.06353100538253784, + "step": 80000 + }, + { + "epoch": 11.3555713271824, + "eval_accuracy": 0.9811152794557131, + "eval_loss": 0.0651044249534607, + "eval_runtime": 33.7254, + "eval_samples_per_second": 466.325, + "eval_steps_per_second": 14.588, + "step": 80000 + }, + { + "epoch": 11.356990773598296, + "grad_norm": 11.643047332763672, + "learning_rate": 8.864797728885735e-05, + "loss": 0.034117665886878965, + "step": 80010 + }, + { + "epoch": 11.358410220014195, + "grad_norm": 0.7074921727180481, + "learning_rate": 8.864655784244145e-05, + "loss": 0.02698880434036255, + "step": 80020 + }, + { + "epoch": 11.359829666430093, + "grad_norm": 5.939971446990967, + "learning_rate": 
8.864513839602554e-05, + "loss": 0.050882387161254886, + "step": 80030 + }, + { + "epoch": 11.36124911284599, + "grad_norm": 6.972652912139893, + "learning_rate": 8.864371894960966e-05, + "loss": 0.0220290869474411, + "step": 80040 + }, + { + "epoch": 11.362668559261888, + "grad_norm": 0.12394507229328156, + "learning_rate": 8.864229950319375e-05, + "loss": 0.011210134625434876, + "step": 80050 + }, + { + "epoch": 11.364088005677786, + "grad_norm": 0.2570814788341522, + "learning_rate": 8.864088005677786e-05, + "loss": 0.021895354986190795, + "step": 80060 + }, + { + "epoch": 11.365507452093684, + "grad_norm": 1.4537698030471802, + "learning_rate": 8.863946061036198e-05, + "loss": 0.04261449277400971, + "step": 80070 + }, + { + "epoch": 11.36692689850958, + "grad_norm": 0.11611072719097137, + "learning_rate": 8.863804116394606e-05, + "loss": 0.0075361371040344235, + "step": 80080 + }, + { + "epoch": 11.36834634492548, + "grad_norm": 8.89481258392334, + "learning_rate": 8.863662171753017e-05, + "loss": 0.010879594087600707, + "step": 80090 + }, + { + "epoch": 11.369765791341377, + "grad_norm": 1.2125955820083618, + "learning_rate": 8.863520227111427e-05, + "loss": 0.009992797672748566, + "step": 80100 + }, + { + "epoch": 11.371185237757274, + "grad_norm": 0.028409961611032486, + "learning_rate": 8.863378282469838e-05, + "loss": 0.014760425686836243, + "step": 80110 + }, + { + "epoch": 11.372604684173172, + "grad_norm": 1.1265093088150024, + "learning_rate": 8.863236337828248e-05, + "loss": 0.01140914335846901, + "step": 80120 + }, + { + "epoch": 11.37402413058907, + "grad_norm": 0.033151112496852875, + "learning_rate": 8.863094393186657e-05, + "loss": 0.01173935979604721, + "step": 80130 + }, + { + "epoch": 11.375443577004969, + "grad_norm": 0.9151448011398315, + "learning_rate": 8.862952448545067e-05, + "loss": 0.011046409606933594, + "step": 80140 + }, + { + "epoch": 11.376863023420865, + "grad_norm": 0.5126664638519287, + "learning_rate": 8.862810503903478e-05, + 
"loss": 0.03162610232830047, + "step": 80150 + }, + { + "epoch": 11.378282469836764, + "grad_norm": 0.37595346570014954, + "learning_rate": 8.862668559261889e-05, + "loss": 0.04972442090511322, + "step": 80160 + }, + { + "epoch": 11.379701916252662, + "grad_norm": 0.044639065861701965, + "learning_rate": 8.862526614620299e-05, + "loss": 0.032768523693084715, + "step": 80170 + }, + { + "epoch": 11.381121362668559, + "grad_norm": 5.298313617706299, + "learning_rate": 8.862384669978709e-05, + "loss": 0.01692819893360138, + "step": 80180 + }, + { + "epoch": 11.382540809084457, + "grad_norm": 10.989527702331543, + "learning_rate": 8.862242725337118e-05, + "loss": 0.011271566897630692, + "step": 80190 + }, + { + "epoch": 11.383960255500355, + "grad_norm": 18.672773361206055, + "learning_rate": 8.86210078069553e-05, + "loss": 0.0333732008934021, + "step": 80200 + }, + { + "epoch": 11.385379701916253, + "grad_norm": 0.6371679902076721, + "learning_rate": 8.861958836053939e-05, + "loss": 0.03899551033973694, + "step": 80210 + }, + { + "epoch": 11.38679914833215, + "grad_norm": 0.4237102270126343, + "learning_rate": 8.86181689141235e-05, + "loss": 0.01527671068906784, + "step": 80220 + }, + { + "epoch": 11.388218594748048, + "grad_norm": 2.081489324569702, + "learning_rate": 8.861674946770759e-05, + "loss": 0.007568246126174927, + "step": 80230 + }, + { + "epoch": 11.389638041163947, + "grad_norm": 4.329316139221191, + "learning_rate": 8.86153300212917e-05, + "loss": 0.012238572537899017, + "step": 80240 + }, + { + "epoch": 11.391057487579843, + "grad_norm": 0.23751749098300934, + "learning_rate": 8.861391057487581e-05, + "loss": 0.023023539781570436, + "step": 80250 + }, + { + "epoch": 11.392476933995741, + "grad_norm": 3.3496580123901367, + "learning_rate": 8.86124911284599e-05, + "loss": 0.04121274352073669, + "step": 80260 + }, + { + "epoch": 11.39389638041164, + "grad_norm": 10.261673927307129, + "learning_rate": 8.861107168204402e-05, + "loss": 0.04002052247524261, + 
"step": 80270 + }, + { + "epoch": 11.395315826827538, + "grad_norm": 0.14845259487628937, + "learning_rate": 8.86096522356281e-05, + "loss": 0.028948825597763062, + "step": 80280 + }, + { + "epoch": 11.396735273243435, + "grad_norm": 2.195971965789795, + "learning_rate": 8.860823278921221e-05, + "loss": 0.030800750851631163, + "step": 80290 + }, + { + "epoch": 11.398154719659333, + "grad_norm": 1.8944286108016968, + "learning_rate": 8.860681334279631e-05, + "loss": 0.049839770793914794, + "step": 80300 + }, + { + "epoch": 11.399574166075231, + "grad_norm": 2.8817431926727295, + "learning_rate": 8.860539389638042e-05, + "loss": 0.03240306973457337, + "step": 80310 + }, + { + "epoch": 11.400993612491128, + "grad_norm": 13.485665321350098, + "learning_rate": 8.860397444996452e-05, + "loss": 0.03296520709991455, + "step": 80320 + }, + { + "epoch": 11.402413058907026, + "grad_norm": 0.11741983145475388, + "learning_rate": 8.860255500354862e-05, + "loss": 0.06958463191986083, + "step": 80330 + }, + { + "epoch": 11.403832505322924, + "grad_norm": 5.842444896697998, + "learning_rate": 8.860113555713273e-05, + "loss": 0.03531903624534607, + "step": 80340 + }, + { + "epoch": 11.405251951738823, + "grad_norm": 5.8268327713012695, + "learning_rate": 8.859971611071682e-05, + "loss": 0.03155616819858551, + "step": 80350 + }, + { + "epoch": 11.40667139815472, + "grad_norm": 0.37767449021339417, + "learning_rate": 8.859829666430093e-05, + "loss": 0.06218478083610535, + "step": 80360 + }, + { + "epoch": 11.408090844570618, + "grad_norm": 0.15035152435302734, + "learning_rate": 8.859687721788503e-05, + "loss": 0.028446558117866515, + "step": 80370 + }, + { + "epoch": 11.409510290986516, + "grad_norm": 11.59382152557373, + "learning_rate": 8.859545777146913e-05, + "loss": 0.026360827684402465, + "step": 80380 + }, + { + "epoch": 11.410929737402412, + "grad_norm": 2.5553624629974365, + "learning_rate": 8.859403832505323e-05, + "loss": 0.00743272751569748, + "step": 80390 + }, + { + 
"epoch": 11.41234918381831, + "grad_norm": 3.6909255981445312, + "learning_rate": 8.859261887863734e-05, + "loss": 0.015209051966667175, + "step": 80400 + }, + { + "epoch": 11.413768630234209, + "grad_norm": 0.004849972203373909, + "learning_rate": 8.859119943222143e-05, + "loss": 0.04423539340496063, + "step": 80410 + }, + { + "epoch": 11.415188076650107, + "grad_norm": 6.711785316467285, + "learning_rate": 8.858977998580555e-05, + "loss": 0.036108124256134036, + "step": 80420 + }, + { + "epoch": 11.416607523066004, + "grad_norm": 3.5907931327819824, + "learning_rate": 8.858836053938964e-05, + "loss": 0.035010167956352235, + "step": 80430 + }, + { + "epoch": 11.418026969481902, + "grad_norm": 3.7037017345428467, + "learning_rate": 8.858694109297374e-05, + "loss": 0.05554625988006592, + "step": 80440 + }, + { + "epoch": 11.4194464158978, + "grad_norm": 2.50538969039917, + "learning_rate": 8.858552164655785e-05, + "loss": 0.039789438247680664, + "step": 80450 + }, + { + "epoch": 11.420865862313697, + "grad_norm": 0.15071162581443787, + "learning_rate": 8.858410220014195e-05, + "loss": 0.030578255653381348, + "step": 80460 + }, + { + "epoch": 11.422285308729595, + "grad_norm": 8.107895851135254, + "learning_rate": 8.858268275372606e-05, + "loss": 0.06292965412139892, + "step": 80470 + }, + { + "epoch": 11.423704755145494, + "grad_norm": 4.886325836181641, + "learning_rate": 8.858126330731016e-05, + "loss": 0.05497379899024964, + "step": 80480 + }, + { + "epoch": 11.425124201561392, + "grad_norm": 0.04609977453947067, + "learning_rate": 8.857984386089425e-05, + "loss": 0.031700408458709715, + "step": 80490 + }, + { + "epoch": 11.426543647977288, + "grad_norm": 5.431364059448242, + "learning_rate": 8.857842441447835e-05, + "loss": 0.0376334011554718, + "step": 80500 + }, + { + "epoch": 11.426543647977288, + "eval_accuracy": 0.981941883385261, + "eval_loss": 0.06595078110694885, + "eval_runtime": 33.4544, + "eval_samples_per_second": 470.102, + "eval_steps_per_second": 
14.707, + "step": 80500 + }, + { + "epoch": 11.427963094393187, + "grad_norm": 1.621141791343689, + "learning_rate": 8.857700496806246e-05, + "loss": 0.043695205450057985, + "step": 80510 + }, + { + "epoch": 11.429382540809085, + "grad_norm": 2.8783621788024902, + "learning_rate": 8.857558552164656e-05, + "loss": 0.01851654052734375, + "step": 80520 + }, + { + "epoch": 11.430801987224982, + "grad_norm": 1.7176443338394165, + "learning_rate": 8.857416607523067e-05, + "loss": 0.047656843066215517, + "step": 80530 + }, + { + "epoch": 11.43222143364088, + "grad_norm": 0.0061423284932971, + "learning_rate": 8.857274662881477e-05, + "loss": 0.06792023777961731, + "step": 80540 + }, + { + "epoch": 11.433640880056778, + "grad_norm": 0.18173955380916595, + "learning_rate": 8.857132718239887e-05, + "loss": 0.0309945285320282, + "step": 80550 + }, + { + "epoch": 11.435060326472676, + "grad_norm": 7.272887229919434, + "learning_rate": 8.856990773598298e-05, + "loss": 0.06236481070518494, + "step": 80560 + }, + { + "epoch": 11.436479772888573, + "grad_norm": 6.205739974975586, + "learning_rate": 8.856848828956707e-05, + "loss": 0.04723560214042664, + "step": 80570 + }, + { + "epoch": 11.437899219304471, + "grad_norm": 5.547316074371338, + "learning_rate": 8.856706884315119e-05, + "loss": 0.028773194551467894, + "step": 80580 + }, + { + "epoch": 11.43931866572037, + "grad_norm": 0.14390479028224945, + "learning_rate": 8.856564939673527e-05, + "loss": 0.01777784675359726, + "step": 80590 + }, + { + "epoch": 11.440738112136266, + "grad_norm": 0.3887863755226135, + "learning_rate": 8.856422995031938e-05, + "loss": 0.040037679672241214, + "step": 80600 + }, + { + "epoch": 11.442157558552164, + "grad_norm": 4.7135748863220215, + "learning_rate": 8.856281050390348e-05, + "loss": 0.01964379847049713, + "step": 80610 + }, + { + "epoch": 11.443577004968063, + "grad_norm": 4.665762424468994, + "learning_rate": 8.856139105748759e-05, + "loss": 0.06711235046386718, + "step": 80620 + }, + { 
+ "epoch": 11.444996451383961, + "grad_norm": 7.4186248779296875, + "learning_rate": 8.855997161107169e-05, + "loss": 0.04224056899547577, + "step": 80630 + }, + { + "epoch": 11.446415897799858, + "grad_norm": 1.482706069946289, + "learning_rate": 8.855855216465578e-05, + "loss": 0.02505524754524231, + "step": 80640 + }, + { + "epoch": 11.447835344215756, + "grad_norm": 0.08347708731889725, + "learning_rate": 8.85571327182399e-05, + "loss": 0.033928149938583375, + "step": 80650 + }, + { + "epoch": 11.449254790631654, + "grad_norm": 1.5397976636886597, + "learning_rate": 8.855571327182399e-05, + "loss": 0.0054555382579565045, + "step": 80660 + }, + { + "epoch": 11.45067423704755, + "grad_norm": 0.7289921045303345, + "learning_rate": 8.85542938254081e-05, + "loss": 0.03309193551540375, + "step": 80670 + }, + { + "epoch": 11.452093683463449, + "grad_norm": 4.813479900360107, + "learning_rate": 8.85528743789922e-05, + "loss": 0.09139240980148315, + "step": 80680 + }, + { + "epoch": 11.453513129879347, + "grad_norm": 10.499625205993652, + "learning_rate": 8.85514549325763e-05, + "loss": 0.040348267555236815, + "step": 80690 + }, + { + "epoch": 11.454932576295246, + "grad_norm": 1.5510531663894653, + "learning_rate": 8.85500354861604e-05, + "loss": 0.04443258345127106, + "step": 80700 + }, + { + "epoch": 11.456352022711142, + "grad_norm": 5.092698097229004, + "learning_rate": 8.85486160397445e-05, + "loss": 0.05160248279571533, + "step": 80710 + }, + { + "epoch": 11.45777146912704, + "grad_norm": 6.294242858886719, + "learning_rate": 8.85471965933286e-05, + "loss": 0.02735318839550018, + "step": 80720 + }, + { + "epoch": 11.459190915542939, + "grad_norm": 0.024253351613879204, + "learning_rate": 8.854577714691271e-05, + "loss": 0.03131797909736633, + "step": 80730 + }, + { + "epoch": 11.460610361958835, + "grad_norm": 4.466399192810059, + "learning_rate": 8.854435770049681e-05, + "loss": 0.03240303099155426, + "step": 80740 + }, + { + "epoch": 11.462029808374734, + 
"grad_norm": 0.31110042333602905, + "learning_rate": 8.854293825408091e-05, + "loss": 0.03598732352256775, + "step": 80750 + }, + { + "epoch": 11.463449254790632, + "grad_norm": 0.1252836138010025, + "learning_rate": 8.854151880766502e-05, + "loss": 0.03730970919132233, + "step": 80760 + }, + { + "epoch": 11.46486870120653, + "grad_norm": 0.3589157462120056, + "learning_rate": 8.854009936124912e-05, + "loss": 0.00836140513420105, + "step": 80770 + }, + { + "epoch": 11.466288147622427, + "grad_norm": 0.5227982401847839, + "learning_rate": 8.853867991483323e-05, + "loss": 0.021766206622123717, + "step": 80780 + }, + { + "epoch": 11.467707594038325, + "grad_norm": 0.19833236932754517, + "learning_rate": 8.853726046841733e-05, + "loss": 0.00756232813000679, + "step": 80790 + }, + { + "epoch": 11.469127040454223, + "grad_norm": 0.6320415139198303, + "learning_rate": 8.853584102200142e-05, + "loss": 0.058226609230041505, + "step": 80800 + }, + { + "epoch": 11.47054648687012, + "grad_norm": 1.0651955604553223, + "learning_rate": 8.853442157558552e-05, + "loss": 0.04778565466403961, + "step": 80810 + }, + { + "epoch": 11.471965933286018, + "grad_norm": 5.570420265197754, + "learning_rate": 8.853300212916963e-05, + "loss": 0.0344421774148941, + "step": 80820 + }, + { + "epoch": 11.473385379701917, + "grad_norm": 0.3369903266429901, + "learning_rate": 8.853158268275373e-05, + "loss": 0.03234374821186066, + "step": 80830 + }, + { + "epoch": 11.474804826117815, + "grad_norm": 0.012835043482482433, + "learning_rate": 8.853016323633784e-05, + "loss": 0.02318093180656433, + "step": 80840 + }, + { + "epoch": 11.476224272533711, + "grad_norm": 1.5728015899658203, + "learning_rate": 8.852874378992194e-05, + "loss": 0.023737967014312744, + "step": 80850 + }, + { + "epoch": 11.47764371894961, + "grad_norm": 0.11739411950111389, + "learning_rate": 8.852732434350603e-05, + "loss": 0.05025478005409241, + "step": 80860 + }, + { + "epoch": 11.479063165365508, + "grad_norm": 
0.9397228360176086, + "learning_rate": 8.852590489709014e-05, + "loss": 0.04425714910030365, + "step": 80870 + }, + { + "epoch": 11.480482611781405, + "grad_norm": 6.053242206573486, + "learning_rate": 8.852448545067424e-05, + "loss": 0.07350468635559082, + "step": 80880 + }, + { + "epoch": 11.481902058197303, + "grad_norm": 0.6401218175888062, + "learning_rate": 8.852306600425835e-05, + "loss": 0.04295220375061035, + "step": 80890 + }, + { + "epoch": 11.483321504613201, + "grad_norm": 1.2638099193572998, + "learning_rate": 8.852164655784244e-05, + "loss": 0.06553231477737427, + "step": 80900 + }, + { + "epoch": 11.4847409510291, + "grad_norm": 0.16520237922668457, + "learning_rate": 8.852022711142655e-05, + "loss": 0.022380702197551727, + "step": 80910 + }, + { + "epoch": 11.486160397444996, + "grad_norm": 0.17308127880096436, + "learning_rate": 8.851880766501065e-05, + "loss": 0.03869318962097168, + "step": 80920 + }, + { + "epoch": 11.487579843860894, + "grad_norm": 4.885634422302246, + "learning_rate": 8.851738821859476e-05, + "loss": 0.037957805395126346, + "step": 80930 + }, + { + "epoch": 11.488999290276793, + "grad_norm": 7.335200786590576, + "learning_rate": 8.851596877217885e-05, + "loss": 0.030105233192443848, + "step": 80940 + }, + { + "epoch": 11.490418736692689, + "grad_norm": 5.766997814178467, + "learning_rate": 8.851454932576295e-05, + "loss": 0.04862077534198761, + "step": 80950 + }, + { + "epoch": 11.491838183108587, + "grad_norm": 0.8015795350074768, + "learning_rate": 8.851312987934706e-05, + "loss": 0.032071438431739804, + "step": 80960 + }, + { + "epoch": 11.493257629524486, + "grad_norm": 2.2064619064331055, + "learning_rate": 8.851171043293116e-05, + "loss": 0.02999696135520935, + "step": 80970 + }, + { + "epoch": 11.494677075940384, + "grad_norm": 8.180752754211426, + "learning_rate": 8.851029098651527e-05, + "loss": 0.032754439115524295, + "step": 80980 + }, + { + "epoch": 11.49609652235628, + "grad_norm": 8.906973838806152, + 
"learning_rate": 8.850887154009937e-05, + "loss": 0.014296115934848785, + "step": 80990 + }, + { + "epoch": 11.497515968772179, + "grad_norm": 4.91297721862793, + "learning_rate": 8.850745209368346e-05, + "loss": 0.022449912130832674, + "step": 81000 + }, + { + "epoch": 11.497515968772179, + "eval_accuracy": 0.9774909391492338, + "eval_loss": 0.07550395280122757, + "eval_runtime": 32.707, + "eval_samples_per_second": 480.844, + "eval_steps_per_second": 15.043, + "step": 81000 + }, + { + "epoch": 11.498935415188077, + "grad_norm": 1.925844430923462, + "learning_rate": 8.850603264726756e-05, + "loss": 0.10111439228057861, + "step": 81010 + }, + { + "epoch": 11.500354861603974, + "grad_norm": 0.3102894425392151, + "learning_rate": 8.850461320085167e-05, + "loss": 0.02656802237033844, + "step": 81020 + }, + { + "epoch": 11.501774308019872, + "grad_norm": 0.8817775845527649, + "learning_rate": 8.850319375443577e-05, + "loss": 0.009132151305675507, + "step": 81030 + }, + { + "epoch": 11.50319375443577, + "grad_norm": 7.1372809410095215, + "learning_rate": 8.850177430801988e-05, + "loss": 0.023169676959514617, + "step": 81040 + }, + { + "epoch": 11.504613200851669, + "grad_norm": 0.17022785544395447, + "learning_rate": 8.850035486160398e-05, + "loss": 0.010735802352428436, + "step": 81050 + }, + { + "epoch": 11.506032647267565, + "grad_norm": 0.09135201573371887, + "learning_rate": 8.849893541518808e-05, + "loss": 0.02542368769645691, + "step": 81060 + }, + { + "epoch": 11.507452093683463, + "grad_norm": 1.0567549467086792, + "learning_rate": 8.849751596877219e-05, + "loss": 0.02125450372695923, + "step": 81070 + }, + { + "epoch": 11.508871540099362, + "grad_norm": 0.1266782283782959, + "learning_rate": 8.849609652235628e-05, + "loss": 0.015872204303741456, + "step": 81080 + }, + { + "epoch": 11.510290986515258, + "grad_norm": 6.861550807952881, + "learning_rate": 8.84946770759404e-05, + "loss": 0.032646551728248596, + "step": 81090 + }, + { + "epoch": 11.511710432931157, 
+ "grad_norm": 0.036920398473739624, + "learning_rate": 8.849325762952448e-05, + "loss": 0.01691252291202545, + "step": 81100 + }, + { + "epoch": 11.513129879347055, + "grad_norm": 0.8152676224708557, + "learning_rate": 8.849183818310859e-05, + "loss": 0.011697210371494293, + "step": 81110 + }, + { + "epoch": 11.514549325762953, + "grad_norm": 7.33953332901001, + "learning_rate": 8.849041873669269e-05, + "loss": 0.02431018799543381, + "step": 81120 + }, + { + "epoch": 11.51596877217885, + "grad_norm": 0.4651520550251007, + "learning_rate": 8.84889992902768e-05, + "loss": 0.023129910230636597, + "step": 81130 + }, + { + "epoch": 11.517388218594748, + "grad_norm": 0.4674939811229706, + "learning_rate": 8.84875798438609e-05, + "loss": 0.03972046971321106, + "step": 81140 + }, + { + "epoch": 11.518807665010646, + "grad_norm": 8.952187538146973, + "learning_rate": 8.8486160397445e-05, + "loss": 0.05522555708885193, + "step": 81150 + }, + { + "epoch": 11.520227111426543, + "grad_norm": 0.10738738626241684, + "learning_rate": 8.84847409510291e-05, + "loss": 0.012743067741394044, + "step": 81160 + }, + { + "epoch": 11.521646557842441, + "grad_norm": 0.03591417148709297, + "learning_rate": 8.84833215046132e-05, + "loss": 0.021634458005428313, + "step": 81170 + }, + { + "epoch": 11.52306600425834, + "grad_norm": 0.01823955960571766, + "learning_rate": 8.848190205819731e-05, + "loss": 0.014959985017776489, + "step": 81180 + }, + { + "epoch": 11.524485450674238, + "grad_norm": 0.3199457824230194, + "learning_rate": 8.848048261178141e-05, + "loss": 0.020771077275276183, + "step": 81190 + }, + { + "epoch": 11.525904897090134, + "grad_norm": 3.928478956222534, + "learning_rate": 8.847906316536552e-05, + "loss": 0.04103337228298187, + "step": 81200 + }, + { + "epoch": 11.527324343506033, + "grad_norm": 0.09946906566619873, + "learning_rate": 8.84776437189496e-05, + "loss": 0.03343590497970581, + "step": 81210 + }, + { + "epoch": 11.528743789921931, + "grad_norm": 
0.22660058736801147, + "learning_rate": 8.847622427253372e-05, + "loss": 0.021284933388233184, + "step": 81220 + }, + { + "epoch": 11.530163236337827, + "grad_norm": 0.03823212906718254, + "learning_rate": 8.847480482611781e-05, + "loss": 0.03749181628227234, + "step": 81230 + }, + { + "epoch": 11.531582682753726, + "grad_norm": 2.1197917461395264, + "learning_rate": 8.847338537970192e-05, + "loss": 0.05521925687789917, + "step": 81240 + }, + { + "epoch": 11.533002129169624, + "grad_norm": 12.29832649230957, + "learning_rate": 8.847196593328602e-05, + "loss": 0.036048969626426695, + "step": 81250 + }, + { + "epoch": 11.534421575585522, + "grad_norm": 0.5513697266578674, + "learning_rate": 8.847054648687012e-05, + "loss": 0.013157932460308075, + "step": 81260 + }, + { + "epoch": 11.535841022001419, + "grad_norm": 9.3153715133667, + "learning_rate": 8.846912704045423e-05, + "loss": 0.07485218048095703, + "step": 81270 + }, + { + "epoch": 11.537260468417317, + "grad_norm": 9.140734672546387, + "learning_rate": 8.846770759403833e-05, + "loss": 0.07693561911582947, + "step": 81280 + }, + { + "epoch": 11.538679914833216, + "grad_norm": 0.37796759605407715, + "learning_rate": 8.846628814762244e-05, + "loss": 0.06537792086601257, + "step": 81290 + }, + { + "epoch": 11.540099361249112, + "grad_norm": 7.079742431640625, + "learning_rate": 8.846486870120654e-05, + "loss": 0.025105878710746765, + "step": 81300 + }, + { + "epoch": 11.54151880766501, + "grad_norm": 0.3768748939037323, + "learning_rate": 8.846344925479063e-05, + "loss": 0.054667305946350095, + "step": 81310 + }, + { + "epoch": 11.542938254080909, + "grad_norm": 4.9976019859313965, + "learning_rate": 8.846202980837473e-05, + "loss": 0.06892080307006836, + "step": 81320 + }, + { + "epoch": 11.544357700496807, + "grad_norm": 0.3432539701461792, + "learning_rate": 8.846061036195884e-05, + "loss": 0.01103949099779129, + "step": 81330 + }, + { + "epoch": 11.545777146912704, + "grad_norm": 2.6538197994232178, + 
"learning_rate": 8.845919091554294e-05, + "loss": 0.01662140041589737, + "step": 81340 + }, + { + "epoch": 11.547196593328602, + "grad_norm": 0.8760688304901123, + "learning_rate": 8.845777146912705e-05, + "loss": 0.03641084134578705, + "step": 81350 + }, + { + "epoch": 11.5486160397445, + "grad_norm": 5.520683765411377, + "learning_rate": 8.845635202271115e-05, + "loss": 0.01766434609889984, + "step": 81360 + }, + { + "epoch": 11.550035486160397, + "grad_norm": 1.819739818572998, + "learning_rate": 8.845493257629524e-05, + "loss": 0.010377876460552216, + "step": 81370 + }, + { + "epoch": 11.551454932576295, + "grad_norm": 1.5828450918197632, + "learning_rate": 8.845351312987935e-05, + "loss": 0.02364148199558258, + "step": 81380 + }, + { + "epoch": 11.552874378992193, + "grad_norm": 0.13533614575862885, + "learning_rate": 8.845209368346345e-05, + "loss": 0.027328240871429443, + "step": 81390 + }, + { + "epoch": 11.554293825408092, + "grad_norm": 12.122629165649414, + "learning_rate": 8.845067423704756e-05, + "loss": 0.04072291851043701, + "step": 81400 + }, + { + "epoch": 11.555713271823988, + "grad_norm": 10.578282356262207, + "learning_rate": 8.844925479063165e-05, + "loss": 0.009069137275218964, + "step": 81410 + }, + { + "epoch": 11.557132718239886, + "grad_norm": 5.478339672088623, + "learning_rate": 8.844783534421576e-05, + "loss": 0.02398311048746109, + "step": 81420 + }, + { + "epoch": 11.558552164655785, + "grad_norm": 2.616616725921631, + "learning_rate": 8.844641589779986e-05, + "loss": 0.0067227624356746675, + "step": 81430 + }, + { + "epoch": 11.559971611071681, + "grad_norm": 0.09911337494850159, + "learning_rate": 8.844499645138397e-05, + "loss": 0.02870553731918335, + "step": 81440 + }, + { + "epoch": 11.56139105748758, + "grad_norm": 0.030385294929146767, + "learning_rate": 8.844357700496806e-05, + "loss": 0.011173336207866669, + "step": 81450 + }, + { + "epoch": 11.562810503903478, + "grad_norm": 0.06865376234054565, + "learning_rate": 
8.844215755855217e-05, + "loss": 0.025823640823364257, + "step": 81460 + }, + { + "epoch": 11.564229950319376, + "grad_norm": 3.7182939052581787, + "learning_rate": 8.844073811213627e-05, + "loss": 0.05339401960372925, + "step": 81470 + }, + { + "epoch": 11.565649396735273, + "grad_norm": 1.3230258226394653, + "learning_rate": 8.843931866572037e-05, + "loss": 0.018890395760536194, + "step": 81480 + }, + { + "epoch": 11.567068843151171, + "grad_norm": 3.3779211044311523, + "learning_rate": 8.843789921930448e-05, + "loss": 0.00660952776670456, + "step": 81490 + }, + { + "epoch": 11.56848828956707, + "grad_norm": 0.18795613944530487, + "learning_rate": 8.843647977288858e-05, + "loss": 0.019702652096748353, + "step": 81500 + }, + { + "epoch": 11.56848828956707, + "eval_accuracy": 0.9763464106313983, + "eval_loss": 0.08224768191576004, + "eval_runtime": 33.7438, + "eval_samples_per_second": 466.071, + "eval_steps_per_second": 14.58, + "step": 81500 + }, + { + "epoch": 11.569907735982966, + "grad_norm": 0.34426751732826233, + "learning_rate": 8.843506032647269e-05, + "loss": 0.03004116714000702, + "step": 81510 + }, + { + "epoch": 11.571327182398864, + "grad_norm": 0.4941084086894989, + "learning_rate": 8.843364088005677e-05, + "loss": 0.022238391637802123, + "step": 81520 + }, + { + "epoch": 11.572746628814762, + "grad_norm": 11.108928680419922, + "learning_rate": 8.843222143364088e-05, + "loss": 0.056462281942367555, + "step": 81530 + }, + { + "epoch": 11.57416607523066, + "grad_norm": 0.16970746219158173, + "learning_rate": 8.843080198722498e-05, + "loss": 0.021629197895526885, + "step": 81540 + }, + { + "epoch": 11.575585521646557, + "grad_norm": 0.09552083164453506, + "learning_rate": 8.842952448545068e-05, + "loss": 0.028470611572265624, + "step": 81550 + }, + { + "epoch": 11.577004968062456, + "grad_norm": 0.07120678573846817, + "learning_rate": 8.842810503903478e-05, + "loss": 0.02539893686771393, + "step": 81560 + }, + { + "epoch": 11.578424414478354, + 
"grad_norm": 2.5239343643188477, + "learning_rate": 8.842668559261889e-05, + "loss": 0.020925018191337585, + "step": 81570 + }, + { + "epoch": 11.57984386089425, + "grad_norm": 10.473538398742676, + "learning_rate": 8.842526614620298e-05, + "loss": 0.06097403764724731, + "step": 81580 + }, + { + "epoch": 11.581263307310149, + "grad_norm": 7.29902982711792, + "learning_rate": 8.842384669978708e-05, + "loss": 0.03347648978233338, + "step": 81590 + }, + { + "epoch": 11.582682753726047, + "grad_norm": 0.9426771402359009, + "learning_rate": 8.842242725337119e-05, + "loss": 0.02707792818546295, + "step": 81600 + }, + { + "epoch": 11.584102200141945, + "grad_norm": 1.5464801788330078, + "learning_rate": 8.842100780695529e-05, + "loss": 0.029200682044029237, + "step": 81610 + }, + { + "epoch": 11.585521646557842, + "grad_norm": 1.6313835382461548, + "learning_rate": 8.84195883605394e-05, + "loss": 0.036450433731079104, + "step": 81620 + }, + { + "epoch": 11.58694109297374, + "grad_norm": 4.770724296569824, + "learning_rate": 8.84181689141235e-05, + "loss": 0.048863834142684935, + "step": 81630 + }, + { + "epoch": 11.588360539389639, + "grad_norm": 0.085438571870327, + "learning_rate": 8.84167494677076e-05, + "loss": 0.042553862929344176, + "step": 81640 + }, + { + "epoch": 11.589779985805535, + "grad_norm": 0.8502871990203857, + "learning_rate": 8.84153300212917e-05, + "loss": 0.03484118282794953, + "step": 81650 + }, + { + "epoch": 11.591199432221433, + "grad_norm": 0.8441861271858215, + "learning_rate": 8.84139105748758e-05, + "loss": 0.030901208519935608, + "step": 81660 + }, + { + "epoch": 11.592618878637332, + "grad_norm": 6.2987871170043945, + "learning_rate": 8.84124911284599e-05, + "loss": 0.03970724940299988, + "step": 81670 + }, + { + "epoch": 11.59403832505323, + "grad_norm": 4.7662129402160645, + "learning_rate": 8.841107168204401e-05, + "loss": 0.01229364275932312, + "step": 81680 + }, + { + "epoch": 11.595457771469126, + "grad_norm": 1.5325942039489746, + 
"learning_rate": 8.840965223562811e-05, + "loss": 0.03952722549438477, + "step": 81690 + }, + { + "epoch": 11.596877217885025, + "grad_norm": 2.52839994430542, + "learning_rate": 8.840823278921221e-05, + "loss": 0.02153339087963104, + "step": 81700 + }, + { + "epoch": 11.598296664300923, + "grad_norm": 0.5760644674301147, + "learning_rate": 8.840681334279632e-05, + "loss": 0.0679667830467224, + "step": 81710 + }, + { + "epoch": 11.59971611071682, + "grad_norm": 5.679330825805664, + "learning_rate": 8.840539389638042e-05, + "loss": 0.042963171005249025, + "step": 81720 + }, + { + "epoch": 11.601135557132718, + "grad_norm": 0.11442669481039047, + "learning_rate": 8.840397444996453e-05, + "loss": 0.012864866852760315, + "step": 81730 + }, + { + "epoch": 11.602555003548616, + "grad_norm": 9.763188362121582, + "learning_rate": 8.840255500354861e-05, + "loss": 0.03386241793632507, + "step": 81740 + }, + { + "epoch": 11.603974449964515, + "grad_norm": 1.1611683368682861, + "learning_rate": 8.840113555713272e-05, + "loss": 0.029190993309020995, + "step": 81750 + }, + { + "epoch": 11.605393896380411, + "grad_norm": 0.17620205879211426, + "learning_rate": 8.839971611071682e-05, + "loss": 0.03684349656105042, + "step": 81760 + }, + { + "epoch": 11.60681334279631, + "grad_norm": 1.8045251369476318, + "learning_rate": 8.839829666430093e-05, + "loss": 0.050387269258499144, + "step": 81770 + }, + { + "epoch": 11.608232789212208, + "grad_norm": 0.3409873843193054, + "learning_rate": 8.839687721788503e-05, + "loss": 0.022375202178955077, + "step": 81780 + }, + { + "epoch": 11.609652235628104, + "grad_norm": 1.041752815246582, + "learning_rate": 8.839545777146914e-05, + "loss": 0.021577396988868715, + "step": 81790 + }, + { + "epoch": 11.611071682044003, + "grad_norm": 0.13355006277561188, + "learning_rate": 8.839403832505324e-05, + "loss": 0.015710872411727906, + "step": 81800 + }, + { + "epoch": 11.6124911284599, + "grad_norm": 0.040545929223299026, + "learning_rate": 
8.839261887863733e-05, + "loss": 0.062452536821365354, + "step": 81810 + }, + { + "epoch": 11.6139105748758, + "grad_norm": 0.624416708946228, + "learning_rate": 8.839119943222144e-05, + "loss": 0.005656691268086433, + "step": 81820 + }, + { + "epoch": 11.615330021291696, + "grad_norm": 4.042696952819824, + "learning_rate": 8.838977998580554e-05, + "loss": 0.011597123742103577, + "step": 81830 + }, + { + "epoch": 11.616749467707594, + "grad_norm": 0.06087561324238777, + "learning_rate": 8.838836053938965e-05, + "loss": 0.031940600275993346, + "step": 81840 + }, + { + "epoch": 11.618168914123492, + "grad_norm": 10.522989273071289, + "learning_rate": 8.838694109297374e-05, + "loss": 0.019474413990974427, + "step": 81850 + }, + { + "epoch": 11.619588360539389, + "grad_norm": 0.5727241039276123, + "learning_rate": 8.838552164655785e-05, + "loss": 0.0022082440555095673, + "step": 81860 + }, + { + "epoch": 11.621007806955287, + "grad_norm": 0.49038177728652954, + "learning_rate": 8.838410220014194e-05, + "loss": 0.012241747230291367, + "step": 81870 + }, + { + "epoch": 11.622427253371185, + "grad_norm": 1.801270842552185, + "learning_rate": 8.838268275372606e-05, + "loss": 0.011108881235122681, + "step": 81880 + }, + { + "epoch": 11.623846699787084, + "grad_norm": 1.095993161201477, + "learning_rate": 8.838126330731015e-05, + "loss": 0.007870152592658997, + "step": 81890 + }, + { + "epoch": 11.62526614620298, + "grad_norm": 0.8447113633155823, + "learning_rate": 8.837984386089425e-05, + "loss": 0.01694517433643341, + "step": 81900 + }, + { + "epoch": 11.626685592618879, + "grad_norm": 0.6038371324539185, + "learning_rate": 8.837842441447836e-05, + "loss": 0.0033034585416316987, + "step": 81910 + }, + { + "epoch": 11.628105039034777, + "grad_norm": 0.1463266760110855, + "learning_rate": 8.837700496806246e-05, + "loss": 0.03691132664680481, + "step": 81920 + }, + { + "epoch": 11.629524485450673, + "grad_norm": 0.01949205808341503, + "learning_rate": 8.837558552164657e-05, 
+ "loss": 0.030641201138496398, + "step": 81930 + }, + { + "epoch": 11.630943931866572, + "grad_norm": 0.01865651085972786, + "learning_rate": 8.837416607523067e-05, + "loss": 0.06486660838127137, + "step": 81940 + }, + { + "epoch": 11.63236337828247, + "grad_norm": 0.44764140248298645, + "learning_rate": 8.837274662881476e-05, + "loss": 0.03592503070831299, + "step": 81950 + }, + { + "epoch": 11.633782824698368, + "grad_norm": 6.017378807067871, + "learning_rate": 8.837132718239886e-05, + "loss": 0.025605541467666627, + "step": 81960 + }, + { + "epoch": 11.635202271114265, + "grad_norm": 4.395421981811523, + "learning_rate": 8.836990773598297e-05, + "loss": 0.03425087332725525, + "step": 81970 + }, + { + "epoch": 11.636621717530163, + "grad_norm": 3.474912405014038, + "learning_rate": 8.836848828956707e-05, + "loss": 0.007524078339338302, + "step": 81980 + }, + { + "epoch": 11.638041163946061, + "grad_norm": 7.506344318389893, + "learning_rate": 8.836706884315118e-05, + "loss": 0.02990333139896393, + "step": 81990 + }, + { + "epoch": 11.639460610361958, + "grad_norm": 1.1138578653335571, + "learning_rate": 8.836564939673528e-05, + "loss": 0.02096046507358551, + "step": 82000 + }, + { + "epoch": 11.639460610361958, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.04104387387633324, + "eval_runtime": 34.0548, + "eval_samples_per_second": 461.814, + "eval_steps_per_second": 14.447, + "step": 82000 + }, + { + "epoch": 11.640880056777856, + "grad_norm": 0.22225643694400787, + "learning_rate": 8.836422995031938e-05, + "loss": 0.02522968649864197, + "step": 82010 + }, + { + "epoch": 11.642299503193755, + "grad_norm": 5.2800140380859375, + "learning_rate": 8.836281050390349e-05, + "loss": 0.055390971899032596, + "step": 82020 + }, + { + "epoch": 11.643718949609653, + "grad_norm": 0.09609229862689972, + "learning_rate": 8.836139105748758e-05, + "loss": 0.012254738807678222, + "step": 82030 + }, + { + "epoch": 11.64513839602555, + "grad_norm": 2.6045665740966797, + 
"learning_rate": 8.83599716110717e-05, + "loss": 0.059720170497894284, + "step": 82040 + }, + { + "epoch": 11.646557842441448, + "grad_norm": 0.6698890924453735, + "learning_rate": 8.835855216465578e-05, + "loss": 0.022242045402526854, + "step": 82050 + }, + { + "epoch": 11.647977288857346, + "grad_norm": 0.3261481523513794, + "learning_rate": 8.835713271823989e-05, + "loss": 0.01315658837556839, + "step": 82060 + }, + { + "epoch": 11.649396735273243, + "grad_norm": 15.95113468170166, + "learning_rate": 8.835571327182399e-05, + "loss": 0.04952957332134247, + "step": 82070 + }, + { + "epoch": 11.650816181689141, + "grad_norm": 0.34951213002204895, + "learning_rate": 8.83542938254081e-05, + "loss": 0.028381425142288207, + "step": 82080 + }, + { + "epoch": 11.65223562810504, + "grad_norm": 0.33600348234176636, + "learning_rate": 8.83528743789922e-05, + "loss": 0.03296903371810913, + "step": 82090 + }, + { + "epoch": 11.653655074520938, + "grad_norm": 0.44056016206741333, + "learning_rate": 8.835145493257629e-05, + "loss": 0.00819028839468956, + "step": 82100 + }, + { + "epoch": 11.655074520936834, + "grad_norm": 0.9378697276115417, + "learning_rate": 8.83500354861604e-05, + "loss": 0.015890008211135863, + "step": 82110 + }, + { + "epoch": 11.656493967352732, + "grad_norm": 7.173929214477539, + "learning_rate": 8.83486160397445e-05, + "loss": 0.016848720610141754, + "step": 82120 + }, + { + "epoch": 11.65791341376863, + "grad_norm": 0.40014129877090454, + "learning_rate": 8.834719659332861e-05, + "loss": 0.03617895543575287, + "step": 82130 + }, + { + "epoch": 11.659332860184527, + "grad_norm": 2.238469362258911, + "learning_rate": 8.834577714691271e-05, + "loss": 0.028064021468162538, + "step": 82140 + }, + { + "epoch": 11.660752306600425, + "grad_norm": 3.2392852306365967, + "learning_rate": 8.834435770049682e-05, + "loss": 0.026870760321617126, + "step": 82150 + }, + { + "epoch": 11.662171753016324, + "grad_norm": 0.2407623678445816, + "learning_rate": 
8.83429382540809e-05, + "loss": 0.028642752766609193, + "step": 82160 + }, + { + "epoch": 11.663591199432222, + "grad_norm": 0.5530653595924377, + "learning_rate": 8.834151880766501e-05, + "loss": 0.0160242035984993, + "step": 82170 + }, + { + "epoch": 11.665010645848119, + "grad_norm": 5.290857315063477, + "learning_rate": 8.834009936124911e-05, + "loss": 0.05292970538139343, + "step": 82180 + }, + { + "epoch": 11.666430092264017, + "grad_norm": 0.8904526233673096, + "learning_rate": 8.833867991483322e-05, + "loss": 0.03078848421573639, + "step": 82190 + }, + { + "epoch": 11.667849538679915, + "grad_norm": 0.06986336410045624, + "learning_rate": 8.833726046841732e-05, + "loss": 0.017547787725925447, + "step": 82200 + }, + { + "epoch": 11.669268985095812, + "grad_norm": 9.345512390136719, + "learning_rate": 8.833584102200142e-05, + "loss": 0.0370622456073761, + "step": 82210 + }, + { + "epoch": 11.67068843151171, + "grad_norm": 2.8718032836914062, + "learning_rate": 8.833442157558553e-05, + "loss": 0.016177193820476533, + "step": 82220 + }, + { + "epoch": 11.672107877927608, + "grad_norm": 0.4981312155723572, + "learning_rate": 8.833300212916963e-05, + "loss": 0.020555400848388673, + "step": 82230 + }, + { + "epoch": 11.673527324343507, + "grad_norm": 5.187900543212891, + "learning_rate": 8.833158268275374e-05, + "loss": 0.03203676640987396, + "step": 82240 + }, + { + "epoch": 11.674946770759403, + "grad_norm": 0.5993130803108215, + "learning_rate": 8.833016323633783e-05, + "loss": 0.03552861511707306, + "step": 82250 + }, + { + "epoch": 11.676366217175302, + "grad_norm": 1.570359468460083, + "learning_rate": 8.832874378992193e-05, + "loss": 0.0360751211643219, + "step": 82260 + }, + { + "epoch": 11.6777856635912, + "grad_norm": 0.01849873550236225, + "learning_rate": 8.832732434350603e-05, + "loss": 0.07889996767044068, + "step": 82270 + }, + { + "epoch": 11.679205110007096, + "grad_norm": 4.044131755828857, + "learning_rate": 8.832590489709014e-05, + "loss": 
0.03975077271461487, + "step": 82280 + }, + { + "epoch": 11.680624556422995, + "grad_norm": 0.21400457620620728, + "learning_rate": 8.832448545067424e-05, + "loss": 0.028557685017585755, + "step": 82290 + }, + { + "epoch": 11.682044002838893, + "grad_norm": 2.2320234775543213, + "learning_rate": 8.832306600425835e-05, + "loss": 0.032077842950820924, + "step": 82300 + }, + { + "epoch": 11.683463449254791, + "grad_norm": 6.41154670715332, + "learning_rate": 8.832164655784245e-05, + "loss": 0.04047545492649078, + "step": 82310 + }, + { + "epoch": 11.684882895670688, + "grad_norm": 7.783628463745117, + "learning_rate": 8.832022711142654e-05, + "loss": 0.05753174424171448, + "step": 82320 + }, + { + "epoch": 11.686302342086586, + "grad_norm": 0.2691332697868347, + "learning_rate": 8.831880766501065e-05, + "loss": 0.033080264925956726, + "step": 82330 + }, + { + "epoch": 11.687721788502484, + "grad_norm": 0.08865190297365189, + "learning_rate": 8.831738821859475e-05, + "loss": 0.0328124463558197, + "step": 82340 + }, + { + "epoch": 11.689141234918381, + "grad_norm": 0.06198609247803688, + "learning_rate": 8.831596877217886e-05, + "loss": 0.032992076873779294, + "step": 82350 + }, + { + "epoch": 11.69056068133428, + "grad_norm": 1.0192947387695312, + "learning_rate": 8.831454932576295e-05, + "loss": 0.02295834571123123, + "step": 82360 + }, + { + "epoch": 11.691980127750178, + "grad_norm": 4.939817428588867, + "learning_rate": 8.831312987934706e-05, + "loss": 0.04259155392646789, + "step": 82370 + }, + { + "epoch": 11.693399574166076, + "grad_norm": 0.011144721880555153, + "learning_rate": 8.831171043293115e-05, + "loss": 0.011710671335458755, + "step": 82380 + }, + { + "epoch": 11.694819020581972, + "grad_norm": 17.873878479003906, + "learning_rate": 8.831029098651527e-05, + "loss": 0.03337354063987732, + "step": 82390 + }, + { + "epoch": 11.69623846699787, + "grad_norm": 0.6569585800170898, + "learning_rate": 8.830887154009938e-05, + "loss": 0.02401721030473709, + 
"step": 82400 + }, + { + "epoch": 11.697657913413769, + "grad_norm": 0.019921043887734413, + "learning_rate": 8.830745209368346e-05, + "loss": 0.046464449167251586, + "step": 82410 + }, + { + "epoch": 11.699077359829666, + "grad_norm": 1.3580775260925293, + "learning_rate": 8.830603264726757e-05, + "loss": 0.022325956821441652, + "step": 82420 + }, + { + "epoch": 11.700496806245564, + "grad_norm": 10.863458633422852, + "learning_rate": 8.830461320085167e-05, + "loss": 0.02395484447479248, + "step": 82430 + }, + { + "epoch": 11.701916252661462, + "grad_norm": 5.869367599487305, + "learning_rate": 8.830319375443578e-05, + "loss": 0.043967399001121524, + "step": 82440 + }, + { + "epoch": 11.70333569907736, + "grad_norm": 0.5498532056808472, + "learning_rate": 8.830177430801988e-05, + "loss": 0.02715992331504822, + "step": 82450 + }, + { + "epoch": 11.704755145493257, + "grad_norm": 0.06902461498975754, + "learning_rate": 8.830035486160397e-05, + "loss": 0.004207936301827431, + "step": 82460 + }, + { + "epoch": 11.706174591909155, + "grad_norm": 3.9329452514648438, + "learning_rate": 8.829893541518807e-05, + "loss": 0.031944799423217776, + "step": 82470 + }, + { + "epoch": 11.707594038325054, + "grad_norm": 4.1940107345581055, + "learning_rate": 8.829751596877218e-05, + "loss": 0.022048255801200865, + "step": 82480 + }, + { + "epoch": 11.70901348474095, + "grad_norm": 0.15101581811904907, + "learning_rate": 8.829609652235629e-05, + "loss": 0.021107760071754456, + "step": 82490 + }, + { + "epoch": 11.710432931156848, + "grad_norm": 0.04076530411839485, + "learning_rate": 8.829467707594039e-05, + "loss": 0.009322655946016311, + "step": 82500 + }, + { + "epoch": 11.710432931156848, + "eval_accuracy": 0.9867107522095759, + "eval_loss": 0.0509491004049778, + "eval_runtime": 33.6657, + "eval_samples_per_second": 467.153, + "eval_steps_per_second": 14.614, + "step": 82500 + }, + { + "epoch": 11.711852377572747, + "grad_norm": 12.230842590332031, + "learning_rate": 
8.82932576295245e-05, + "loss": 0.03886268138885498, + "step": 82510 + }, + { + "epoch": 11.713271823988645, + "grad_norm": 9.446490287780762, + "learning_rate": 8.829183818310859e-05, + "loss": 0.01960558593273163, + "step": 82520 + }, + { + "epoch": 11.714691270404542, + "grad_norm": 0.2041768878698349, + "learning_rate": 8.82904187366927e-05, + "loss": 0.028508198261260987, + "step": 82530 + }, + { + "epoch": 11.71611071682044, + "grad_norm": 1.630429983139038, + "learning_rate": 8.82889992902768e-05, + "loss": 0.03760896623134613, + "step": 82540 + }, + { + "epoch": 11.717530163236338, + "grad_norm": 12.508625030517578, + "learning_rate": 8.82875798438609e-05, + "loss": 0.02034131735563278, + "step": 82550 + }, + { + "epoch": 11.718949609652235, + "grad_norm": 3.5060606002807617, + "learning_rate": 8.8286160397445e-05, + "loss": 0.03896633386611938, + "step": 82560 + }, + { + "epoch": 11.720369056068133, + "grad_norm": 0.034930501133203506, + "learning_rate": 8.82847409510291e-05, + "loss": 0.04598281383514404, + "step": 82570 + }, + { + "epoch": 11.721788502484031, + "grad_norm": 3.448381185531616, + "learning_rate": 8.828332150461321e-05, + "loss": 0.036952057480812074, + "step": 82580 + }, + { + "epoch": 11.72320794889993, + "grad_norm": 0.0042781527154147625, + "learning_rate": 8.828190205819731e-05, + "loss": 0.05195838212966919, + "step": 82590 + }, + { + "epoch": 11.724627395315826, + "grad_norm": 3.1850550174713135, + "learning_rate": 8.828048261178142e-05, + "loss": 0.036840036511421204, + "step": 82600 + }, + { + "epoch": 11.726046841731725, + "grad_norm": 6.152586460113525, + "learning_rate": 8.827906316536552e-05, + "loss": 0.018132734298706054, + "step": 82610 + }, + { + "epoch": 11.727466288147623, + "grad_norm": 0.34917500615119934, + "learning_rate": 8.827764371894961e-05, + "loss": 0.035174581408500674, + "step": 82620 + }, + { + "epoch": 11.72888573456352, + "grad_norm": 4.656780242919922, + "learning_rate": 8.827622427253371e-05, + "loss": 
0.04415706992149353, + "step": 82630 + }, + { + "epoch": 11.730305180979418, + "grad_norm": 0.2968284785747528, + "learning_rate": 8.827480482611782e-05, + "loss": 0.045205461978912356, + "step": 82640 + }, + { + "epoch": 11.731724627395316, + "grad_norm": 2.32602596282959, + "learning_rate": 8.827338537970192e-05, + "loss": 0.008173373341560364, + "step": 82650 + }, + { + "epoch": 11.733144073811214, + "grad_norm": 0.49746090173721313, + "learning_rate": 8.827196593328603e-05, + "loss": 0.043424248695373535, + "step": 82660 + }, + { + "epoch": 11.73456352022711, + "grad_norm": 7.075344085693359, + "learning_rate": 8.827054648687011e-05, + "loss": 0.018824505805969238, + "step": 82670 + }, + { + "epoch": 11.735982966643009, + "grad_norm": 0.27628007531166077, + "learning_rate": 8.826912704045422e-05, + "loss": 0.035436037182807925, + "step": 82680 + }, + { + "epoch": 11.737402413058907, + "grad_norm": 3.1124513149261475, + "learning_rate": 8.826770759403834e-05, + "loss": 0.0338789314031601, + "step": 82690 + }, + { + "epoch": 11.738821859474804, + "grad_norm": 0.8742479085922241, + "learning_rate": 8.826628814762243e-05, + "loss": 0.022074520587921143, + "step": 82700 + }, + { + "epoch": 11.740241305890702, + "grad_norm": 4.0626373291015625, + "learning_rate": 8.826486870120654e-05, + "loss": 0.03586540520191193, + "step": 82710 + }, + { + "epoch": 11.7416607523066, + "grad_norm": 0.11810215562582016, + "learning_rate": 8.826344925479063e-05, + "loss": 0.030027234554290773, + "step": 82720 + }, + { + "epoch": 11.743080198722499, + "grad_norm": 0.0518762581050396, + "learning_rate": 8.826202980837474e-05, + "loss": 0.023009565472602845, + "step": 82730 + }, + { + "epoch": 11.744499645138395, + "grad_norm": 0.10839356482028961, + "learning_rate": 8.826061036195884e-05, + "loss": 0.05535359382629394, + "step": 82740 + }, + { + "epoch": 11.745919091554294, + "grad_norm": 0.44333168864250183, + "learning_rate": 8.825919091554295e-05, + "loss": 0.015498198568820953, + 
"step": 82750 + }, + { + "epoch": 11.747338537970192, + "grad_norm": 2.660344123840332, + "learning_rate": 8.825777146912704e-05, + "loss": 0.01779383420944214, + "step": 82760 + }, + { + "epoch": 11.748757984386089, + "grad_norm": 0.39260396361351013, + "learning_rate": 8.825635202271114e-05, + "loss": 0.013475912809371948, + "step": 82770 + }, + { + "epoch": 11.750177430801987, + "grad_norm": 9.97494125366211, + "learning_rate": 8.825493257629525e-05, + "loss": 0.028389009833335876, + "step": 82780 + }, + { + "epoch": 11.751596877217885, + "grad_norm": 0.7091060280799866, + "learning_rate": 8.825351312987935e-05, + "loss": 0.014123716950416565, + "step": 82790 + }, + { + "epoch": 11.753016323633783, + "grad_norm": 0.9680839776992798, + "learning_rate": 8.825209368346346e-05, + "loss": 0.05506055355072022, + "step": 82800 + }, + { + "epoch": 11.75443577004968, + "grad_norm": 0.2549281716346741, + "learning_rate": 8.825067423704756e-05, + "loss": 0.03636166155338287, + "step": 82810 + }, + { + "epoch": 11.755855216465578, + "grad_norm": 3.956082344055176, + "learning_rate": 8.824925479063166e-05, + "loss": 0.012689772248268127, + "step": 82820 + }, + { + "epoch": 11.757274662881477, + "grad_norm": 0.12777554988861084, + "learning_rate": 8.824783534421575e-05, + "loss": 0.014213849604129792, + "step": 82830 + }, + { + "epoch": 11.758694109297373, + "grad_norm": 10.15988826751709, + "learning_rate": 8.824641589779986e-05, + "loss": 0.027217572927474974, + "step": 82840 + }, + { + "epoch": 11.760113555713271, + "grad_norm": 4.8726582527160645, + "learning_rate": 8.824499645138396e-05, + "loss": 0.005710848420858383, + "step": 82850 + }, + { + "epoch": 11.76153300212917, + "grad_norm": 0.3747629225254059, + "learning_rate": 8.824357700496807e-05, + "loss": 0.031127306818962096, + "step": 82860 + }, + { + "epoch": 11.762952448545068, + "grad_norm": 12.532156944274902, + "learning_rate": 8.824215755855217e-05, + "loss": 0.03176976442337036, + "step": 82870 + }, + { + 
"epoch": 11.764371894960965, + "grad_norm": 1.8649965524673462, + "learning_rate": 8.824073811213627e-05, + "loss": 0.030580675601959227, + "step": 82880 + }, + { + "epoch": 11.765791341376863, + "grad_norm": 0.3033379316329956, + "learning_rate": 8.823931866572038e-05, + "loss": 0.013270074129104614, + "step": 82890 + }, + { + "epoch": 11.767210787792761, + "grad_norm": 0.012772593647241592, + "learning_rate": 8.823789921930448e-05, + "loss": 0.015141868591308593, + "step": 82900 + }, + { + "epoch": 11.768630234208658, + "grad_norm": 2.6985650062561035, + "learning_rate": 8.823647977288859e-05, + "loss": 0.005396616086363793, + "step": 82910 + }, + { + "epoch": 11.770049680624556, + "grad_norm": 1.3272356986999512, + "learning_rate": 8.823506032647268e-05, + "loss": 0.010387630760669708, + "step": 82920 + }, + { + "epoch": 11.771469127040454, + "grad_norm": 5.779232501983643, + "learning_rate": 8.823364088005678e-05, + "loss": 0.030734294652938844, + "step": 82930 + }, + { + "epoch": 11.772888573456353, + "grad_norm": 1.0797303915023804, + "learning_rate": 8.823222143364088e-05, + "loss": 0.03054520785808563, + "step": 82940 + }, + { + "epoch": 11.77430801987225, + "grad_norm": 3.3726065158843994, + "learning_rate": 8.823080198722499e-05, + "loss": 0.0510441780090332, + "step": 82950 + }, + { + "epoch": 11.775727466288147, + "grad_norm": 4.178657054901123, + "learning_rate": 8.822938254080909e-05, + "loss": 0.018844333291053773, + "step": 82960 + }, + { + "epoch": 11.777146912704046, + "grad_norm": 3.163480281829834, + "learning_rate": 8.82279630943932e-05, + "loss": 0.030176666378974915, + "step": 82970 + }, + { + "epoch": 11.778566359119942, + "grad_norm": 0.13545754551887512, + "learning_rate": 8.82265436479773e-05, + "loss": 0.06503672003746033, + "step": 82980 + }, + { + "epoch": 11.77998580553584, + "grad_norm": 0.09676958620548248, + "learning_rate": 8.822512420156139e-05, + "loss": 0.04054889976978302, + "step": 82990 + }, + { + "epoch": 
11.781405251951739, + "grad_norm": 0.08766447007656097, + "learning_rate": 8.82237047551455e-05, + "loss": 0.05703636407852173, + "step": 83000 + }, + { + "epoch": 11.781405251951739, + "eval_accuracy": 0.9832771666560692, + "eval_loss": 0.055854376405477524, + "eval_runtime": 33.2418, + "eval_samples_per_second": 473.11, + "eval_steps_per_second": 14.801, + "step": 83000 + }, + { + "epoch": 11.782824698367637, + "grad_norm": 4.778905868530273, + "learning_rate": 8.82222853087296e-05, + "loss": 0.016084206104278565, + "step": 83010 + }, + { + "epoch": 11.784244144783534, + "grad_norm": 1.264276146888733, + "learning_rate": 8.822086586231371e-05, + "loss": 0.024041858315467835, + "step": 83020 + }, + { + "epoch": 11.785663591199432, + "grad_norm": 8.326826095581055, + "learning_rate": 8.82194464158978e-05, + "loss": 0.019390736520290375, + "step": 83030 + }, + { + "epoch": 11.78708303761533, + "grad_norm": 0.09791865199804306, + "learning_rate": 8.82180269694819e-05, + "loss": 0.04285701811313629, + "step": 83040 + }, + { + "epoch": 11.788502484031227, + "grad_norm": 0.4148581027984619, + "learning_rate": 8.8216607523066e-05, + "loss": 0.01719527244567871, + "step": 83050 + }, + { + "epoch": 11.789921930447125, + "grad_norm": 2.2213292121887207, + "learning_rate": 8.821518807665011e-05, + "loss": 0.03687406778335571, + "step": 83060 + }, + { + "epoch": 11.791341376863024, + "grad_norm": 0.1824258267879486, + "learning_rate": 8.821376863023421e-05, + "loss": 0.027744564414024352, + "step": 83070 + }, + { + "epoch": 11.792760823278922, + "grad_norm": 0.2214859426021576, + "learning_rate": 8.821234918381831e-05, + "loss": 0.02254961133003235, + "step": 83080 + }, + { + "epoch": 11.794180269694818, + "grad_norm": 0.35346728563308716, + "learning_rate": 8.821092973740242e-05, + "loss": 0.011183008551597595, + "step": 83090 + }, + { + "epoch": 11.795599716110717, + "grad_norm": 5.680757522583008, + "learning_rate": 8.820951029098652e-05, + "loss": 0.03402817249298096, + 
"step": 83100 + }, + { + "epoch": 11.797019162526615, + "grad_norm": 0.602576494216919, + "learning_rate": 8.820809084457063e-05, + "loss": 0.03264551162719727, + "step": 83110 + }, + { + "epoch": 11.798438608942512, + "grad_norm": 2.3544020652770996, + "learning_rate": 8.820667139815473e-05, + "loss": 0.025781518220901488, + "step": 83120 + }, + { + "epoch": 11.79985805535841, + "grad_norm": 0.12306859344244003, + "learning_rate": 8.820525195173882e-05, + "loss": 0.028462356328964232, + "step": 83130 + }, + { + "epoch": 11.801277501774308, + "grad_norm": 0.7537848949432373, + "learning_rate": 8.820383250532292e-05, + "loss": 0.03587826788425445, + "step": 83140 + }, + { + "epoch": 11.802696948190206, + "grad_norm": 0.05533721670508385, + "learning_rate": 8.820241305890703e-05, + "loss": 0.028028786182403564, + "step": 83150 + }, + { + "epoch": 11.804116394606103, + "grad_norm": 0.06418702751398087, + "learning_rate": 8.820099361249113e-05, + "loss": 0.01934442073106766, + "step": 83160 + }, + { + "epoch": 11.805535841022001, + "grad_norm": 2.590029239654541, + "learning_rate": 8.819957416607524e-05, + "loss": 0.06280165910720825, + "step": 83170 + }, + { + "epoch": 11.8069552874379, + "grad_norm": 2.505331039428711, + "learning_rate": 8.819815471965934e-05, + "loss": 0.021633738279342653, + "step": 83180 + }, + { + "epoch": 11.808374733853796, + "grad_norm": 0.18088681995868683, + "learning_rate": 8.819673527324343e-05, + "loss": 0.0337184876203537, + "step": 83190 + }, + { + "epoch": 11.809794180269694, + "grad_norm": 0.8823683857917786, + "learning_rate": 8.819531582682755e-05, + "loss": 0.041201424598693845, + "step": 83200 + }, + { + "epoch": 11.811213626685593, + "grad_norm": 2.2337379455566406, + "learning_rate": 8.819389638041164e-05, + "loss": 0.05025644898414612, + "step": 83210 + }, + { + "epoch": 11.812633073101491, + "grad_norm": 1.0469427108764648, + "learning_rate": 8.819247693399575e-05, + "loss": 0.03725016713142395, + "step": 83220 + }, + { + 
"epoch": 11.814052519517388, + "grad_norm": 0.10811296850442886, + "learning_rate": 8.819105748757985e-05, + "loss": 0.020893281698226927, + "step": 83230 + }, + { + "epoch": 11.815471965933286, + "grad_norm": 1.9570878744125366, + "learning_rate": 8.818963804116395e-05, + "loss": 0.04508139491081238, + "step": 83240 + }, + { + "epoch": 11.816891412349184, + "grad_norm": 10.603166580200195, + "learning_rate": 8.818821859474805e-05, + "loss": 0.02110636681318283, + "step": 83250 + }, + { + "epoch": 11.81831085876508, + "grad_norm": 0.4695393443107605, + "learning_rate": 8.818679914833216e-05, + "loss": 0.0809195578098297, + "step": 83260 + }, + { + "epoch": 11.819730305180979, + "grad_norm": 1.1666871309280396, + "learning_rate": 8.818537970191625e-05, + "loss": 0.10807353258132935, + "step": 83270 + }, + { + "epoch": 11.821149751596877, + "grad_norm": 1.5496456623077393, + "learning_rate": 8.818396025550037e-05, + "loss": 0.006642992049455643, + "step": 83280 + }, + { + "epoch": 11.822569198012776, + "grad_norm": 1.6191848516464233, + "learning_rate": 8.818254080908446e-05, + "loss": 0.016630643606185914, + "step": 83290 + }, + { + "epoch": 11.823988644428672, + "grad_norm": 0.05443901568651199, + "learning_rate": 8.818112136266856e-05, + "loss": 0.026843801140785217, + "step": 83300 + }, + { + "epoch": 11.82540809084457, + "grad_norm": 0.26701775193214417, + "learning_rate": 8.817970191625267e-05, + "loss": 0.009992837160825729, + "step": 83310 + }, + { + "epoch": 11.826827537260469, + "grad_norm": 4.363466739654541, + "learning_rate": 8.817828246983677e-05, + "loss": 0.029452064633369447, + "step": 83320 + }, + { + "epoch": 11.828246983676365, + "grad_norm": 0.06600237637758255, + "learning_rate": 8.817686302342088e-05, + "loss": 0.045746609568595886, + "step": 83330 + }, + { + "epoch": 11.829666430092264, + "grad_norm": 4.926809787750244, + "learning_rate": 8.817544357700496e-05, + "loss": 0.016673028469085693, + "step": 83340 + }, + { + "epoch": 
11.831085876508162, + "grad_norm": 1.262709140777588, + "learning_rate": 8.817402413058907e-05, + "loss": 0.026791003346443177, + "step": 83350 + }, + { + "epoch": 11.83250532292406, + "grad_norm": 0.11825452744960785, + "learning_rate": 8.817260468417317e-05, + "loss": 0.038587138056755066, + "step": 83360 + }, + { + "epoch": 11.833924769339957, + "grad_norm": 0.49939799308776855, + "learning_rate": 8.817118523775728e-05, + "loss": 0.03778964877128601, + "step": 83370 + }, + { + "epoch": 11.835344215755855, + "grad_norm": 1.799819827079773, + "learning_rate": 8.816976579134138e-05, + "loss": 0.034172806143760684, + "step": 83380 + }, + { + "epoch": 11.836763662171753, + "grad_norm": 5.9821953773498535, + "learning_rate": 8.816834634492548e-05, + "loss": 0.03201870620250702, + "step": 83390 + }, + { + "epoch": 11.83818310858765, + "grad_norm": 8.615924835205078, + "learning_rate": 8.816692689850959e-05, + "loss": 0.049471884965896606, + "step": 83400 + }, + { + "epoch": 11.839602555003548, + "grad_norm": 0.6330484747886658, + "learning_rate": 8.816550745209369e-05, + "loss": 0.03654695153236389, + "step": 83410 + }, + { + "epoch": 11.841022001419446, + "grad_norm": 0.2800433337688446, + "learning_rate": 8.81640880056778e-05, + "loss": 0.012365755438804627, + "step": 83420 + }, + { + "epoch": 11.842441447835345, + "grad_norm": 4.388822078704834, + "learning_rate": 8.81626685592619e-05, + "loss": 0.03003677725791931, + "step": 83430 + }, + { + "epoch": 11.843860894251241, + "grad_norm": 9.12732219696045, + "learning_rate": 8.816124911284599e-05, + "loss": 0.04735245406627655, + "step": 83440 + }, + { + "epoch": 11.84528034066714, + "grad_norm": 3.422520875930786, + "learning_rate": 8.815982966643009e-05, + "loss": 0.08324976563453675, + "step": 83450 + }, + { + "epoch": 11.846699787083038, + "grad_norm": 6.075384140014648, + "learning_rate": 8.81584102200142e-05, + "loss": 0.024019157886505126, + "step": 83460 + }, + { + "epoch": 11.848119233498934, + "grad_norm": 
0.9444994330406189, + "learning_rate": 8.81569907735983e-05, + "loss": 0.030841320753097534, + "step": 83470 + }, + { + "epoch": 11.849538679914833, + "grad_norm": 7.623394012451172, + "learning_rate": 8.815557132718241e-05, + "loss": 0.05833870768547058, + "step": 83480 + }, + { + "epoch": 11.850958126330731, + "grad_norm": 0.26176756620407104, + "learning_rate": 8.81541518807665e-05, + "loss": 0.004750019684433937, + "step": 83490 + }, + { + "epoch": 11.85237757274663, + "grad_norm": 0.06734207272529602, + "learning_rate": 8.81527324343506e-05, + "loss": 0.06027681231498718, + "step": 83500 + }, + { + "epoch": 11.85237757274663, + "eval_accuracy": 0.9829592420677815, + "eval_loss": 0.060624875128269196, + "eval_runtime": 33.763, + "eval_samples_per_second": 465.806, + "eval_steps_per_second": 14.572, + "step": 83500 + }, + { + "epoch": 11.853797019162526, + "grad_norm": 2.9946765899658203, + "learning_rate": 8.815131298793471e-05, + "loss": 0.03661317229270935, + "step": 83510 + }, + { + "epoch": 11.855216465578424, + "grad_norm": 7.433979511260986, + "learning_rate": 8.814989354151881e-05, + "loss": 0.04397961497306824, + "step": 83520 + }, + { + "epoch": 11.856635911994323, + "grad_norm": 2.642326831817627, + "learning_rate": 8.814847409510292e-05, + "loss": 0.027840161323547365, + "step": 83530 + }, + { + "epoch": 11.858055358410219, + "grad_norm": 2.612328052520752, + "learning_rate": 8.8147054648687e-05, + "loss": 0.08996272087097168, + "step": 83540 + }, + { + "epoch": 11.859474804826117, + "grad_norm": 10.099543571472168, + "learning_rate": 8.814563520227112e-05, + "loss": 0.04724225699901581, + "step": 83550 + }, + { + "epoch": 11.860894251242016, + "grad_norm": 2.065953493118286, + "learning_rate": 8.814421575585521e-05, + "loss": 0.023844602704048156, + "step": 83560 + }, + { + "epoch": 11.862313697657914, + "grad_norm": 3.0245158672332764, + "learning_rate": 8.814279630943932e-05, + "loss": 0.07478670477867126, + "step": 83570 + }, + { + "epoch": 
11.86373314407381, + "grad_norm": 8.01756477355957, + "learning_rate": 8.814137686302342e-05, + "loss": 0.04426112174987793, + "step": 83580 + }, + { + "epoch": 11.865152590489709, + "grad_norm": 0.4189710319042206, + "learning_rate": 8.813995741660753e-05, + "loss": 0.04640637934207916, + "step": 83590 + }, + { + "epoch": 11.866572036905607, + "grad_norm": 9.745952606201172, + "learning_rate": 8.813853797019163e-05, + "loss": 0.049314913153648374, + "step": 83600 + }, + { + "epoch": 11.867991483321505, + "grad_norm": 1.1365087032318115, + "learning_rate": 8.813711852377573e-05, + "loss": 0.018093612790107728, + "step": 83610 + }, + { + "epoch": 11.869410929737402, + "grad_norm": 0.5661416053771973, + "learning_rate": 8.813569907735984e-05, + "loss": 0.052584463357925416, + "step": 83620 + }, + { + "epoch": 11.8708303761533, + "grad_norm": 7.085361957550049, + "learning_rate": 8.813427963094394e-05, + "loss": 0.03304852247238159, + "step": 83630 + }, + { + "epoch": 11.872249822569199, + "grad_norm": 1.1107380390167236, + "learning_rate": 8.813286018452805e-05, + "loss": 0.011267714947462083, + "step": 83640 + }, + { + "epoch": 11.873669268985095, + "grad_norm": 0.16078133881092072, + "learning_rate": 8.813144073811213e-05, + "loss": 0.012863248586654663, + "step": 83650 + }, + { + "epoch": 11.875088715400993, + "grad_norm": 10.058072090148926, + "learning_rate": 8.813002129169624e-05, + "loss": 0.03249671459197998, + "step": 83660 + }, + { + "epoch": 11.876508161816892, + "grad_norm": 5.011179447174072, + "learning_rate": 8.812860184528034e-05, + "loss": 0.038761311769485475, + "step": 83670 + }, + { + "epoch": 11.87792760823279, + "grad_norm": 2.0408272743225098, + "learning_rate": 8.812718239886445e-05, + "loss": 0.022072888910770416, + "step": 83680 + }, + { + "epoch": 11.879347054648687, + "grad_norm": 8.341216087341309, + "learning_rate": 8.812576295244855e-05, + "loss": 0.0365541934967041, + "step": 83690 + }, + { + "epoch": 11.880766501064585, + "grad_norm": 
4.903764247894287, + "learning_rate": 8.812434350603264e-05, + "loss": 0.02184043824672699, + "step": 83700 + }, + { + "epoch": 11.882185947480483, + "grad_norm": 1.901656985282898, + "learning_rate": 8.812292405961676e-05, + "loss": 0.04549020528793335, + "step": 83710 + }, + { + "epoch": 11.88360539389638, + "grad_norm": 0.10097061842679977, + "learning_rate": 8.812150461320085e-05, + "loss": 0.027865698933601378, + "step": 83720 + }, + { + "epoch": 11.885024840312278, + "grad_norm": 1.929040551185608, + "learning_rate": 8.812008516678496e-05, + "loss": 0.005012607201933861, + "step": 83730 + }, + { + "epoch": 11.886444286728176, + "grad_norm": 3.4623374938964844, + "learning_rate": 8.811866572036906e-05, + "loss": 0.010321633517742157, + "step": 83740 + }, + { + "epoch": 11.887863733144075, + "grad_norm": 1.9667080640792847, + "learning_rate": 8.811724627395316e-05, + "loss": 0.02214210033416748, + "step": 83750 + }, + { + "epoch": 11.889283179559971, + "grad_norm": 0.031162558123469353, + "learning_rate": 8.811582682753726e-05, + "loss": 0.03217504918575287, + "step": 83760 + }, + { + "epoch": 11.89070262597587, + "grad_norm": 0.05589601397514343, + "learning_rate": 8.811440738112137e-05, + "loss": 0.0314392626285553, + "step": 83770 + }, + { + "epoch": 11.892122072391768, + "grad_norm": 8.637645721435547, + "learning_rate": 8.811298793470546e-05, + "loss": 0.05045939683914184, + "step": 83780 + }, + { + "epoch": 11.893541518807664, + "grad_norm": 0.4940531253814697, + "learning_rate": 8.811156848828958e-05, + "loss": 0.026433098316192626, + "step": 83790 + }, + { + "epoch": 11.894960965223563, + "grad_norm": 12.007257461547852, + "learning_rate": 8.811014904187367e-05, + "loss": 0.055042076110839847, + "step": 83800 + }, + { + "epoch": 11.896380411639461, + "grad_norm": 0.023529594764113426, + "learning_rate": 8.810872959545777e-05, + "loss": 0.09657434821128845, + "step": 83810 + }, + { + "epoch": 11.89779985805536, + "grad_norm": 2.9010982513427734, + 
"learning_rate": 8.810731014904188e-05, + "loss": 0.012377361208200455, + "step": 83820 + }, + { + "epoch": 11.899219304471256, + "grad_norm": 3.5865633487701416, + "learning_rate": 8.810589070262598e-05, + "loss": 0.01887798309326172, + "step": 83830 + }, + { + "epoch": 11.900638750887154, + "grad_norm": 0.9775835871696472, + "learning_rate": 8.810447125621009e-05, + "loss": 0.01637519896030426, + "step": 83840 + }, + { + "epoch": 11.902058197303052, + "grad_norm": 0.6207100749015808, + "learning_rate": 8.810305180979417e-05, + "loss": 0.029577887058258055, + "step": 83850 + }, + { + "epoch": 11.903477643718949, + "grad_norm": 1.9363144636154175, + "learning_rate": 8.810163236337828e-05, + "loss": 0.024132755398750306, + "step": 83860 + }, + { + "epoch": 11.904897090134847, + "grad_norm": 0.3028423488140106, + "learning_rate": 8.810021291696238e-05, + "loss": 0.026431560516357422, + "step": 83870 + }, + { + "epoch": 11.906316536550746, + "grad_norm": 6.9190192222595215, + "learning_rate": 8.809879347054649e-05, + "loss": 0.04169048666954041, + "step": 83880 + }, + { + "epoch": 11.907735982966644, + "grad_norm": 1.2599278688430786, + "learning_rate": 8.80973740241306e-05, + "loss": 0.01872767060995102, + "step": 83890 + }, + { + "epoch": 11.90915542938254, + "grad_norm": 0.038407523185014725, + "learning_rate": 8.809595457771469e-05, + "loss": 0.005365389212965965, + "step": 83900 + }, + { + "epoch": 11.910574875798439, + "grad_norm": 0.1457999050617218, + "learning_rate": 8.80945351312988e-05, + "loss": 0.009712480008602142, + "step": 83910 + }, + { + "epoch": 11.911994322214337, + "grad_norm": 0.043060123920440674, + "learning_rate": 8.80931156848829e-05, + "loss": 0.07107031345367432, + "step": 83920 + }, + { + "epoch": 11.913413768630233, + "grad_norm": 0.5718839764595032, + "learning_rate": 8.8091696238467e-05, + "loss": 0.03540462255477905, + "step": 83930 + }, + { + "epoch": 11.914833215046132, + "grad_norm": 0.2524852752685547, + "learning_rate": 
8.80902767920511e-05, + "loss": 0.04274202883243561, + "step": 83940 + }, + { + "epoch": 11.91625266146203, + "grad_norm": 1.0457261800765991, + "learning_rate": 8.808885734563521e-05, + "loss": 0.0634374737739563, + "step": 83950 + }, + { + "epoch": 11.917672107877928, + "grad_norm": 14.034346580505371, + "learning_rate": 8.80874378992193e-05, + "loss": 0.06747335195541382, + "step": 83960 + }, + { + "epoch": 11.919091554293825, + "grad_norm": 1.1360832452774048, + "learning_rate": 8.808601845280341e-05, + "loss": 0.026286211609840394, + "step": 83970 + }, + { + "epoch": 11.920511000709723, + "grad_norm": 7.193599224090576, + "learning_rate": 8.808459900638752e-05, + "loss": 0.05016224980354309, + "step": 83980 + }, + { + "epoch": 11.921930447125622, + "grad_norm": 3.6520514488220215, + "learning_rate": 8.808317955997162e-05, + "loss": 0.017744763195514678, + "step": 83990 + }, + { + "epoch": 11.923349893541518, + "grad_norm": 1.2742975950241089, + "learning_rate": 8.808176011355573e-05, + "loss": 0.01798464059829712, + "step": 84000 + }, + { + "epoch": 11.923349893541518, + "eval_accuracy": 0.9836586761620144, + "eval_loss": 0.05493206903338432, + "eval_runtime": 34.3683, + "eval_samples_per_second": 457.601, + "eval_steps_per_second": 14.316, + "step": 84000 + }, + { + "epoch": 11.924769339957416, + "grad_norm": 4.129441261291504, + "learning_rate": 8.808034066713981e-05, + "loss": 0.014805413782596588, + "step": 84010 + }, + { + "epoch": 11.926188786373315, + "grad_norm": 0.5156400203704834, + "learning_rate": 8.807892122072392e-05, + "loss": 0.014667493104934693, + "step": 84020 + }, + { + "epoch": 11.927608232789213, + "grad_norm": 0.2281806617975235, + "learning_rate": 8.807750177430802e-05, + "loss": 0.037797823548316956, + "step": 84030 + }, + { + "epoch": 11.92902767920511, + "grad_norm": 9.229474067687988, + "learning_rate": 8.807608232789213e-05, + "loss": 0.031675410270690915, + "step": 84040 + }, + { + "epoch": 11.930447125621008, + "grad_norm": 
0.6090600490570068, + "learning_rate": 8.807466288147623e-05, + "loss": 0.028332659602165224, + "step": 84050 + }, + { + "epoch": 11.931866572036906, + "grad_norm": 2.7820520401000977, + "learning_rate": 8.807324343506033e-05, + "loss": 0.042403769493103025, + "step": 84060 + }, + { + "epoch": 11.933286018452803, + "grad_norm": 8.19253158569336, + "learning_rate": 8.807182398864444e-05, + "loss": 0.06769433617591858, + "step": 84070 + }, + { + "epoch": 11.934705464868701, + "grad_norm": 0.1695857048034668, + "learning_rate": 8.807040454222853e-05, + "loss": 0.03822060525417328, + "step": 84080 + }, + { + "epoch": 11.9361249112846, + "grad_norm": 0.04587777331471443, + "learning_rate": 8.806898509581265e-05, + "loss": 0.03620380461215973, + "step": 84090 + }, + { + "epoch": 11.937544357700498, + "grad_norm": 0.04895970597863197, + "learning_rate": 8.806756564939674e-05, + "loss": 0.0069283261895179745, + "step": 84100 + }, + { + "epoch": 11.938963804116394, + "grad_norm": 0.5421682000160217, + "learning_rate": 8.806614620298084e-05, + "loss": 0.014993944764137268, + "step": 84110 + }, + { + "epoch": 11.940383250532292, + "grad_norm": 9.860722541809082, + "learning_rate": 8.806472675656494e-05, + "loss": 0.035263413190841676, + "step": 84120 + }, + { + "epoch": 11.94180269694819, + "grad_norm": 0.005013750400394201, + "learning_rate": 8.806330731014905e-05, + "loss": 0.018354399502277373, + "step": 84130 + }, + { + "epoch": 11.943222143364087, + "grad_norm": 3.5134024620056152, + "learning_rate": 8.806188786373315e-05, + "loss": 0.05673021674156189, + "step": 84140 + }, + { + "epoch": 11.944641589779986, + "grad_norm": 0.3052522838115692, + "learning_rate": 8.806046841731726e-05, + "loss": 0.016181229054927825, + "step": 84150 + }, + { + "epoch": 11.946061036195884, + "grad_norm": 1.608987808227539, + "learning_rate": 8.805904897090135e-05, + "loss": 0.009563577175140382, + "step": 84160 + }, + { + "epoch": 11.947480482611782, + "grad_norm": 1.9012960195541382, + 
"learning_rate": 8.805762952448545e-05, + "loss": 0.037493014335632326, + "step": 84170 + }, + { + "epoch": 11.948899929027679, + "grad_norm": 1.537778377532959, + "learning_rate": 8.805621007806956e-05, + "loss": 0.03181539177894592, + "step": 84180 + }, + { + "epoch": 11.950319375443577, + "grad_norm": 4.423129081726074, + "learning_rate": 8.805479063165366e-05, + "loss": 0.02197156846523285, + "step": 84190 + }, + { + "epoch": 11.951738821859475, + "grad_norm": 1.713564395904541, + "learning_rate": 8.805337118523777e-05, + "loss": 0.021484464406967163, + "step": 84200 + }, + { + "epoch": 11.953158268275372, + "grad_norm": 8.237667083740234, + "learning_rate": 8.805195173882185e-05, + "loss": 0.044994819164276126, + "step": 84210 + }, + { + "epoch": 11.95457771469127, + "grad_norm": 0.12371502816677094, + "learning_rate": 8.805053229240597e-05, + "loss": 0.0389464259147644, + "step": 84220 + }, + { + "epoch": 11.955997161107168, + "grad_norm": 2.0792911052703857, + "learning_rate": 8.804911284599006e-05, + "loss": 0.02157101035118103, + "step": 84230 + }, + { + "epoch": 11.957416607523067, + "grad_norm": 6.914712905883789, + "learning_rate": 8.804769339957417e-05, + "loss": 0.027994582056999208, + "step": 84240 + }, + { + "epoch": 11.958836053938963, + "grad_norm": 10.593100547790527, + "learning_rate": 8.804627395315827e-05, + "loss": 0.017365607619285583, + "step": 84250 + }, + { + "epoch": 11.960255500354862, + "grad_norm": 0.14070631563663483, + "learning_rate": 8.804485450674238e-05, + "loss": 0.011767487227916717, + "step": 84260 + }, + { + "epoch": 11.96167494677076, + "grad_norm": 0.10577062517404556, + "learning_rate": 8.804343506032648e-05, + "loss": 0.06398826837539673, + "step": 84270 + }, + { + "epoch": 11.963094393186656, + "grad_norm": 0.2251124233007431, + "learning_rate": 8.804201561391058e-05, + "loss": 0.02591215670108795, + "step": 84280 + }, + { + "epoch": 11.964513839602555, + "grad_norm": 0.45450490713119507, + "learning_rate": 
8.804059616749469e-05, + "loss": 0.024134044349193574, + "step": 84290 + }, + { + "epoch": 11.965933286018453, + "grad_norm": 10.233354568481445, + "learning_rate": 8.803917672107879e-05, + "loss": 0.035779574513435365, + "step": 84300 + }, + { + "epoch": 11.967352732434351, + "grad_norm": 10.191351890563965, + "learning_rate": 8.80377572746629e-05, + "loss": 0.02158864587545395, + "step": 84310 + }, + { + "epoch": 11.968772178850248, + "grad_norm": 0.2624339163303375, + "learning_rate": 8.803633782824698e-05, + "loss": 0.03314901888370514, + "step": 84320 + }, + { + "epoch": 11.970191625266146, + "grad_norm": 0.15902841091156006, + "learning_rate": 8.803491838183109e-05, + "loss": 0.013961750268936157, + "step": 84330 + }, + { + "epoch": 11.971611071682045, + "grad_norm": 7.513896942138672, + "learning_rate": 8.803349893541519e-05, + "loss": 0.054462003707885745, + "step": 84340 + }, + { + "epoch": 11.973030518097941, + "grad_norm": 0.7898911237716675, + "learning_rate": 8.80320794889993e-05, + "loss": 0.01811348497867584, + "step": 84350 + }, + { + "epoch": 11.97444996451384, + "grad_norm": 1.5465627908706665, + "learning_rate": 8.80306600425834e-05, + "loss": 0.011357621103525162, + "step": 84360 + }, + { + "epoch": 11.975869410929738, + "grad_norm": 6.9257683753967285, + "learning_rate": 8.80292405961675e-05, + "loss": 0.019734883308410646, + "step": 84370 + }, + { + "epoch": 11.977288857345636, + "grad_norm": 5.433879375457764, + "learning_rate": 8.80278211497516e-05, + "loss": 0.030297473073005676, + "step": 84380 + }, + { + "epoch": 11.978708303761533, + "grad_norm": 0.7244393825531006, + "learning_rate": 8.80264017033357e-05, + "loss": 0.038099372386932374, + "step": 84390 + }, + { + "epoch": 11.98012775017743, + "grad_norm": 1.208174467086792, + "learning_rate": 8.802498225691981e-05, + "loss": 0.011677633225917815, + "step": 84400 + }, + { + "epoch": 11.98154719659333, + "grad_norm": 1.5116193294525146, + "learning_rate": 8.802356281050391e-05, + "loss": 
0.033107328414916995, + "step": 84410 + }, + { + "epoch": 11.982966643009226, + "grad_norm": 0.7113156914710999, + "learning_rate": 8.802214336408801e-05, + "loss": 0.04576504826545715, + "step": 84420 + }, + { + "epoch": 11.984386089425124, + "grad_norm": 0.0780448466539383, + "learning_rate": 8.80207239176721e-05, + "loss": 0.01310802698135376, + "step": 84430 + }, + { + "epoch": 11.985805535841022, + "grad_norm": 9.950695991516113, + "learning_rate": 8.801930447125622e-05, + "loss": 0.04769536256790161, + "step": 84440 + }, + { + "epoch": 11.98722498225692, + "grad_norm": 1.4136978387832642, + "learning_rate": 8.801788502484031e-05, + "loss": 0.017603138089179994, + "step": 84450 + }, + { + "epoch": 11.988644428672817, + "grad_norm": 1.4347903728485107, + "learning_rate": 8.801646557842442e-05, + "loss": 0.054668909311294554, + "step": 84460 + }, + { + "epoch": 11.990063875088715, + "grad_norm": 0.16239799559116364, + "learning_rate": 8.801504613200852e-05, + "loss": 0.009764498472213745, + "step": 84470 + }, + { + "epoch": 11.991483321504614, + "grad_norm": 0.02905164659023285, + "learning_rate": 8.801362668559262e-05, + "loss": 0.025337016582489012, + "step": 84480 + }, + { + "epoch": 11.99290276792051, + "grad_norm": 0.4277700185775757, + "learning_rate": 8.801220723917673e-05, + "loss": 0.023854957520961763, + "step": 84490 + }, + { + "epoch": 11.994322214336409, + "grad_norm": 2.598402738571167, + "learning_rate": 8.801078779276083e-05, + "loss": 0.01903749108314514, + "step": 84500 + }, + { + "epoch": 11.994322214336409, + "eval_accuracy": 0.9682075411712342, + "eval_loss": 0.12699328362941742, + "eval_runtime": 32.4308, + "eval_samples_per_second": 484.941, + "eval_steps_per_second": 15.171, + "step": 84500 + }, + { + "epoch": 11.995741660752307, + "grad_norm": 0.46173426508903503, + "learning_rate": 8.800936834634494e-05, + "loss": 0.052141273021698, + "step": 84510 + }, + { + "epoch": 11.997161107168205, + "grad_norm": 0.17388634383678436, + 
"learning_rate": 8.800794889992902e-05, + "loss": 0.050818198919296266, + "step": 84520 + }, + { + "epoch": 11.998580553584102, + "grad_norm": 4.341207504272461, + "learning_rate": 8.800652945351313e-05, + "loss": 0.03561738431453705, + "step": 84530 + }, + { + "epoch": 12.0, + "grad_norm": 0.30166885256767273, + "learning_rate": 8.800511000709723e-05, + "loss": 0.02941356897354126, + "step": 84540 + }, + { + "epoch": 12.001419446415898, + "grad_norm": 3.268622875213623, + "learning_rate": 8.800369056068134e-05, + "loss": 0.017552968859672547, + "step": 84550 + }, + { + "epoch": 12.002838892831795, + "grad_norm": 2.5731606483459473, + "learning_rate": 8.800227111426544e-05, + "loss": 0.009592925757169723, + "step": 84560 + }, + { + "epoch": 12.004258339247693, + "grad_norm": 0.22703532874584198, + "learning_rate": 8.800085166784954e-05, + "loss": 0.016140460968017578, + "step": 84570 + }, + { + "epoch": 12.005677785663591, + "grad_norm": 1.0973620414733887, + "learning_rate": 8.799943222143365e-05, + "loss": 0.02522848844528198, + "step": 84580 + }, + { + "epoch": 12.00709723207949, + "grad_norm": 0.019226137548685074, + "learning_rate": 8.799801277501774e-05, + "loss": 0.01752144694328308, + "step": 84590 + }, + { + "epoch": 12.008516678495386, + "grad_norm": 5.661785125732422, + "learning_rate": 8.799659332860186e-05, + "loss": 0.10360106229782104, + "step": 84600 + }, + { + "epoch": 12.009936124911285, + "grad_norm": 5.24439811706543, + "learning_rate": 8.799517388218595e-05, + "loss": 0.038429513573646545, + "step": 84610 + }, + { + "epoch": 12.011355571327183, + "grad_norm": 15.420430183410645, + "learning_rate": 8.799375443577006e-05, + "loss": 0.04234688878059387, + "step": 84620 + }, + { + "epoch": 12.01277501774308, + "grad_norm": 0.12376894801855087, + "learning_rate": 8.799233498935415e-05, + "loss": 0.05130982995033264, + "step": 84630 + }, + { + "epoch": 12.014194464158978, + "grad_norm": 0.19059942662715912, + "learning_rate": 8.799091554293826e-05, + 
"loss": 0.004893422871828079, + "step": 84640 + }, + { + "epoch": 12.015613910574876, + "grad_norm": 7.90791130065918, + "learning_rate": 8.798949609652236e-05, + "loss": 0.020802582800388335, + "step": 84650 + }, + { + "epoch": 12.017033356990774, + "grad_norm": 0.09958259761333466, + "learning_rate": 8.798807665010647e-05, + "loss": 0.008918841928243637, + "step": 84660 + }, + { + "epoch": 12.01845280340667, + "grad_norm": 5.014199733734131, + "learning_rate": 8.798665720369056e-05, + "loss": 0.04075751900672912, + "step": 84670 + }, + { + "epoch": 12.01987224982257, + "grad_norm": 1.203556776046753, + "learning_rate": 8.798523775727466e-05, + "loss": 0.02592116892337799, + "step": 84680 + }, + { + "epoch": 12.021291696238467, + "grad_norm": 5.959599018096924, + "learning_rate": 8.798381831085877e-05, + "loss": 0.017769895493984222, + "step": 84690 + }, + { + "epoch": 12.022711142654364, + "grad_norm": 7.269739151000977, + "learning_rate": 8.798239886444287e-05, + "loss": 0.04146575927734375, + "step": 84700 + }, + { + "epoch": 12.024130589070262, + "grad_norm": 3.2094407081604004, + "learning_rate": 8.798097941802698e-05, + "loss": 0.013438931107521057, + "step": 84710 + }, + { + "epoch": 12.02555003548616, + "grad_norm": 4.8658623695373535, + "learning_rate": 8.797955997161108e-05, + "loss": 0.02216584384441376, + "step": 84720 + }, + { + "epoch": 12.026969481902059, + "grad_norm": 2.480189561843872, + "learning_rate": 8.797814052519518e-05, + "loss": 0.03759796917438507, + "step": 84730 + }, + { + "epoch": 12.028388928317955, + "grad_norm": 9.961448669433594, + "learning_rate": 8.797672107877927e-05, + "loss": 0.022990162670612335, + "step": 84740 + }, + { + "epoch": 12.029808374733854, + "grad_norm": 0.17480824887752533, + "learning_rate": 8.797530163236338e-05, + "loss": 0.011842547357082367, + "step": 84750 + }, + { + "epoch": 12.031227821149752, + "grad_norm": 0.02928129956126213, + "learning_rate": 8.797388218594748e-05, + "loss": 0.042740797996520995, + 
"step": 84760 + }, + { + "epoch": 12.032647267565649, + "grad_norm": 1.0462205410003662, + "learning_rate": 8.797246273953159e-05, + "loss": 0.009797403216362, + "step": 84770 + }, + { + "epoch": 12.034066713981547, + "grad_norm": 1.6143826246261597, + "learning_rate": 8.797104329311569e-05, + "loss": 0.031755182147026065, + "step": 84780 + }, + { + "epoch": 12.035486160397445, + "grad_norm": 10.038415908813477, + "learning_rate": 8.796962384669979e-05, + "loss": 0.0391110509634018, + "step": 84790 + }, + { + "epoch": 12.036905606813344, + "grad_norm": 0.18434542417526245, + "learning_rate": 8.79682044002839e-05, + "loss": 0.0238394096493721, + "step": 84800 + }, + { + "epoch": 12.03832505322924, + "grad_norm": 0.39857715368270874, + "learning_rate": 8.7966784953868e-05, + "loss": 0.07768634557724, + "step": 84810 + }, + { + "epoch": 12.039744499645138, + "grad_norm": 0.1263289898633957, + "learning_rate": 8.796550745209368e-05, + "loss": 0.029108119010925294, + "step": 84820 + }, + { + "epoch": 12.041163946061037, + "grad_norm": 0.02896190620958805, + "learning_rate": 8.796408800567779e-05, + "loss": 0.04185638725757599, + "step": 84830 + }, + { + "epoch": 12.042583392476933, + "grad_norm": 1.1194353103637695, + "learning_rate": 8.79626685592619e-05, + "loss": 0.013438387215137482, + "step": 84840 + }, + { + "epoch": 12.044002838892832, + "grad_norm": 0.6616882085800171, + "learning_rate": 8.796124911284599e-05, + "loss": 0.02819703221321106, + "step": 84850 + }, + { + "epoch": 12.04542228530873, + "grad_norm": 0.004331925883889198, + "learning_rate": 8.79598296664301e-05, + "loss": 0.03522735834121704, + "step": 84860 + }, + { + "epoch": 12.046841731724628, + "grad_norm": 8.60290813446045, + "learning_rate": 8.79584102200142e-05, + "loss": 0.020204275846481323, + "step": 84870 + }, + { + "epoch": 12.048261178140525, + "grad_norm": 0.041052740067243576, + "learning_rate": 8.79569907735983e-05, + "loss": 0.008595021069049835, + "step": 84880 + }, + { + "epoch": 
12.049680624556423, + "grad_norm": 4.408438205718994, + "learning_rate": 8.79555713271824e-05, + "loss": 0.01435663402080536, + "step": 84890 + }, + { + "epoch": 12.051100070972321, + "grad_norm": 18.28489112854004, + "learning_rate": 8.79541518807665e-05, + "loss": 0.036411243677139285, + "step": 84900 + }, + { + "epoch": 12.052519517388218, + "grad_norm": 0.154909148812294, + "learning_rate": 8.79527324343506e-05, + "loss": 0.03882510662078857, + "step": 84910 + }, + { + "epoch": 12.053938963804116, + "grad_norm": 0.7121272683143616, + "learning_rate": 8.795131298793471e-05, + "loss": 0.0175392284989357, + "step": 84920 + }, + { + "epoch": 12.055358410220014, + "grad_norm": 0.04540358856320381, + "learning_rate": 8.794989354151882e-05, + "loss": 0.004501216858625412, + "step": 84930 + }, + { + "epoch": 12.056777856635913, + "grad_norm": 0.15085090696811676, + "learning_rate": 8.794847409510292e-05, + "loss": 0.017955848574638368, + "step": 84940 + }, + { + "epoch": 12.05819730305181, + "grad_norm": 0.16968463361263275, + "learning_rate": 8.794705464868703e-05, + "loss": 0.0195046991109848, + "step": 84950 + }, + { + "epoch": 12.059616749467708, + "grad_norm": 0.21109625697135925, + "learning_rate": 8.794563520227111e-05, + "loss": 0.035113084316253665, + "step": 84960 + }, + { + "epoch": 12.061036195883606, + "grad_norm": 0.023791976273059845, + "learning_rate": 8.794421575585522e-05, + "loss": 0.017290794849395753, + "step": 84970 + }, + { + "epoch": 12.062455642299502, + "grad_norm": 3.2964980602264404, + "learning_rate": 8.794279630943932e-05, + "loss": 0.044614797830581664, + "step": 84980 + }, + { + "epoch": 12.0638750887154, + "grad_norm": 1.710684061050415, + "learning_rate": 8.794137686302343e-05, + "loss": 0.0351711630821228, + "step": 84990 + }, + { + "epoch": 12.065294535131299, + "grad_norm": 5.109529495239258, + "learning_rate": 8.793995741660753e-05, + "loss": 0.02618541419506073, + "step": 85000 + }, + { + "epoch": 12.065294535131299, + 
"eval_accuracy": 0.9881096203980416, + "eval_loss": 0.044174253940582275, + "eval_runtime": 32.5626, + "eval_samples_per_second": 482.977, + "eval_steps_per_second": 15.109, + "step": 85000 + }, + { + "epoch": 12.066713981547197, + "grad_norm": 2.1556882858276367, + "learning_rate": 8.793853797019163e-05, + "loss": 0.01659655123949051, + "step": 85010 + }, + { + "epoch": 12.068133427963094, + "grad_norm": 1.9246563911437988, + "learning_rate": 8.793711852377574e-05, + "loss": 0.02644846439361572, + "step": 85020 + }, + { + "epoch": 12.069552874378992, + "grad_norm": 0.8818451166152954, + "learning_rate": 8.793569907735983e-05, + "loss": 0.009180599451065063, + "step": 85030 + }, + { + "epoch": 12.07097232079489, + "grad_norm": 0.22749079763889313, + "learning_rate": 8.793427963094394e-05, + "loss": 0.07038314342498779, + "step": 85040 + }, + { + "epoch": 12.072391767210787, + "grad_norm": 0.3684888482093811, + "learning_rate": 8.793286018452804e-05, + "loss": 0.019059973955154418, + "step": 85050 + }, + { + "epoch": 12.073811213626685, + "grad_norm": 1.719611406326294, + "learning_rate": 8.793144073811214e-05, + "loss": 0.004040063545107842, + "step": 85060 + }, + { + "epoch": 12.075230660042584, + "grad_norm": 2.7327284812927246, + "learning_rate": 8.793002129169624e-05, + "loss": 0.025533831119537352, + "step": 85070 + }, + { + "epoch": 12.076650106458482, + "grad_norm": 6.146058082580566, + "learning_rate": 8.792860184528035e-05, + "loss": 0.041575449705123904, + "step": 85080 + }, + { + "epoch": 12.078069552874378, + "grad_norm": 0.6352998614311218, + "learning_rate": 8.792718239886445e-05, + "loss": 0.016179861128330232, + "step": 85090 + }, + { + "epoch": 12.079488999290277, + "grad_norm": 0.15260739624500275, + "learning_rate": 8.792576295244856e-05, + "loss": 0.008620496094226836, + "step": 85100 + }, + { + "epoch": 12.080908445706175, + "grad_norm": 12.496967315673828, + "learning_rate": 8.792434350603265e-05, + "loss": 0.02953541874885559, + "step": 85110 
+ }, + { + "epoch": 12.082327892122072, + "grad_norm": 0.03197462856769562, + "learning_rate": 8.792292405961675e-05, + "loss": 0.03554803729057312, + "step": 85120 + }, + { + "epoch": 12.08374733853797, + "grad_norm": 0.2803524136543274, + "learning_rate": 8.792150461320086e-05, + "loss": 0.02972387373447418, + "step": 85130 + }, + { + "epoch": 12.085166784953868, + "grad_norm": 0.4536881446838379, + "learning_rate": 8.792008516678496e-05, + "loss": 0.018435779213905334, + "step": 85140 + }, + { + "epoch": 12.086586231369767, + "grad_norm": 3.5018510818481445, + "learning_rate": 8.791866572036907e-05, + "loss": 0.011114455759525299, + "step": 85150 + }, + { + "epoch": 12.088005677785663, + "grad_norm": 1.3714983463287354, + "learning_rate": 8.791724627395315e-05, + "loss": 0.024516399204730987, + "step": 85160 + }, + { + "epoch": 12.089425124201561, + "grad_norm": 5.15742826461792, + "learning_rate": 8.791582682753726e-05, + "loss": 0.006654751300811767, + "step": 85170 + }, + { + "epoch": 12.09084457061746, + "grad_norm": 0.24019791185855865, + "learning_rate": 8.791440738112136e-05, + "loss": 0.025834381580352783, + "step": 85180 + }, + { + "epoch": 12.092264017033356, + "grad_norm": 0.5296810269355774, + "learning_rate": 8.791298793470547e-05, + "loss": 0.04131495654582977, + "step": 85190 + }, + { + "epoch": 12.093683463449254, + "grad_norm": 0.09318207204341888, + "learning_rate": 8.791156848828957e-05, + "loss": 0.010322081297636032, + "step": 85200 + }, + { + "epoch": 12.095102909865153, + "grad_norm": 0.1374143660068512, + "learning_rate": 8.791014904187367e-05, + "loss": 0.08588246107101441, + "step": 85210 + }, + { + "epoch": 12.096522356281051, + "grad_norm": 0.15659472346305847, + "learning_rate": 8.790872959545778e-05, + "loss": 0.014092278480529786, + "step": 85220 + }, + { + "epoch": 12.097941802696948, + "grad_norm": 3.863607883453369, + "learning_rate": 8.790731014904188e-05, + "loss": 0.004379024729132652, + "step": 85230 + }, + { + "epoch": 
12.099361249112846, + "grad_norm": 0.365823358297348, + "learning_rate": 8.790589070262599e-05, + "loss": 0.024506431818008424, + "step": 85240 + }, + { + "epoch": 12.100780695528744, + "grad_norm": 0.06521604210138321, + "learning_rate": 8.790447125621008e-05, + "loss": 0.048830336332321166, + "step": 85250 + }, + { + "epoch": 12.10220014194464, + "grad_norm": 4.4670233726501465, + "learning_rate": 8.790305180979418e-05, + "loss": 0.05310940742492676, + "step": 85260 + }, + { + "epoch": 12.103619588360539, + "grad_norm": 0.0818549171090126, + "learning_rate": 8.790163236337828e-05, + "loss": 0.0211592435836792, + "step": 85270 + }, + { + "epoch": 12.105039034776437, + "grad_norm": 14.488906860351562, + "learning_rate": 8.790021291696239e-05, + "loss": 0.06654778718948365, + "step": 85280 + }, + { + "epoch": 12.106458481192336, + "grad_norm": 0.9826055765151978, + "learning_rate": 8.789879347054649e-05, + "loss": 0.022806084156036376, + "step": 85290 + }, + { + "epoch": 12.107877927608232, + "grad_norm": 6.558762550354004, + "learning_rate": 8.78973740241306e-05, + "loss": 0.07291821837425232, + "step": 85300 + }, + { + "epoch": 12.10929737402413, + "grad_norm": 0.09733277559280396, + "learning_rate": 8.78959545777147e-05, + "loss": 0.02255593240261078, + "step": 85310 + }, + { + "epoch": 12.110716820440029, + "grad_norm": 0.38312506675720215, + "learning_rate": 8.789453513129879e-05, + "loss": 0.028494805097579956, + "step": 85320 + }, + { + "epoch": 12.112136266855925, + "grad_norm": 0.46621382236480713, + "learning_rate": 8.78931156848829e-05, + "loss": 0.016409771144390108, + "step": 85330 + }, + { + "epoch": 12.113555713271824, + "grad_norm": 7.448116779327393, + "learning_rate": 8.7891696238467e-05, + "loss": 0.02532399296760559, + "step": 85340 + }, + { + "epoch": 12.114975159687722, + "grad_norm": 0.5830575227737427, + "learning_rate": 8.789027679205111e-05, + "loss": 0.016569083929061888, + "step": 85350 + }, + { + "epoch": 12.11639460610362, + 
"grad_norm": 1.3412967920303345, + "learning_rate": 8.788885734563521e-05, + "loss": 0.04238985180854797, + "step": 85360 + }, + { + "epoch": 12.117814052519517, + "grad_norm": 0.38850438594818115, + "learning_rate": 8.788743789921931e-05, + "loss": 0.01213361620903015, + "step": 85370 + }, + { + "epoch": 12.119233498935415, + "grad_norm": 13.0916166305542, + "learning_rate": 8.78860184528034e-05, + "loss": 0.026457768678665162, + "step": 85380 + }, + { + "epoch": 12.120652945351313, + "grad_norm": 10.388897895812988, + "learning_rate": 8.788459900638752e-05, + "loss": 0.06054552793502808, + "step": 85390 + }, + { + "epoch": 12.12207239176721, + "grad_norm": 0.6791418790817261, + "learning_rate": 8.788317955997161e-05, + "loss": 0.010788274556398391, + "step": 85400 + }, + { + "epoch": 12.123491838183108, + "grad_norm": 6.098667621612549, + "learning_rate": 8.788176011355572e-05, + "loss": 0.016460536420345305, + "step": 85410 + }, + { + "epoch": 12.124911284599007, + "grad_norm": 0.7227396965026855, + "learning_rate": 8.788034066713982e-05, + "loss": 0.005558209121227264, + "step": 85420 + }, + { + "epoch": 12.126330731014905, + "grad_norm": 0.8304058909416199, + "learning_rate": 8.787892122072392e-05, + "loss": 0.040136903524398804, + "step": 85430 + }, + { + "epoch": 12.127750177430801, + "grad_norm": 10.306302070617676, + "learning_rate": 8.787750177430803e-05, + "loss": 0.04540317952632904, + "step": 85440 + }, + { + "epoch": 12.1291696238467, + "grad_norm": 0.0453554131090641, + "learning_rate": 8.787608232789213e-05, + "loss": 0.030978840589523316, + "step": 85450 + }, + { + "epoch": 12.130589070262598, + "grad_norm": 1.693662405014038, + "learning_rate": 8.787466288147624e-05, + "loss": 0.07573475837707519, + "step": 85460 + }, + { + "epoch": 12.132008516678495, + "grad_norm": 0.27647534012794495, + "learning_rate": 8.787324343506032e-05, + "loss": 0.04110590517520905, + "step": 85470 + }, + { + "epoch": 12.133427963094393, + "grad_norm": 
0.5244948267936707, + "learning_rate": 8.787182398864443e-05, + "loss": 0.04320420622825623, + "step": 85480 + }, + { + "epoch": 12.134847409510291, + "grad_norm": 0.07078727334737778, + "learning_rate": 8.787040454222853e-05, + "loss": 0.026226553320884704, + "step": 85490 + }, + { + "epoch": 12.13626685592619, + "grad_norm": 0.5406977534294128, + "learning_rate": 8.786898509581264e-05, + "loss": 0.0808382511138916, + "step": 85500 + }, + { + "epoch": 12.13626685592619, + "eval_accuracy": 0.9654733897119603, + "eval_loss": 0.12036281824111938, + "eval_runtime": 31.5777, + "eval_samples_per_second": 498.042, + "eval_steps_per_second": 15.581, + "step": 85500 + }, + { + "epoch": 12.137686302342086, + "grad_norm": 0.1550203561782837, + "learning_rate": 8.786756564939674e-05, + "loss": 0.06799649596214294, + "step": 85510 + }, + { + "epoch": 12.139105748757984, + "grad_norm": 4.776898384094238, + "learning_rate": 8.786614620298084e-05, + "loss": 0.03676438629627228, + "step": 85520 + }, + { + "epoch": 12.140525195173883, + "grad_norm": 7.222288131713867, + "learning_rate": 8.786472675656495e-05, + "loss": 0.03599470853805542, + "step": 85530 + }, + { + "epoch": 12.14194464158978, + "grad_norm": 1.93971586227417, + "learning_rate": 8.786330731014904e-05, + "loss": 0.01595723330974579, + "step": 85540 + }, + { + "epoch": 12.143364088005677, + "grad_norm": 14.762378692626953, + "learning_rate": 8.786188786373315e-05, + "loss": 0.03826345801353455, + "step": 85550 + }, + { + "epoch": 12.144783534421576, + "grad_norm": 0.04824106767773628, + "learning_rate": 8.786046841731725e-05, + "loss": 0.03802756071090698, + "step": 85560 + }, + { + "epoch": 12.146202980837474, + "grad_norm": 7.385923385620117, + "learning_rate": 8.785904897090135e-05, + "loss": 0.016000357270240784, + "step": 85570 + }, + { + "epoch": 12.14762242725337, + "grad_norm": 0.8453985452651978, + "learning_rate": 8.785762952448545e-05, + "loss": 0.02127433121204376, + "step": 85580 + }, + { + "epoch": 
12.149041873669269, + "grad_norm": 0.05099086835980415, + "learning_rate": 8.785621007806956e-05, + "loss": 0.06052837371826172, + "step": 85590 + }, + { + "epoch": 12.150461320085167, + "grad_norm": 0.31436964869499207, + "learning_rate": 8.785479063165366e-05, + "loss": 0.01670397222042084, + "step": 85600 + }, + { + "epoch": 12.151880766501064, + "grad_norm": 3.1501166820526123, + "learning_rate": 8.785337118523777e-05, + "loss": 0.013452109694480897, + "step": 85610 + }, + { + "epoch": 12.153300212916962, + "grad_norm": 1.261924147605896, + "learning_rate": 8.785195173882186e-05, + "loss": 0.019531291723251343, + "step": 85620 + }, + { + "epoch": 12.15471965933286, + "grad_norm": 0.0056761568412184715, + "learning_rate": 8.785053229240596e-05, + "loss": 0.0211712047457695, + "step": 85630 + }, + { + "epoch": 12.156139105748759, + "grad_norm": 0.9235444664955139, + "learning_rate": 8.784911284599007e-05, + "loss": 0.03171195089817047, + "step": 85640 + }, + { + "epoch": 12.157558552164655, + "grad_norm": 0.06382320076227188, + "learning_rate": 8.784769339957417e-05, + "loss": 0.03569975793361664, + "step": 85650 + }, + { + "epoch": 12.158977998580554, + "grad_norm": 2.014090061187744, + "learning_rate": 8.784627395315828e-05, + "loss": 0.045996904373168945, + "step": 85660 + }, + { + "epoch": 12.160397444996452, + "grad_norm": 0.9534714818000793, + "learning_rate": 8.784485450674238e-05, + "loss": 0.10697624683380128, + "step": 85670 + }, + { + "epoch": 12.161816891412348, + "grad_norm": 0.5350373387336731, + "learning_rate": 8.784343506032647e-05, + "loss": 0.010471509397029876, + "step": 85680 + }, + { + "epoch": 12.163236337828247, + "grad_norm": 1.1189122200012207, + "learning_rate": 8.784201561391057e-05, + "loss": 0.020827175676822664, + "step": 85690 + }, + { + "epoch": 12.164655784244145, + "grad_norm": 6.082579612731934, + "learning_rate": 8.784059616749468e-05, + "loss": 0.01840526908636093, + "step": 85700 + }, + { + "epoch": 12.166075230660043, + 
"grad_norm": 0.36387625336647034, + "learning_rate": 8.783917672107878e-05, + "loss": 0.04171132147312164, + "step": 85710 + }, + { + "epoch": 12.16749467707594, + "grad_norm": 0.13835683465003967, + "learning_rate": 8.783775727466289e-05, + "loss": 0.0185529425740242, + "step": 85720 + }, + { + "epoch": 12.168914123491838, + "grad_norm": 0.04023800790309906, + "learning_rate": 8.783633782824699e-05, + "loss": 0.02126876711845398, + "step": 85730 + }, + { + "epoch": 12.170333569907736, + "grad_norm": 7.292525291442871, + "learning_rate": 8.783491838183109e-05, + "loss": 0.041138219833374026, + "step": 85740 + }, + { + "epoch": 12.171753016323633, + "grad_norm": 0.21447692811489105, + "learning_rate": 8.78334989354152e-05, + "loss": 0.046198248863220215, + "step": 85750 + }, + { + "epoch": 12.173172462739531, + "grad_norm": 3.7342817783355713, + "learning_rate": 8.78320794889993e-05, + "loss": 0.01802656352519989, + "step": 85760 + }, + { + "epoch": 12.17459190915543, + "grad_norm": 2.4353530406951904, + "learning_rate": 8.78306600425834e-05, + "loss": 0.01277841180562973, + "step": 85770 + }, + { + "epoch": 12.176011355571328, + "grad_norm": 1.732850193977356, + "learning_rate": 8.782924059616749e-05, + "loss": 0.008014071732759476, + "step": 85780 + }, + { + "epoch": 12.177430801987224, + "grad_norm": 0.05266590043902397, + "learning_rate": 8.78278211497516e-05, + "loss": 0.04063029289245605, + "step": 85790 + }, + { + "epoch": 12.178850248403123, + "grad_norm": 0.3241954445838928, + "learning_rate": 8.78264017033357e-05, + "loss": 0.012653402984142303, + "step": 85800 + }, + { + "epoch": 12.180269694819021, + "grad_norm": 0.054089661687612534, + "learning_rate": 8.782498225691981e-05, + "loss": 0.005601692199707031, + "step": 85810 + }, + { + "epoch": 12.181689141234918, + "grad_norm": 0.6758180856704712, + "learning_rate": 8.78235628105039e-05, + "loss": 0.008006727695465088, + "step": 85820 + }, + { + "epoch": 12.183108587650816, + "grad_norm": 
0.05027813836932182, + "learning_rate": 8.7822143364088e-05, + "loss": 0.014886750280857087, + "step": 85830 + }, + { + "epoch": 12.184528034066714, + "grad_norm": 1.267127275466919, + "learning_rate": 8.782072391767211e-05, + "loss": 0.012593789398670197, + "step": 85840 + }, + { + "epoch": 12.185947480482612, + "grad_norm": 3.4801025390625, + "learning_rate": 8.781930447125621e-05, + "loss": 0.028322494029998778, + "step": 85850 + }, + { + "epoch": 12.187366926898509, + "grad_norm": 0.7395821809768677, + "learning_rate": 8.781788502484032e-05, + "loss": 0.021465349197387695, + "step": 85860 + }, + { + "epoch": 12.188786373314407, + "grad_norm": 3.554295539855957, + "learning_rate": 8.781646557842442e-05, + "loss": 0.0428952544927597, + "step": 85870 + }, + { + "epoch": 12.190205819730306, + "grad_norm": 3.1385884284973145, + "learning_rate": 8.781504613200852e-05, + "loss": 0.031047120690345764, + "step": 85880 + }, + { + "epoch": 12.191625266146202, + "grad_norm": 3.8755886554718018, + "learning_rate": 8.781362668559261e-05, + "loss": 0.012460941076278686, + "step": 85890 + }, + { + "epoch": 12.1930447125621, + "grad_norm": 7.198748588562012, + "learning_rate": 8.781220723917673e-05, + "loss": 0.04200557470321655, + "step": 85900 + }, + { + "epoch": 12.194464158977999, + "grad_norm": 10.587272644042969, + "learning_rate": 8.781078779276082e-05, + "loss": 0.03208665251731872, + "step": 85910 + }, + { + "epoch": 12.195883605393897, + "grad_norm": 0.7728947401046753, + "learning_rate": 8.780936834634493e-05, + "loss": 0.013271036744117736, + "step": 85920 + }, + { + "epoch": 12.197303051809794, + "grad_norm": 2.66263747215271, + "learning_rate": 8.780794889992903e-05, + "loss": 0.031674724817276, + "step": 85930 + }, + { + "epoch": 12.198722498225692, + "grad_norm": 1.868409514427185, + "learning_rate": 8.780652945351313e-05, + "loss": 0.008502017706632614, + "step": 85940 + }, + { + "epoch": 12.20014194464159, + "grad_norm": 0.21075168251991272, + "learning_rate": 
8.780511000709724e-05, + "loss": 0.10288900136947632, + "step": 85950 + }, + { + "epoch": 12.201561391057487, + "grad_norm": 7.489137649536133, + "learning_rate": 8.780369056068134e-05, + "loss": 0.02367044985294342, + "step": 85960 + }, + { + "epoch": 12.202980837473385, + "grad_norm": 6.909646511077881, + "learning_rate": 8.780227111426545e-05, + "loss": 0.016005274653434754, + "step": 85970 + }, + { + "epoch": 12.204400283889283, + "grad_norm": 0.8582497239112854, + "learning_rate": 8.780085166784953e-05, + "loss": 0.030254873633384704, + "step": 85980 + }, + { + "epoch": 12.205819730305182, + "grad_norm": 0.6521355509757996, + "learning_rate": 8.779943222143364e-05, + "loss": 0.027053722739219667, + "step": 85990 + }, + { + "epoch": 12.207239176721078, + "grad_norm": 1.3930158615112305, + "learning_rate": 8.779801277501774e-05, + "loss": 0.08021281957626343, + "step": 86000 + }, + { + "epoch": 12.207239176721078, + "eval_accuracy": 0.9818782984676034, + "eval_loss": 0.06434565782546997, + "eval_runtime": 31.8889, + "eval_samples_per_second": 493.181, + "eval_steps_per_second": 15.429, + "step": 86000 + }, + { + "epoch": 12.208658623136976, + "grad_norm": 1.3139967918395996, + "learning_rate": 8.779659332860185e-05, + "loss": 0.017941921949386597, + "step": 86010 + }, + { + "epoch": 12.210078069552875, + "grad_norm": 1.020004391670227, + "learning_rate": 8.779517388218595e-05, + "loss": 0.03532530665397644, + "step": 86020 + }, + { + "epoch": 12.211497515968771, + "grad_norm": 0.030459938570857048, + "learning_rate": 8.779375443577006e-05, + "loss": 0.020873503386974336, + "step": 86030 + }, + { + "epoch": 12.21291696238467, + "grad_norm": 0.11012019962072372, + "learning_rate": 8.779233498935416e-05, + "loss": 0.03752686977386475, + "step": 86040 + }, + { + "epoch": 12.214336408800568, + "grad_norm": 10.345964431762695, + "learning_rate": 8.779091554293825e-05, + "loss": 0.025034779310226442, + "step": 86050 + }, + { + "epoch": 12.215755855216466, + 
"grad_norm": 4.960020065307617, + "learning_rate": 8.778949609652236e-05, + "loss": 0.06039838194847107, + "step": 86060 + }, + { + "epoch": 12.217175301632363, + "grad_norm": 0.007862528786063194, + "learning_rate": 8.778807665010646e-05, + "loss": 0.02569767236709595, + "step": 86070 + }, + { + "epoch": 12.218594748048261, + "grad_norm": 2.3586621284484863, + "learning_rate": 8.778665720369057e-05, + "loss": 0.007292249798774719, + "step": 86080 + }, + { + "epoch": 12.22001419446416, + "grad_norm": 0.06399425864219666, + "learning_rate": 8.778523775727466e-05, + "loss": 0.012808781862258912, + "step": 86090 + }, + { + "epoch": 12.221433640880056, + "grad_norm": 6.091718673706055, + "learning_rate": 8.778381831085877e-05, + "loss": 0.05899875164031983, + "step": 86100 + }, + { + "epoch": 12.222853087295954, + "grad_norm": 0.7794937491416931, + "learning_rate": 8.778239886444287e-05, + "loss": 0.008084338158369064, + "step": 86110 + }, + { + "epoch": 12.224272533711853, + "grad_norm": 0.4178735613822937, + "learning_rate": 8.778097941802698e-05, + "loss": 0.017458078265190125, + "step": 86120 + }, + { + "epoch": 12.22569198012775, + "grad_norm": 0.5997690558433533, + "learning_rate": 8.777955997161109e-05, + "loss": 0.039548417925834654, + "step": 86130 + }, + { + "epoch": 12.227111426543647, + "grad_norm": 0.3146066963672638, + "learning_rate": 8.777814052519517e-05, + "loss": 0.022693848609924315, + "step": 86140 + }, + { + "epoch": 12.228530872959546, + "grad_norm": 0.6580975651741028, + "learning_rate": 8.777672107877928e-05, + "loss": 0.02807498276233673, + "step": 86150 + }, + { + "epoch": 12.229950319375444, + "grad_norm": 1.1448265314102173, + "learning_rate": 8.777530163236338e-05, + "loss": 0.0626761257648468, + "step": 86160 + }, + { + "epoch": 12.231369765791342, + "grad_norm": 3.368934154510498, + "learning_rate": 8.777388218594749e-05, + "loss": 0.0760569453239441, + "step": 86170 + }, + { + "epoch": 12.232789212207239, + "grad_norm": 
2.0013580322265625, + "learning_rate": 8.777246273953159e-05, + "loss": 0.023546977341175078, + "step": 86180 + }, + { + "epoch": 12.234208658623137, + "grad_norm": 0.20961083471775055, + "learning_rate": 8.777104329311569e-05, + "loss": 0.05598819851875305, + "step": 86190 + }, + { + "epoch": 12.235628105039035, + "grad_norm": 0.22564516961574554, + "learning_rate": 8.776962384669978e-05, + "loss": 0.02459408938884735, + "step": 86200 + }, + { + "epoch": 12.237047551454932, + "grad_norm": 10.181661605834961, + "learning_rate": 8.77682044002839e-05, + "loss": 0.03393624722957611, + "step": 86210 + }, + { + "epoch": 12.23846699787083, + "grad_norm": 8.41063117980957, + "learning_rate": 8.7766784953868e-05, + "loss": 0.04015167355537415, + "step": 86220 + }, + { + "epoch": 12.239886444286729, + "grad_norm": 6.153536796569824, + "learning_rate": 8.77653655074521e-05, + "loss": 0.035242652893066405, + "step": 86230 + }, + { + "epoch": 12.241305890702627, + "grad_norm": 0.06803173571825027, + "learning_rate": 8.77639460610362e-05, + "loss": 0.035042256116867065, + "step": 86240 + }, + { + "epoch": 12.242725337118523, + "grad_norm": 0.18211659789085388, + "learning_rate": 8.77625266146203e-05, + "loss": 0.0313386470079422, + "step": 86250 + }, + { + "epoch": 12.244144783534422, + "grad_norm": 0.041296329349279404, + "learning_rate": 8.776110716820441e-05, + "loss": 0.022816309332847597, + "step": 86260 + }, + { + "epoch": 12.24556422995032, + "grad_norm": 1.6136507987976074, + "learning_rate": 8.77596877217885e-05, + "loss": 0.02124781161546707, + "step": 86270 + }, + { + "epoch": 12.246983676366217, + "grad_norm": 2.71836519241333, + "learning_rate": 8.775826827537262e-05, + "loss": 0.0672789990901947, + "step": 86280 + }, + { + "epoch": 12.248403122782115, + "grad_norm": 0.29414162039756775, + "learning_rate": 8.77568488289567e-05, + "loss": 0.014540690183639526, + "step": 86290 + }, + { + "epoch": 12.249822569198013, + "grad_norm": 1.0567787885665894, + 
"learning_rate": 8.775542938254081e-05, + "loss": 0.02073398381471634, + "step": 86300 + }, + { + "epoch": 12.251242015613911, + "grad_norm": 0.6744303107261658, + "learning_rate": 8.775400993612492e-05, + "loss": 0.010882169753313065, + "step": 86310 + }, + { + "epoch": 12.252661462029808, + "grad_norm": 0.1246824786067009, + "learning_rate": 8.775259048970902e-05, + "loss": 0.006403592228889465, + "step": 86320 + }, + { + "epoch": 12.254080908445706, + "grad_norm": 1.0266505479812622, + "learning_rate": 8.775117104329313e-05, + "loss": 0.008550825715065002, + "step": 86330 + }, + { + "epoch": 12.255500354861605, + "grad_norm": 0.10675075650215149, + "learning_rate": 8.774975159687721e-05, + "loss": 0.0436547189950943, + "step": 86340 + }, + { + "epoch": 12.256919801277501, + "grad_norm": 3.3878304958343506, + "learning_rate": 8.774833215046132e-05, + "loss": 0.04974772930145264, + "step": 86350 + }, + { + "epoch": 12.2583392476934, + "grad_norm": 0.26958853006362915, + "learning_rate": 8.774691270404542e-05, + "loss": 0.02548518478870392, + "step": 86360 + }, + { + "epoch": 12.259758694109298, + "grad_norm": 4.33382511138916, + "learning_rate": 8.774549325762953e-05, + "loss": 0.036443135142326354, + "step": 86370 + }, + { + "epoch": 12.261178140525196, + "grad_norm": 2.351497173309326, + "learning_rate": 8.774407381121363e-05, + "loss": 0.040211799740791324, + "step": 86380 + }, + { + "epoch": 12.262597586941093, + "grad_norm": 4.3762407302856445, + "learning_rate": 8.774265436479774e-05, + "loss": 0.02976323664188385, + "step": 86390 + }, + { + "epoch": 12.264017033356991, + "grad_norm": 0.07373414933681488, + "learning_rate": 8.774123491838184e-05, + "loss": 0.0378105491399765, + "step": 86400 + }, + { + "epoch": 12.26543647977289, + "grad_norm": 0.4650828242301941, + "learning_rate": 8.773981547196594e-05, + "loss": 0.030419424176216125, + "step": 86410 + }, + { + "epoch": 12.266855926188786, + "grad_norm": 1.0882513523101807, + "learning_rate": 
8.773839602555005e-05, + "loss": 0.04058949947357178, + "step": 86420 + }, + { + "epoch": 12.268275372604684, + "grad_norm": 0.41338181495666504, + "learning_rate": 8.773697657913414e-05, + "loss": 0.01279766708612442, + "step": 86430 + }, + { + "epoch": 12.269694819020582, + "grad_norm": 1.0868407487869263, + "learning_rate": 8.773555713271825e-05, + "loss": 0.03736717700958252, + "step": 86440 + }, + { + "epoch": 12.27111426543648, + "grad_norm": 0.4957295060157776, + "learning_rate": 8.773413768630234e-05, + "loss": 0.022770945727825165, + "step": 86450 + }, + { + "epoch": 12.272533711852377, + "grad_norm": 7.433931827545166, + "learning_rate": 8.773271823988645e-05, + "loss": 0.04367608726024628, + "step": 86460 + }, + { + "epoch": 12.273953158268275, + "grad_norm": 3.417067527770996, + "learning_rate": 8.773129879347055e-05, + "loss": 0.036573588848114014, + "step": 86470 + }, + { + "epoch": 12.275372604684174, + "grad_norm": 0.04351586103439331, + "learning_rate": 8.772987934705466e-05, + "loss": 0.03823851346969605, + "step": 86480 + }, + { + "epoch": 12.27679205110007, + "grad_norm": 5.9566874504089355, + "learning_rate": 8.772845990063876e-05, + "loss": 0.035113397240638736, + "step": 86490 + }, + { + "epoch": 12.278211497515969, + "grad_norm": 0.16312649846076965, + "learning_rate": 8.772704045422285e-05, + "loss": 0.02089274823665619, + "step": 86500 + }, + { + "epoch": 12.278211497515969, + "eval_accuracy": 0.9846760348445349, + "eval_loss": 0.053695064038038254, + "eval_runtime": 33.0639, + "eval_samples_per_second": 475.655, + "eval_steps_per_second": 14.88, + "step": 86500 + }, + { + "epoch": 12.279630943931867, + "grad_norm": 4.228842735290527, + "learning_rate": 8.772562100780696e-05, + "loss": 0.025933349132537843, + "step": 86510 + }, + { + "epoch": 12.281050390347765, + "grad_norm": 2.5222129821777344, + "learning_rate": 8.772420156139106e-05, + "loss": 0.04900032877922058, + "step": 86520 + }, + { + "epoch": 12.282469836763662, + "grad_norm": 
0.12866684794425964, + "learning_rate": 8.772278211497517e-05, + "loss": 0.012445084750652313, + "step": 86530 + }, + { + "epoch": 12.28388928317956, + "grad_norm": 0.30662041902542114, + "learning_rate": 8.772136266855927e-05, + "loss": 0.013581423461437226, + "step": 86540 + }, + { + "epoch": 12.285308729595458, + "grad_norm": 2.5577304363250732, + "learning_rate": 8.771994322214337e-05, + "loss": 0.012446528673171997, + "step": 86550 + }, + { + "epoch": 12.286728176011355, + "grad_norm": 9.133108139038086, + "learning_rate": 8.771866572036907e-05, + "loss": 0.034515559673309326, + "step": 86560 + }, + { + "epoch": 12.288147622427253, + "grad_norm": 2.261014461517334, + "learning_rate": 8.771724627395316e-05, + "loss": 0.04418281614780426, + "step": 86570 + }, + { + "epoch": 12.289567068843152, + "grad_norm": 1.050868034362793, + "learning_rate": 8.771582682753726e-05, + "loss": 0.049887990951538085, + "step": 86580 + }, + { + "epoch": 12.29098651525905, + "grad_norm": 0.37167036533355713, + "learning_rate": 8.771440738112137e-05, + "loss": 0.07505257725715637, + "step": 86590 + }, + { + "epoch": 12.292405961674946, + "grad_norm": 6.5589399337768555, + "learning_rate": 8.771298793470547e-05, + "loss": 0.009983015060424805, + "step": 86600 + }, + { + "epoch": 12.293825408090845, + "grad_norm": 2.896353244781494, + "learning_rate": 8.771156848828958e-05, + "loss": 0.08249455690383911, + "step": 86610 + }, + { + "epoch": 12.295244854506743, + "grad_norm": 0.8286024928092957, + "learning_rate": 8.771014904187366e-05, + "loss": 0.026764780282974243, + "step": 86620 + }, + { + "epoch": 12.29666430092264, + "grad_norm": 3.3944568634033203, + "learning_rate": 8.770872959545777e-05, + "loss": 0.005344720929861069, + "step": 86630 + }, + { + "epoch": 12.298083747338538, + "grad_norm": 0.10840263217687607, + "learning_rate": 8.770731014904187e-05, + "loss": 0.024625831842422487, + "step": 86640 + }, + { + "epoch": 12.299503193754436, + "grad_norm": 7.262483596801758, + 
"learning_rate": 8.770589070262598e-05, + "loss": 0.014307631552219391, + "step": 86650 + }, + { + "epoch": 12.300922640170334, + "grad_norm": 6.646247386932373, + "learning_rate": 8.770447125621008e-05, + "loss": 0.05552939772605896, + "step": 86660 + }, + { + "epoch": 12.302342086586231, + "grad_norm": 0.5893328189849854, + "learning_rate": 8.770305180979419e-05, + "loss": 0.03609735369682312, + "step": 86670 + }, + { + "epoch": 12.30376153300213, + "grad_norm": 8.235060691833496, + "learning_rate": 8.770163236337829e-05, + "loss": 0.03485163152217865, + "step": 86680 + }, + { + "epoch": 12.305180979418028, + "grad_norm": 0.4184919595718384, + "learning_rate": 8.770021291696239e-05, + "loss": 0.0376891016960144, + "step": 86690 + }, + { + "epoch": 12.306600425833924, + "grad_norm": 0.1280488520860672, + "learning_rate": 8.76987934705465e-05, + "loss": 0.01505531519651413, + "step": 86700 + }, + { + "epoch": 12.308019872249822, + "grad_norm": 7.553830146789551, + "learning_rate": 8.76973740241306e-05, + "loss": 0.06388399600982667, + "step": 86710 + }, + { + "epoch": 12.30943931866572, + "grad_norm": 13.967238426208496, + "learning_rate": 8.76959545777147e-05, + "loss": 0.06212584972381592, + "step": 86720 + }, + { + "epoch": 12.310858765081619, + "grad_norm": 0.37098854780197144, + "learning_rate": 8.769453513129879e-05, + "loss": 0.02619161903858185, + "step": 86730 + }, + { + "epoch": 12.312278211497516, + "grad_norm": 0.16169452667236328, + "learning_rate": 8.76931156848829e-05, + "loss": 0.05232579112052917, + "step": 86740 + }, + { + "epoch": 12.313697657913414, + "grad_norm": 5.871384620666504, + "learning_rate": 8.7691696238467e-05, + "loss": 0.030590057373046875, + "step": 86750 + }, + { + "epoch": 12.315117104329312, + "grad_norm": 0.049284838140010834, + "learning_rate": 8.769027679205111e-05, + "loss": 0.008392262458801269, + "step": 86760 + }, + { + "epoch": 12.316536550745209, + "grad_norm": 0.2415841817855835, + "learning_rate": 
8.76888573456352e-05, + "loss": 0.0048000641167163845, + "step": 86770 + }, + { + "epoch": 12.317955997161107, + "grad_norm": 0.17869527637958527, + "learning_rate": 8.76874378992193e-05, + "loss": 0.06827518343925476, + "step": 86780 + }, + { + "epoch": 12.319375443577005, + "grad_norm": 6.73385763168335, + "learning_rate": 8.768601845280341e-05, + "loss": 0.05431227087974548, + "step": 86790 + }, + { + "epoch": 12.320794889992904, + "grad_norm": 0.7754188179969788, + "learning_rate": 8.768459900638751e-05, + "loss": 0.016968098282814027, + "step": 86800 + }, + { + "epoch": 12.3222143364088, + "grad_norm": 0.057217102497816086, + "learning_rate": 8.768317955997162e-05, + "loss": 0.011527723073959351, + "step": 86810 + }, + { + "epoch": 12.323633782824698, + "grad_norm": 0.12364845722913742, + "learning_rate": 8.768176011355572e-05, + "loss": 0.021833422780036926, + "step": 86820 + }, + { + "epoch": 12.325053229240597, + "grad_norm": 7.2985334396362305, + "learning_rate": 8.768034066713982e-05, + "loss": 0.02750059962272644, + "step": 86830 + }, + { + "epoch": 12.326472675656493, + "grad_norm": 2.3215739727020264, + "learning_rate": 8.767892122072391e-05, + "loss": 0.01575092077255249, + "step": 86840 + }, + { + "epoch": 12.327892122072392, + "grad_norm": 0.5124582052230835, + "learning_rate": 8.767750177430802e-05, + "loss": 0.011324916779994965, + "step": 86850 + }, + { + "epoch": 12.32931156848829, + "grad_norm": 0.26997870206832886, + "learning_rate": 8.767608232789212e-05, + "loss": 0.02369029074907303, + "step": 86860 + }, + { + "epoch": 12.330731014904188, + "grad_norm": 0.14223651587963104, + "learning_rate": 8.767466288147623e-05, + "loss": 0.017847174406051637, + "step": 86870 + }, + { + "epoch": 12.332150461320085, + "grad_norm": 7.334930419921875, + "learning_rate": 8.767324343506033e-05, + "loss": 0.023191067576408386, + "step": 86880 + }, + { + "epoch": 12.333569907735983, + "grad_norm": 0.027162916958332062, + "learning_rate": 8.767182398864443e-05, 
+ "loss": 0.018505416810512543, + "step": 86890 + }, + { + "epoch": 12.334989354151881, + "grad_norm": 0.060514308512210846, + "learning_rate": 8.767040454222854e-05, + "loss": 0.011638328433036804, + "step": 86900 + }, + { + "epoch": 12.336408800567778, + "grad_norm": 0.48239290714263916, + "learning_rate": 8.766898509581264e-05, + "loss": 0.015294665098190307, + "step": 86910 + }, + { + "epoch": 12.337828246983676, + "grad_norm": 0.8900887370109558, + "learning_rate": 8.766756564939675e-05, + "loss": 0.04495021998882294, + "step": 86920 + }, + { + "epoch": 12.339247693399575, + "grad_norm": 0.049312490969896317, + "learning_rate": 8.766614620298083e-05, + "loss": 0.026754915714263916, + "step": 86930 + }, + { + "epoch": 12.340667139815473, + "grad_norm": 0.03092627413570881, + "learning_rate": 8.766472675656494e-05, + "loss": 0.007481810450553894, + "step": 86940 + }, + { + "epoch": 12.34208658623137, + "grad_norm": 0.10244758427143097, + "learning_rate": 8.766330731014904e-05, + "loss": 0.003902510926127434, + "step": 86950 + }, + { + "epoch": 12.343506032647268, + "grad_norm": 4.041190147399902, + "learning_rate": 8.766188786373315e-05, + "loss": 0.02546464204788208, + "step": 86960 + }, + { + "epoch": 12.344925479063166, + "grad_norm": 0.008100933395326138, + "learning_rate": 8.766046841731726e-05, + "loss": 0.0286540150642395, + "step": 86970 + }, + { + "epoch": 12.346344925479062, + "grad_norm": 0.07827972620725632, + "learning_rate": 8.765904897090134e-05, + "loss": 0.014019955694675446, + "step": 86980 + }, + { + "epoch": 12.34776437189496, + "grad_norm": 0.503950297832489, + "learning_rate": 8.765762952448546e-05, + "loss": 0.02017439007759094, + "step": 86990 + }, + { + "epoch": 12.349183818310859, + "grad_norm": 8.430594444274902, + "learning_rate": 8.765621007806955e-05, + "loss": 0.02455626130104065, + "step": 87000 + }, + { + "epoch": 12.349183818310859, + "eval_accuracy": 0.9834679214090418, + "eval_loss": 0.055014777928590775, + "eval_runtime": 
33.1479, + "eval_samples_per_second": 474.449, + "eval_steps_per_second": 14.843, + "step": 87000 + }, + { + "epoch": 12.350603264726757, + "grad_norm": 3.624072790145874, + "learning_rate": 8.765479063165366e-05, + "loss": 0.02058798223733902, + "step": 87010 + }, + { + "epoch": 12.352022711142654, + "grad_norm": 5.688584327697754, + "learning_rate": 8.765337118523776e-05, + "loss": 0.02451040893793106, + "step": 87020 + }, + { + "epoch": 12.353442157558552, + "grad_norm": 0.16633503139019012, + "learning_rate": 8.765195173882187e-05, + "loss": 0.012659680843353272, + "step": 87030 + }, + { + "epoch": 12.35486160397445, + "grad_norm": 10.611780166625977, + "learning_rate": 8.765053229240596e-05, + "loss": 0.05080366134643555, + "step": 87040 + }, + { + "epoch": 12.356281050390347, + "grad_norm": 3.00984525680542, + "learning_rate": 8.764911284599007e-05, + "loss": 0.03359057009220123, + "step": 87050 + }, + { + "epoch": 12.357700496806245, + "grad_norm": 4.661736011505127, + "learning_rate": 8.764769339957416e-05, + "loss": 0.011646966636180877, + "step": 87060 + }, + { + "epoch": 12.359119943222144, + "grad_norm": 1.608762264251709, + "learning_rate": 8.764627395315828e-05, + "loss": 0.015019053220748901, + "step": 87070 + }, + { + "epoch": 12.360539389638042, + "grad_norm": 0.6708681583404541, + "learning_rate": 8.764485450674239e-05, + "loss": 0.024743953347206117, + "step": 87080 + }, + { + "epoch": 12.361958836053939, + "grad_norm": 0.05358989164233208, + "learning_rate": 8.764343506032647e-05, + "loss": 0.015583789348602295, + "step": 87090 + }, + { + "epoch": 12.363378282469837, + "grad_norm": 1.0911883115768433, + "learning_rate": 8.764201561391058e-05, + "loss": 0.021644075214862824, + "step": 87100 + }, + { + "epoch": 12.364797728885735, + "grad_norm": 0.013731296174228191, + "learning_rate": 8.764059616749468e-05, + "loss": 0.010372109711170197, + "step": 87110 + }, + { + "epoch": 12.366217175301632, + "grad_norm": 4.24946403503418, + "learning_rate": 
8.763917672107879e-05, + "loss": 0.024570387601852418, + "step": 87120 + }, + { + "epoch": 12.36763662171753, + "grad_norm": 0.3531741797924042, + "learning_rate": 8.763775727466289e-05, + "loss": 0.030188676714897156, + "step": 87130 + }, + { + "epoch": 12.369056068133428, + "grad_norm": 0.037166986614465714, + "learning_rate": 8.763633782824698e-05, + "loss": 0.015771086513996124, + "step": 87140 + }, + { + "epoch": 12.370475514549327, + "grad_norm": 0.09290240705013275, + "learning_rate": 8.763491838183108e-05, + "loss": 0.013740764558315277, + "step": 87150 + }, + { + "epoch": 12.371894960965223, + "grad_norm": 6.755671501159668, + "learning_rate": 8.763349893541519e-05, + "loss": 0.045585596561431886, + "step": 87160 + }, + { + "epoch": 12.373314407381121, + "grad_norm": 1.835556149482727, + "learning_rate": 8.76320794889993e-05, + "loss": 0.006282643973827362, + "step": 87170 + }, + { + "epoch": 12.37473385379702, + "grad_norm": 0.2747805714607239, + "learning_rate": 8.76306600425834e-05, + "loss": 0.031342515349388124, + "step": 87180 + }, + { + "epoch": 12.376153300212916, + "grad_norm": 0.30110907554626465, + "learning_rate": 8.76292405961675e-05, + "loss": 0.02236345112323761, + "step": 87190 + }, + { + "epoch": 12.377572746628815, + "grad_norm": 0.16461510956287384, + "learning_rate": 8.76278211497516e-05, + "loss": 0.006141844391822815, + "step": 87200 + }, + { + "epoch": 12.378992193044713, + "grad_norm": 3.9134035110473633, + "learning_rate": 8.76264017033357e-05, + "loss": 0.011970852315425873, + "step": 87210 + }, + { + "epoch": 12.380411639460611, + "grad_norm": 12.888188362121582, + "learning_rate": 8.76249822569198e-05, + "loss": 0.026162534952163696, + "step": 87220 + }, + { + "epoch": 12.381831085876508, + "grad_norm": 1.6967792510986328, + "learning_rate": 8.762356281050391e-05, + "loss": 0.017707546055316926, + "step": 87230 + }, + { + "epoch": 12.383250532292406, + "grad_norm": 0.01907888986170292, + "learning_rate": 8.7622143364088e-05, + 
"loss": 0.016062094271183013, + "step": 87240 + }, + { + "epoch": 12.384669978708304, + "grad_norm": 1.7975367307662964, + "learning_rate": 8.762072391767211e-05, + "loss": 0.057080841064453124, + "step": 87250 + }, + { + "epoch": 12.3860894251242, + "grad_norm": 0.19056759774684906, + "learning_rate": 8.761930447125622e-05, + "loss": 0.009392456710338592, + "step": 87260 + }, + { + "epoch": 12.3875088715401, + "grad_norm": 0.057802751660346985, + "learning_rate": 8.761788502484032e-05, + "loss": 0.02272159457206726, + "step": 87270 + }, + { + "epoch": 12.388928317955997, + "grad_norm": 0.046848129481077194, + "learning_rate": 8.761646557842443e-05, + "loss": 0.01983257681131363, + "step": 87280 + }, + { + "epoch": 12.390347764371896, + "grad_norm": 6.277688503265381, + "learning_rate": 8.761504613200851e-05, + "loss": 0.02165149450302124, + "step": 87290 + }, + { + "epoch": 12.391767210787792, + "grad_norm": 6.719855308532715, + "learning_rate": 8.761362668559262e-05, + "loss": 0.06963080167770386, + "step": 87300 + }, + { + "epoch": 12.39318665720369, + "grad_norm": 7.310576915740967, + "learning_rate": 8.761220723917672e-05, + "loss": 0.012097867578268051, + "step": 87310 + }, + { + "epoch": 12.394606103619589, + "grad_norm": 0.18734972178936005, + "learning_rate": 8.761078779276083e-05, + "loss": 0.0036718335002660753, + "step": 87320 + }, + { + "epoch": 12.396025550035485, + "grad_norm": 5.503396511077881, + "learning_rate": 8.760936834634493e-05, + "loss": 0.017102155089378356, + "step": 87330 + }, + { + "epoch": 12.397444996451384, + "grad_norm": 1.302686095237732, + "learning_rate": 8.760794889992903e-05, + "loss": 0.03908879160881042, + "step": 87340 + }, + { + "epoch": 12.398864442867282, + "grad_norm": 0.13944663107395172, + "learning_rate": 8.760652945351314e-05, + "loss": 0.04314888119697571, + "step": 87350 + }, + { + "epoch": 12.40028388928318, + "grad_norm": 0.19432903826236725, + "learning_rate": 8.760511000709723e-05, + "loss": 
0.014165303111076355, + "step": 87360 + }, + { + "epoch": 12.401703335699077, + "grad_norm": 1.674633264541626, + "learning_rate": 8.760369056068135e-05, + "loss": 0.0025971658527851106, + "step": 87370 + }, + { + "epoch": 12.403122782114975, + "grad_norm": 4.423507213592529, + "learning_rate": 8.760227111426544e-05, + "loss": 0.01057143062353134, + "step": 87380 + }, + { + "epoch": 12.404542228530874, + "grad_norm": 0.0152947548776865, + "learning_rate": 8.760085166784955e-05, + "loss": 0.032986536622047424, + "step": 87390 + }, + { + "epoch": 12.40596167494677, + "grad_norm": 0.5295947194099426, + "learning_rate": 8.759943222143364e-05, + "loss": 0.023621892929077147, + "step": 87400 + }, + { + "epoch": 12.407381121362668, + "grad_norm": 0.017911149188876152, + "learning_rate": 8.759801277501775e-05, + "loss": 0.029540061950683594, + "step": 87410 + }, + { + "epoch": 12.408800567778567, + "grad_norm": 0.042582545429468155, + "learning_rate": 8.759659332860185e-05, + "loss": 0.03940750360488891, + "step": 87420 + }, + { + "epoch": 12.410220014194465, + "grad_norm": 14.954813957214355, + "learning_rate": 8.759517388218596e-05, + "loss": 0.054214847087860105, + "step": 87430 + }, + { + "epoch": 12.411639460610361, + "grad_norm": 0.5235926508903503, + "learning_rate": 8.759375443577005e-05, + "loss": 0.06929436922073365, + "step": 87440 + }, + { + "epoch": 12.41305890702626, + "grad_norm": 9.736461639404297, + "learning_rate": 8.759233498935415e-05, + "loss": 0.18118247985839844, + "step": 87450 + }, + { + "epoch": 12.414478353442158, + "grad_norm": 0.6641530990600586, + "learning_rate": 8.759091554293826e-05, + "loss": 0.030246061086654664, + "step": 87460 + }, + { + "epoch": 12.415897799858055, + "grad_norm": 0.011021087877452374, + "learning_rate": 8.758949609652236e-05, + "loss": 0.005495109036564827, + "step": 87470 + }, + { + "epoch": 12.417317246273953, + "grad_norm": 4.158930778503418, + "learning_rate": 8.758807665010647e-05, + "loss": 0.025970342755317687, 
+ "step": 87480 + }, + { + "epoch": 12.418736692689851, + "grad_norm": 0.2828434109687805, + "learning_rate": 8.758665720369057e-05, + "loss": 0.013630589842796326, + "step": 87490 + }, + { + "epoch": 12.42015613910575, + "grad_norm": 0.26347261667251587, + "learning_rate": 8.758523775727467e-05, + "loss": 0.01589176505804062, + "step": 87500 + }, + { + "epoch": 12.42015613910575, + "eval_accuracy": 0.9822598079735487, + "eval_loss": 0.06081771478056908, + "eval_runtime": 32.4567, + "eval_samples_per_second": 484.553, + "eval_steps_per_second": 15.159, + "step": 87500 + }, + { + "epoch": 12.421575585521646, + "grad_norm": 0.53838711977005, + "learning_rate": 8.758381831085876e-05, + "loss": 0.013520075380802155, + "step": 87510 + }, + { + "epoch": 12.422995031937544, + "grad_norm": 0.6461429595947266, + "learning_rate": 8.758239886444287e-05, + "loss": 0.028435495495796204, + "step": 87520 + }, + { + "epoch": 12.424414478353443, + "grad_norm": 0.12812745571136475, + "learning_rate": 8.758097941802697e-05, + "loss": 0.010488449782133102, + "step": 87530 + }, + { + "epoch": 12.42583392476934, + "grad_norm": 0.09482559561729431, + "learning_rate": 8.757955997161108e-05, + "loss": 0.029420602321624755, + "step": 87540 + }, + { + "epoch": 12.427253371185238, + "grad_norm": 0.06975217163562775, + "learning_rate": 8.757814052519518e-05, + "loss": 0.02707561254501343, + "step": 87550 + }, + { + "epoch": 12.428672817601136, + "grad_norm": 5.481614112854004, + "learning_rate": 8.757672107877928e-05, + "loss": 0.033937618136405945, + "step": 87560 + }, + { + "epoch": 12.430092264017034, + "grad_norm": 12.287485122680664, + "learning_rate": 8.757530163236339e-05, + "loss": 0.03466455936431885, + "step": 87570 + }, + { + "epoch": 12.43151171043293, + "grad_norm": 0.046677395701408386, + "learning_rate": 8.757388218594749e-05, + "loss": 0.03979707658290863, + "step": 87580 + }, + { + "epoch": 12.432931156848829, + "grad_norm": 11.278783798217773, + "learning_rate": 
8.75724627395316e-05, + "loss": 0.044660136103630066, + "step": 87590 + }, + { + "epoch": 12.434350603264727, + "grad_norm": 0.04154336825013161, + "learning_rate": 8.757104329311568e-05, + "loss": 0.011902222782373429, + "step": 87600 + }, + { + "epoch": 12.435770049680624, + "grad_norm": 4.308584213256836, + "learning_rate": 8.756962384669979e-05, + "loss": 0.04704400897026062, + "step": 87610 + }, + { + "epoch": 12.437189496096522, + "grad_norm": 0.3787962794303894, + "learning_rate": 8.756820440028389e-05, + "loss": 0.05933155417442322, + "step": 87620 + }, + { + "epoch": 12.43860894251242, + "grad_norm": 2.361604690551758, + "learning_rate": 8.7566784953868e-05, + "loss": 0.048081979155540466, + "step": 87630 + }, + { + "epoch": 12.440028388928319, + "grad_norm": 0.29323363304138184, + "learning_rate": 8.75653655074521e-05, + "loss": 0.013682647049427033, + "step": 87640 + }, + { + "epoch": 12.441447835344215, + "grad_norm": 0.1297416388988495, + "learning_rate": 8.75639460610362e-05, + "loss": 0.02944377064704895, + "step": 87650 + }, + { + "epoch": 12.442867281760114, + "grad_norm": 2.6698644161224365, + "learning_rate": 8.75625266146203e-05, + "loss": 0.03293330073356628, + "step": 87660 + }, + { + "epoch": 12.444286728176012, + "grad_norm": 6.906935691833496, + "learning_rate": 8.75611071682044e-05, + "loss": 0.019272826611995697, + "step": 87670 + }, + { + "epoch": 12.445706174591908, + "grad_norm": 0.045950256288051605, + "learning_rate": 8.755968772178851e-05, + "loss": 0.03073270320892334, + "step": 87680 + }, + { + "epoch": 12.447125621007807, + "grad_norm": 0.040875449776649475, + "learning_rate": 8.755826827537261e-05, + "loss": 0.11078486442565919, + "step": 87690 + }, + { + "epoch": 12.448545067423705, + "grad_norm": 10.369391441345215, + "learning_rate": 8.755684882895671e-05, + "loss": 0.040163788199424746, + "step": 87700 + }, + { + "epoch": 12.449964513839603, + "grad_norm": 14.18911075592041, + "learning_rate": 8.75554293825408e-05, + "loss": 
0.04805826246738434, + "step": 87710 + }, + { + "epoch": 12.4513839602555, + "grad_norm": 0.6210464835166931, + "learning_rate": 8.755400993612492e-05, + "loss": 0.03395252823829651, + "step": 87720 + }, + { + "epoch": 12.452803406671398, + "grad_norm": 0.23483672738075256, + "learning_rate": 8.755259048970901e-05, + "loss": 0.02177533507347107, + "step": 87730 + }, + { + "epoch": 12.454222853087296, + "grad_norm": 0.06254476308822632, + "learning_rate": 8.755117104329312e-05, + "loss": 0.03554804921150208, + "step": 87740 + }, + { + "epoch": 12.455642299503193, + "grad_norm": 0.46991172432899475, + "learning_rate": 8.754975159687722e-05, + "loss": 0.004773439094424248, + "step": 87750 + }, + { + "epoch": 12.457061745919091, + "grad_norm": 0.015528268180787563, + "learning_rate": 8.754833215046132e-05, + "loss": 0.015317653119564057, + "step": 87760 + }, + { + "epoch": 12.45848119233499, + "grad_norm": 0.04909134656190872, + "learning_rate": 8.754691270404543e-05, + "loss": 0.016335402429103852, + "step": 87770 + }, + { + "epoch": 12.459900638750888, + "grad_norm": 2.1774823665618896, + "learning_rate": 8.754549325762953e-05, + "loss": 0.02170611321926117, + "step": 87780 + }, + { + "epoch": 12.461320085166784, + "grad_norm": 0.414140522480011, + "learning_rate": 8.754407381121364e-05, + "loss": 0.04764865934848785, + "step": 87790 + }, + { + "epoch": 12.462739531582683, + "grad_norm": 1.7557913064956665, + "learning_rate": 8.754265436479774e-05, + "loss": 0.05648176074028015, + "step": 87800 + }, + { + "epoch": 12.464158977998581, + "grad_norm": 0.21937665343284607, + "learning_rate": 8.754123491838183e-05, + "loss": 0.0062197927385568615, + "step": 87810 + }, + { + "epoch": 12.465578424414478, + "grad_norm": 6.514895915985107, + "learning_rate": 8.753981547196593e-05, + "loss": 0.037446460127830504, + "step": 87820 + }, + { + "epoch": 12.466997870830376, + "grad_norm": 0.046004533767700195, + "learning_rate": 8.753839602555004e-05, + "loss": 0.020039723813533784, 
+ "step": 87830 + }, + { + "epoch": 12.468417317246274, + "grad_norm": 0.03143101558089256, + "learning_rate": 8.753697657913414e-05, + "loss": 0.004655058309435844, + "step": 87840 + }, + { + "epoch": 12.469836763662173, + "grad_norm": 1.383615255355835, + "learning_rate": 8.753555713271825e-05, + "loss": 0.0048833321779966354, + "step": 87850 + }, + { + "epoch": 12.471256210078069, + "grad_norm": 0.08284850418567657, + "learning_rate": 8.753413768630235e-05, + "loss": 0.03349995911121369, + "step": 87860 + }, + { + "epoch": 12.472675656493967, + "grad_norm": 10.989090919494629, + "learning_rate": 8.753271823988644e-05, + "loss": 0.02032398581504822, + "step": 87870 + }, + { + "epoch": 12.474095102909866, + "grad_norm": 0.06767125427722931, + "learning_rate": 8.753129879347056e-05, + "loss": 0.04788086712360382, + "step": 87880 + }, + { + "epoch": 12.475514549325762, + "grad_norm": 0.33816125988960266, + "learning_rate": 8.752987934705465e-05, + "loss": 0.07616091370582581, + "step": 87890 + }, + { + "epoch": 12.47693399574166, + "grad_norm": 14.20706558227539, + "learning_rate": 8.752845990063876e-05, + "loss": 0.06578752994537354, + "step": 87900 + }, + { + "epoch": 12.478353442157559, + "grad_norm": 2.63919997215271, + "learning_rate": 8.752704045422285e-05, + "loss": 0.02024722993373871, + "step": 87910 + }, + { + "epoch": 12.479772888573457, + "grad_norm": 9.86892318725586, + "learning_rate": 8.752562100780696e-05, + "loss": 0.044533932209014894, + "step": 87920 + }, + { + "epoch": 12.481192334989354, + "grad_norm": 0.9026236534118652, + "learning_rate": 8.752420156139106e-05, + "loss": 0.0754780113697052, + "step": 87930 + }, + { + "epoch": 12.482611781405252, + "grad_norm": 18.482194900512695, + "learning_rate": 8.752278211497517e-05, + "loss": 0.06532397866249084, + "step": 87940 + }, + { + "epoch": 12.48403122782115, + "grad_norm": 0.9888849854469299, + "learning_rate": 8.752136266855926e-05, + "loss": 0.05627738833427429, + "step": 87950 + }, + { + 
"epoch": 12.485450674237047, + "grad_norm": 4.003131866455078, + "learning_rate": 8.751994322214336e-05, + "loss": 0.05450453758239746, + "step": 87960 + }, + { + "epoch": 12.486870120652945, + "grad_norm": 8.732240676879883, + "learning_rate": 8.751852377572747e-05, + "loss": 0.039267003536224365, + "step": 87970 + }, + { + "epoch": 12.488289567068843, + "grad_norm": 1.3493448495864868, + "learning_rate": 8.751710432931157e-05, + "loss": 0.0159692645072937, + "step": 87980 + }, + { + "epoch": 12.489709013484742, + "grad_norm": 0.08096921443939209, + "learning_rate": 8.751568488289568e-05, + "loss": 0.02429681420326233, + "step": 87990 + }, + { + "epoch": 12.491128459900638, + "grad_norm": 5.899989604949951, + "learning_rate": 8.751426543647978e-05, + "loss": 0.04755156338214874, + "step": 88000 + }, + { + "epoch": 12.491128459900638, + "eval_accuracy": 0.9847396197621924, + "eval_loss": 0.05461974814534187, + "eval_runtime": 33.2764, + "eval_samples_per_second": 472.617, + "eval_steps_per_second": 14.785, + "step": 88000 + }, + { + "epoch": 12.492547906316537, + "grad_norm": 0.3637841045856476, + "learning_rate": 8.751284599006388e-05, + "loss": 0.010181383788585662, + "step": 88010 + }, + { + "epoch": 12.493967352732435, + "grad_norm": 0.056884367018938065, + "learning_rate": 8.751142654364797e-05, + "loss": 0.02700384259223938, + "step": 88020 + }, + { + "epoch": 12.495386799148331, + "grad_norm": 0.06662727892398834, + "learning_rate": 8.751000709723208e-05, + "loss": 0.024283269047737123, + "step": 88030 + }, + { + "epoch": 12.49680624556423, + "grad_norm": 2.124976873397827, + "learning_rate": 8.750858765081618e-05, + "loss": 0.03328163623809814, + "step": 88040 + }, + { + "epoch": 12.498225691980128, + "grad_norm": 0.18708670139312744, + "learning_rate": 8.750716820440029e-05, + "loss": 0.035369104146957396, + "step": 88050 + }, + { + "epoch": 12.499645138396026, + "grad_norm": 13.181412696838379, + "learning_rate": 8.750574875798439e-05, + "loss": 
0.02761123776435852, + "step": 88060 + }, + { + "epoch": 12.501064584811923, + "grad_norm": 0.07607295364141464, + "learning_rate": 8.750432931156849e-05, + "loss": 0.008057416975498199, + "step": 88070 + }, + { + "epoch": 12.502484031227821, + "grad_norm": 0.4912075996398926, + "learning_rate": 8.75029098651526e-05, + "loss": 0.026665246486663817, + "step": 88080 + }, + { + "epoch": 12.50390347764372, + "grad_norm": 6.729429721832275, + "learning_rate": 8.75014904187367e-05, + "loss": 0.02897602915763855, + "step": 88090 + }, + { + "epoch": 12.505322924059616, + "grad_norm": 0.07232243567705154, + "learning_rate": 8.75000709723208e-05, + "loss": 0.026740044355392456, + "step": 88100 + }, + { + "epoch": 12.506742370475514, + "grad_norm": 3.6393470764160156, + "learning_rate": 8.74986515259049e-05, + "loss": 0.009813410043716431, + "step": 88110 + }, + { + "epoch": 12.508161816891413, + "grad_norm": 1.4777772426605225, + "learning_rate": 8.7497232079489e-05, + "loss": 0.024714049696922303, + "step": 88120 + }, + { + "epoch": 12.509581263307311, + "grad_norm": 1.4388811588287354, + "learning_rate": 8.74958126330731e-05, + "loss": 0.06101258993148804, + "step": 88130 + }, + { + "epoch": 12.511000709723207, + "grad_norm": 7.557995319366455, + "learning_rate": 8.749439318665721e-05, + "loss": 0.10327945947647095, + "step": 88140 + }, + { + "epoch": 12.512420156139106, + "grad_norm": 9.990800857543945, + "learning_rate": 8.749297374024131e-05, + "loss": 0.05634713768959045, + "step": 88150 + }, + { + "epoch": 12.513839602555004, + "grad_norm": 4.879364967346191, + "learning_rate": 8.749155429382542e-05, + "loss": 0.06440256237983703, + "step": 88160 + }, + { + "epoch": 12.5152590489709, + "grad_norm": 16.748706817626953, + "learning_rate": 8.749013484740952e-05, + "loss": 0.05296328663825989, + "step": 88170 + }, + { + "epoch": 12.516678495386799, + "grad_norm": 2.7976927757263184, + "learning_rate": 8.748871540099361e-05, + "loss": 0.024095140397548676, + "step": 88180 
+ }, + { + "epoch": 12.518097941802697, + "grad_norm": 1.1793253421783447, + "learning_rate": 8.748729595457772e-05, + "loss": 0.02114529013633728, + "step": 88190 + }, + { + "epoch": 12.519517388218595, + "grad_norm": 8.5409517288208, + "learning_rate": 8.748587650816182e-05, + "loss": 0.021783101558685302, + "step": 88200 + }, + { + "epoch": 12.520936834634492, + "grad_norm": 6.478908061981201, + "learning_rate": 8.748445706174593e-05, + "loss": 0.019826227426528932, + "step": 88210 + }, + { + "epoch": 12.52235628105039, + "grad_norm": 1.4375824928283691, + "learning_rate": 8.748303761533002e-05, + "loss": 0.05534259080886841, + "step": 88220 + }, + { + "epoch": 12.523775727466289, + "grad_norm": 0.06427717208862305, + "learning_rate": 8.748161816891413e-05, + "loss": 0.051985299587249754, + "step": 88230 + }, + { + "epoch": 12.525195173882185, + "grad_norm": 0.13583670556545258, + "learning_rate": 8.748019872249822e-05, + "loss": 0.022438764572143555, + "step": 88240 + }, + { + "epoch": 12.526614620298083, + "grad_norm": 5.735683917999268, + "learning_rate": 8.747877927608233e-05, + "loss": 0.04711911678314209, + "step": 88250 + }, + { + "epoch": 12.528034066713982, + "grad_norm": 0.3198873996734619, + "learning_rate": 8.747735982966643e-05, + "loss": 0.027721744775772095, + "step": 88260 + }, + { + "epoch": 12.52945351312988, + "grad_norm": 7.128531455993652, + "learning_rate": 8.747594038325053e-05, + "loss": 0.012630848586559296, + "step": 88270 + }, + { + "epoch": 12.530872959545777, + "grad_norm": 0.5762258768081665, + "learning_rate": 8.747452093683464e-05, + "loss": 0.013543166220188141, + "step": 88280 + }, + { + "epoch": 12.532292405961675, + "grad_norm": 1.9634393453598022, + "learning_rate": 8.747310149041874e-05, + "loss": 0.016037152707576753, + "step": 88290 + }, + { + "epoch": 12.533711852377573, + "grad_norm": 6.396450042724609, + "learning_rate": 8.747168204400285e-05, + "loss": 0.043987932801246646, + "step": 88300 + }, + { + "epoch": 
12.53513129879347, + "grad_norm": 0.2535754442214966, + "learning_rate": 8.747026259758695e-05, + "loss": 0.08850111961364746, + "step": 88310 + }, + { + "epoch": 12.536550745209368, + "grad_norm": 0.5153784155845642, + "learning_rate": 8.746884315117104e-05, + "loss": 0.006086389720439911, + "step": 88320 + }, + { + "epoch": 12.537970191625266, + "grad_norm": 0.9000504612922668, + "learning_rate": 8.746742370475514e-05, + "loss": 0.05115777254104614, + "step": 88330 + }, + { + "epoch": 12.539389638041165, + "grad_norm": 0.12878790497779846, + "learning_rate": 8.746600425833925e-05, + "loss": 0.008339033275842667, + "step": 88340 + }, + { + "epoch": 12.540809084457061, + "grad_norm": 0.431140661239624, + "learning_rate": 8.746458481192335e-05, + "loss": 0.023614758253097536, + "step": 88350 + }, + { + "epoch": 12.54222853087296, + "grad_norm": 0.34187155961990356, + "learning_rate": 8.746316536550746e-05, + "loss": 0.03767357170581818, + "step": 88360 + }, + { + "epoch": 12.543647977288858, + "grad_norm": 0.11099901050329208, + "learning_rate": 8.746174591909156e-05, + "loss": 0.03305617868900299, + "step": 88370 + }, + { + "epoch": 12.545067423704754, + "grad_norm": 0.17631615698337555, + "learning_rate": 8.746032647267565e-05, + "loss": 0.016865435242652892, + "step": 88380 + }, + { + "epoch": 12.546486870120653, + "grad_norm": 0.03785773366689682, + "learning_rate": 8.745890702625977e-05, + "loss": 0.0281915158033371, + "step": 88390 + }, + { + "epoch": 12.547906316536551, + "grad_norm": 0.6064432263374329, + "learning_rate": 8.745748757984386e-05, + "loss": 0.023923471570014954, + "step": 88400 + }, + { + "epoch": 12.54932576295245, + "grad_norm": 3.660982370376587, + "learning_rate": 8.745606813342797e-05, + "loss": 0.011482635140419006, + "step": 88410 + }, + { + "epoch": 12.550745209368346, + "grad_norm": 3.0567476749420166, + "learning_rate": 8.745464868701206e-05, + "loss": 0.015174010396003723, + "step": 88420 + }, + { + "epoch": 12.552164655784244, + 
"grad_norm": 0.22067005932331085, + "learning_rate": 8.745322924059617e-05, + "loss": 0.014688675105571748, + "step": 88430 + }, + { + "epoch": 12.553584102200142, + "grad_norm": 6.446946620941162, + "learning_rate": 8.745180979418027e-05, + "loss": 0.020932187139987946, + "step": 88440 + }, + { + "epoch": 12.555003548616039, + "grad_norm": 1.4346462488174438, + "learning_rate": 8.745039034776438e-05, + "loss": 0.064886873960495, + "step": 88450 + }, + { + "epoch": 12.556422995031937, + "grad_norm": 0.07562988251447678, + "learning_rate": 8.744897090134849e-05, + "loss": 0.012400922179222108, + "step": 88460 + }, + { + "epoch": 12.557842441447836, + "grad_norm": 1.1553202867507935, + "learning_rate": 8.744755145493259e-05, + "loss": 0.02751150131225586, + "step": 88470 + }, + { + "epoch": 12.559261887863734, + "grad_norm": 0.09762617945671082, + "learning_rate": 8.744613200851668e-05, + "loss": 0.04214627742767334, + "step": 88480 + }, + { + "epoch": 12.56068133427963, + "grad_norm": 7.1212592124938965, + "learning_rate": 8.744471256210078e-05, + "loss": 0.027154302597045897, + "step": 88490 + }, + { + "epoch": 12.562100780695529, + "grad_norm": 2.778210163116455, + "learning_rate": 8.744329311568489e-05, + "loss": 0.04185508191585541, + "step": 88500 + }, + { + "epoch": 12.562100780695529, + "eval_accuracy": 0.982069053220576, + "eval_loss": 0.06504856050014496, + "eval_runtime": 32.7255, + "eval_samples_per_second": 480.573, + "eval_steps_per_second": 15.034, + "step": 88500 + }, + { + "epoch": 12.563520227111427, + "grad_norm": 1.278740406036377, + "learning_rate": 8.744187366926899e-05, + "loss": 0.019236615300178526, + "step": 88510 + }, + { + "epoch": 12.564939673527324, + "grad_norm": 0.029653554782271385, + "learning_rate": 8.74404542228531e-05, + "loss": 0.02058974504470825, + "step": 88520 + }, + { + "epoch": 12.566359119943222, + "grad_norm": 2.3865671157836914, + "learning_rate": 8.743903477643718e-05, + "loss": 0.02472461760044098, + "step": 88530 + }, 
+ { + "epoch": 12.56777856635912, + "grad_norm": 0.04514153301715851, + "learning_rate": 8.74376153300213e-05, + "loss": 0.016757330298423766, + "step": 88540 + }, + { + "epoch": 12.569198012775018, + "grad_norm": 2.034675359725952, + "learning_rate": 8.74361958836054e-05, + "loss": 0.017388372123241423, + "step": 88550 + }, + { + "epoch": 12.570617459190915, + "grad_norm": 13.655932426452637, + "learning_rate": 8.74347764371895e-05, + "loss": 0.020960594713687896, + "step": 88560 + }, + { + "epoch": 12.572036905606813, + "grad_norm": 0.2972569763660431, + "learning_rate": 8.743335699077361e-05, + "loss": 0.019746491312980653, + "step": 88570 + }, + { + "epoch": 12.573456352022712, + "grad_norm": 0.038133490830659866, + "learning_rate": 8.74319375443577e-05, + "loss": 0.017399133741855623, + "step": 88580 + }, + { + "epoch": 12.574875798438608, + "grad_norm": 8.638161659240723, + "learning_rate": 8.743051809794181e-05, + "loss": 0.041193830966949466, + "step": 88590 + }, + { + "epoch": 12.576295244854506, + "grad_norm": 5.905753135681152, + "learning_rate": 8.74290986515259e-05, + "loss": 0.012840107083320618, + "step": 88600 + }, + { + "epoch": 12.577714691270405, + "grad_norm": 0.0456351675093174, + "learning_rate": 8.742767920511002e-05, + "loss": 0.05107580423355103, + "step": 88610 + }, + { + "epoch": 12.579134137686303, + "grad_norm": 2.247901201248169, + "learning_rate": 8.742625975869411e-05, + "loss": 0.008354905247688293, + "step": 88620 + }, + { + "epoch": 12.5805535841022, + "grad_norm": 4.338565826416016, + "learning_rate": 8.742484031227821e-05, + "loss": 0.011515699326992035, + "step": 88630 + }, + { + "epoch": 12.581973030518098, + "grad_norm": 10.359500885009766, + "learning_rate": 8.742342086586232e-05, + "loss": 0.08170977830886841, + "step": 88640 + }, + { + "epoch": 12.583392476933996, + "grad_norm": 0.3888038396835327, + "learning_rate": 8.742200141944642e-05, + "loss": 0.023364748060703277, + "step": 88650 + }, + { + "epoch": 
12.584811923349893, + "grad_norm": 6.801969528198242, + "learning_rate": 8.742058197303053e-05, + "loss": 0.039721333980560304, + "step": 88660 + }, + { + "epoch": 12.586231369765791, + "grad_norm": 1.2758439779281616, + "learning_rate": 8.741916252661463e-05, + "loss": 0.007038282603025437, + "step": 88670 + }, + { + "epoch": 12.58765081618169, + "grad_norm": 0.23825159668922424, + "learning_rate": 8.741774308019873e-05, + "loss": 0.028574222326278688, + "step": 88680 + }, + { + "epoch": 12.589070262597588, + "grad_norm": 0.2223246991634369, + "learning_rate": 8.741632363378282e-05, + "loss": 0.030116242170333863, + "step": 88690 + }, + { + "epoch": 12.590489709013484, + "grad_norm": 6.101010322570801, + "learning_rate": 8.741490418736693e-05, + "loss": 0.022591683268547057, + "step": 88700 + }, + { + "epoch": 12.591909155429382, + "grad_norm": 2.12394642829895, + "learning_rate": 8.741348474095103e-05, + "loss": 0.034531053900718686, + "step": 88710 + }, + { + "epoch": 12.59332860184528, + "grad_norm": 0.01367959938943386, + "learning_rate": 8.741206529453514e-05, + "loss": 0.025950926542282104, + "step": 88720 + }, + { + "epoch": 12.594748048261177, + "grad_norm": 0.10593011975288391, + "learning_rate": 8.741064584811924e-05, + "loss": 0.035642963647842404, + "step": 88730 + }, + { + "epoch": 12.596167494677076, + "grad_norm": 0.04683855548501015, + "learning_rate": 8.740922640170334e-05, + "loss": 0.01912751644849777, + "step": 88740 + }, + { + "epoch": 12.597586941092974, + "grad_norm": 0.07964295893907547, + "learning_rate": 8.740780695528745e-05, + "loss": 0.036763134598732, + "step": 88750 + }, + { + "epoch": 12.599006387508872, + "grad_norm": 14.841459274291992, + "learning_rate": 8.740638750887154e-05, + "loss": 0.036690741777420044, + "step": 88760 + }, + { + "epoch": 12.600425833924769, + "grad_norm": 0.33520805835723877, + "learning_rate": 8.740496806245566e-05, + "loss": 0.053361940383911136, + "step": 88770 + }, + { + "epoch": 12.601845280340667, + 
"grad_norm": 1.1256662607192993, + "learning_rate": 8.740354861603974e-05, + "loss": 0.01567111909389496, + "step": 88780 + }, + { + "epoch": 12.603264726756565, + "grad_norm": 13.993224143981934, + "learning_rate": 8.740212916962385e-05, + "loss": 0.06229985952377319, + "step": 88790 + }, + { + "epoch": 12.604684173172462, + "grad_norm": 0.03460023179650307, + "learning_rate": 8.740070972320795e-05, + "loss": 0.027052420377731323, + "step": 88800 + }, + { + "epoch": 12.60610361958836, + "grad_norm": 2.6936776638031006, + "learning_rate": 8.739929027679206e-05, + "loss": 0.02551792562007904, + "step": 88810 + }, + { + "epoch": 12.607523066004259, + "grad_norm": 2.129927396774292, + "learning_rate": 8.739787083037616e-05, + "loss": 0.022685115039348603, + "step": 88820 + }, + { + "epoch": 12.608942512420157, + "grad_norm": 0.14080578088760376, + "learning_rate": 8.739645138396027e-05, + "loss": 0.023080846667289732, + "step": 88830 + }, + { + "epoch": 12.610361958836053, + "grad_norm": 11.936066627502441, + "learning_rate": 8.739503193754436e-05, + "loss": 0.030691647529602052, + "step": 88840 + }, + { + "epoch": 12.611781405251952, + "grad_norm": 0.7048534750938416, + "learning_rate": 8.739361249112846e-05, + "loss": 0.012999877333641052, + "step": 88850 + }, + { + "epoch": 12.61320085166785, + "grad_norm": 0.25170546770095825, + "learning_rate": 8.739219304471257e-05, + "loss": 0.02892131209373474, + "step": 88860 + }, + { + "epoch": 12.614620298083747, + "grad_norm": 14.818376541137695, + "learning_rate": 8.739077359829667e-05, + "loss": 0.05256804823875427, + "step": 88870 + }, + { + "epoch": 12.616039744499645, + "grad_norm": 3.001458168029785, + "learning_rate": 8.738935415188078e-05, + "loss": 0.01690353751182556, + "step": 88880 + }, + { + "epoch": 12.617459190915543, + "grad_norm": 2.985347032546997, + "learning_rate": 8.738793470546487e-05, + "loss": 0.01725500375032425, + "step": 88890 + }, + { + "epoch": 12.618878637331441, + "grad_norm": 
0.04500554874539375, + "learning_rate": 8.738651525904898e-05, + "loss": 0.038158965110778806, + "step": 88900 + }, + { + "epoch": 12.620298083747338, + "grad_norm": 2.2031068801879883, + "learning_rate": 8.738509581263307e-05, + "loss": 0.025946080684661865, + "step": 88910 + }, + { + "epoch": 12.621717530163236, + "grad_norm": 0.04379592835903168, + "learning_rate": 8.738367636621718e-05, + "loss": 0.0072929747402668, + "step": 88920 + }, + { + "epoch": 12.623136976579135, + "grad_norm": 9.492351531982422, + "learning_rate": 8.738225691980128e-05, + "loss": 0.025685006380081178, + "step": 88930 + }, + { + "epoch": 12.624556422995031, + "grad_norm": 0.008654547855257988, + "learning_rate": 8.738083747338538e-05, + "loss": 0.027894750237464905, + "step": 88940 + }, + { + "epoch": 12.62597586941093, + "grad_norm": 0.26972833275794983, + "learning_rate": 8.737941802696949e-05, + "loss": 0.020068514347076415, + "step": 88950 + }, + { + "epoch": 12.627395315826828, + "grad_norm": 0.13917766511440277, + "learning_rate": 8.737799858055359e-05, + "loss": 0.02202069163322449, + "step": 88960 + }, + { + "epoch": 12.628814762242726, + "grad_norm": 0.10656342655420303, + "learning_rate": 8.73765791341377e-05, + "loss": 0.020247262716293336, + "step": 88970 + }, + { + "epoch": 12.630234208658623, + "grad_norm": 9.946798324584961, + "learning_rate": 8.73751596877218e-05, + "loss": 0.01564723551273346, + "step": 88980 + }, + { + "epoch": 12.63165365507452, + "grad_norm": 0.1037072166800499, + "learning_rate": 8.737374024130589e-05, + "loss": 0.012409783899784088, + "step": 88990 + }, + { + "epoch": 12.63307310149042, + "grad_norm": 1.5739264488220215, + "learning_rate": 8.737232079488999e-05, + "loss": 0.03171505033969879, + "step": 89000 + }, + { + "epoch": 12.63307310149042, + "eval_accuracy": 0.9773637693139188, + "eval_loss": 0.08339247107505798, + "eval_runtime": 31.6288, + "eval_samples_per_second": 497.236, + "eval_steps_per_second": 15.555, + "step": 89000 + }, + { + 
"epoch": 12.634492547906316, + "grad_norm": 0.10447607189416885, + "learning_rate": 8.73709013484741e-05, + "loss": 0.040927499532699585, + "step": 89010 + }, + { + "epoch": 12.635911994322214, + "grad_norm": 0.1741301566362381, + "learning_rate": 8.73694819020582e-05, + "loss": 0.0033416420221328734, + "step": 89020 + }, + { + "epoch": 12.637331440738112, + "grad_norm": 1.1373984813690186, + "learning_rate": 8.736806245564231e-05, + "loss": 0.059111952781677246, + "step": 89030 + }, + { + "epoch": 12.63875088715401, + "grad_norm": 1.553276777267456, + "learning_rate": 8.736664300922641e-05, + "loss": 0.009425174444913864, + "step": 89040 + }, + { + "epoch": 12.640170333569907, + "grad_norm": 2.646817922592163, + "learning_rate": 8.73652235628105e-05, + "loss": 0.010878220200538635, + "step": 89050 + }, + { + "epoch": 12.641589779985805, + "grad_norm": 3.485379219055176, + "learning_rate": 8.736380411639462e-05, + "loss": 0.03806718289852142, + "step": 89060 + }, + { + "epoch": 12.643009226401704, + "grad_norm": 5.934311389923096, + "learning_rate": 8.736238466997871e-05, + "loss": 0.043131566047668456, + "step": 89070 + }, + { + "epoch": 12.6444286728176, + "grad_norm": 0.9953676462173462, + "learning_rate": 8.736096522356282e-05, + "loss": 0.019209764897823334, + "step": 89080 + }, + { + "epoch": 12.645848119233499, + "grad_norm": 5.589478015899658, + "learning_rate": 8.735954577714691e-05, + "loss": 0.033051124215126036, + "step": 89090 + }, + { + "epoch": 12.647267565649397, + "grad_norm": 5.7257256507873535, + "learning_rate": 8.735812633073102e-05, + "loss": 0.05738807916641235, + "step": 89100 + }, + { + "epoch": 12.648687012065295, + "grad_norm": 1.0921305418014526, + "learning_rate": 8.735670688431512e-05, + "loss": 0.020826832950115205, + "step": 89110 + }, + { + "epoch": 12.650106458481192, + "grad_norm": 0.13475766777992249, + "learning_rate": 8.735528743789923e-05, + "loss": 0.07699592709541321, + "step": 89120 + }, + { + "epoch": 12.65152590489709, + 
"grad_norm": 0.24702370166778564, + "learning_rate": 8.735386799148332e-05, + "loss": 0.05097814798355103, + "step": 89130 + }, + { + "epoch": 12.652945351312988, + "grad_norm": 3.9055874347686768, + "learning_rate": 8.735244854506742e-05, + "loss": 0.03892681002616882, + "step": 89140 + }, + { + "epoch": 12.654364797728885, + "grad_norm": 3.2908849716186523, + "learning_rate": 8.735102909865153e-05, + "loss": 0.10024460554122924, + "step": 89150 + }, + { + "epoch": 12.655784244144783, + "grad_norm": 11.251198768615723, + "learning_rate": 8.734960965223563e-05, + "loss": 0.03453767895698547, + "step": 89160 + }, + { + "epoch": 12.657203690560682, + "grad_norm": 8.44933795928955, + "learning_rate": 8.734819020581974e-05, + "loss": 0.028436681628227232, + "step": 89170 + }, + { + "epoch": 12.65862313697658, + "grad_norm": 0.1921631097793579, + "learning_rate": 8.734677075940384e-05, + "loss": 0.03966635465621948, + "step": 89180 + }, + { + "epoch": 12.660042583392476, + "grad_norm": 8.453948020935059, + "learning_rate": 8.734535131298795e-05, + "loss": 0.038900962471961974, + "step": 89190 + }, + { + "epoch": 12.661462029808375, + "grad_norm": 2.7020156383514404, + "learning_rate": 8.734393186657203e-05, + "loss": 0.03722269833087921, + "step": 89200 + }, + { + "epoch": 12.662881476224273, + "grad_norm": 0.048562612384557724, + "learning_rate": 8.734251242015614e-05, + "loss": 0.052408403158187865, + "step": 89210 + }, + { + "epoch": 12.66430092264017, + "grad_norm": 9.211213111877441, + "learning_rate": 8.734109297374024e-05, + "loss": 0.09202111959457397, + "step": 89220 + }, + { + "epoch": 12.665720369056068, + "grad_norm": 4.951529502868652, + "learning_rate": 8.733967352732435e-05, + "loss": 0.019138604402542114, + "step": 89230 + }, + { + "epoch": 12.667139815471966, + "grad_norm": 2.420982599258423, + "learning_rate": 8.733825408090845e-05, + "loss": 0.06687188744544983, + "step": 89240 + }, + { + "epoch": 12.668559261887864, + "grad_norm": 6.244063377380371, 
+ "learning_rate": 8.733683463449255e-05, + "loss": 0.030867373943328856, + "step": 89250 + }, + { + "epoch": 12.669978708303761, + "grad_norm": 0.06475788354873657, + "learning_rate": 8.733541518807666e-05, + "loss": 0.021957488358020784, + "step": 89260 + }, + { + "epoch": 12.67139815471966, + "grad_norm": 12.286905288696289, + "learning_rate": 8.733399574166076e-05, + "loss": 0.018048429489135744, + "step": 89270 + }, + { + "epoch": 12.672817601135558, + "grad_norm": 0.29012051224708557, + "learning_rate": 8.733257629524487e-05, + "loss": 0.0073526807129383085, + "step": 89280 + }, + { + "epoch": 12.674237047551454, + "grad_norm": 0.33186063170433044, + "learning_rate": 8.733115684882896e-05, + "loss": 0.013821916282176971, + "step": 89290 + }, + { + "epoch": 12.675656493967352, + "grad_norm": 1.7569838762283325, + "learning_rate": 8.732973740241306e-05, + "loss": 0.03041381239891052, + "step": 89300 + }, + { + "epoch": 12.67707594038325, + "grad_norm": 0.06672785431146622, + "learning_rate": 8.732831795599716e-05, + "loss": 0.02769661545753479, + "step": 89310 + }, + { + "epoch": 12.678495386799149, + "grad_norm": 4.808945655822754, + "learning_rate": 8.732689850958127e-05, + "loss": 0.06973714232444764, + "step": 89320 + }, + { + "epoch": 12.679914833215046, + "grad_norm": 7.926112174987793, + "learning_rate": 8.732547906316537e-05, + "loss": 0.030643126368522643, + "step": 89330 + }, + { + "epoch": 12.681334279630944, + "grad_norm": 0.5075589418411255, + "learning_rate": 8.732405961674948e-05, + "loss": 0.0034361004829406737, + "step": 89340 + }, + { + "epoch": 12.682753726046842, + "grad_norm": 0.45156118273735046, + "learning_rate": 8.732264017033357e-05, + "loss": 0.02787870466709137, + "step": 89350 + }, + { + "epoch": 12.684173172462739, + "grad_norm": 2.541034698486328, + "learning_rate": 8.732122072391767e-05, + "loss": 0.010050937533378601, + "step": 89360 + }, + { + "epoch": 12.685592618878637, + "grad_norm": 7.8918280601501465, + "learning_rate": 
8.731980127750178e-05, + "loss": 0.020843583345413207, + "step": 89370 + }, + { + "epoch": 12.687012065294535, + "grad_norm": 0.7082682847976685, + "learning_rate": 8.731838183108588e-05, + "loss": 0.03357086181640625, + "step": 89380 + }, + { + "epoch": 12.688431511710434, + "grad_norm": 0.05603795126080513, + "learning_rate": 8.731696238466999e-05, + "loss": 0.052737241983413695, + "step": 89390 + }, + { + "epoch": 12.68985095812633, + "grad_norm": 0.9145916700363159, + "learning_rate": 8.731554293825408e-05, + "loss": 0.025367552042007448, + "step": 89400 + }, + { + "epoch": 12.691270404542228, + "grad_norm": 0.27008840441703796, + "learning_rate": 8.731412349183819e-05, + "loss": 0.026930320262908935, + "step": 89410 + }, + { + "epoch": 12.692689850958127, + "grad_norm": 0.9023039937019348, + "learning_rate": 8.731270404542228e-05, + "loss": 0.0035582900047302244, + "step": 89420 + }, + { + "epoch": 12.694109297374023, + "grad_norm": 2.4974489212036133, + "learning_rate": 8.73112845990064e-05, + "loss": 0.013577282428741455, + "step": 89430 + }, + { + "epoch": 12.695528743789922, + "grad_norm": 0.05072787031531334, + "learning_rate": 8.730986515259049e-05, + "loss": 0.010160622000694276, + "step": 89440 + }, + { + "epoch": 12.69694819020582, + "grad_norm": 0.5751616358757019, + "learning_rate": 8.730844570617459e-05, + "loss": 0.01841437667608261, + "step": 89450 + }, + { + "epoch": 12.698367636621718, + "grad_norm": 0.13704656064510345, + "learning_rate": 8.73070262597587e-05, + "loss": 0.014767895638942718, + "step": 89460 + }, + { + "epoch": 12.699787083037615, + "grad_norm": 1.5391329526901245, + "learning_rate": 8.73056068133428e-05, + "loss": 0.00814460813999176, + "step": 89470 + }, + { + "epoch": 12.701206529453513, + "grad_norm": 11.336458206176758, + "learning_rate": 8.730418736692691e-05, + "loss": 0.026091352105140686, + "step": 89480 + }, + { + "epoch": 12.702625975869411, + "grad_norm": 0.3561452329158783, + "learning_rate": 8.7302767920511e-05, + 
"loss": 0.009797683358192444, + "step": 89490 + }, + { + "epoch": 12.704045422285308, + "grad_norm": 4.071054935455322, + "learning_rate": 8.730134847409512e-05, + "loss": 0.008877287060022354, + "step": 89500 + }, + { + "epoch": 12.704045422285308, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.04127642139792442, + "eval_runtime": 34.1649, + "eval_samples_per_second": 460.326, + "eval_steps_per_second": 14.401, + "step": 89500 + }, + { + "epoch": 12.705464868701206, + "grad_norm": 1.4673656225204468, + "learning_rate": 8.72999290276792e-05, + "loss": 0.02310786545276642, + "step": 89510 + }, + { + "epoch": 12.706884315117104, + "grad_norm": 0.3841665983200073, + "learning_rate": 8.729850958126331e-05, + "loss": 0.0045328203588724135, + "step": 89520 + }, + { + "epoch": 12.708303761533003, + "grad_norm": 0.1576647162437439, + "learning_rate": 8.729709013484741e-05, + "loss": 0.022032007575035095, + "step": 89530 + }, + { + "epoch": 12.7097232079489, + "grad_norm": 1.5458643436431885, + "learning_rate": 8.729567068843152e-05, + "loss": 0.04079245924949646, + "step": 89540 + }, + { + "epoch": 12.711142654364798, + "grad_norm": 0.017013955861330032, + "learning_rate": 8.729425124201562e-05, + "loss": 0.035155534744262695, + "step": 89550 + }, + { + "epoch": 12.712562100780696, + "grad_norm": 0.8977913856506348, + "learning_rate": 8.729283179559971e-05, + "loss": 0.00961761102080345, + "step": 89560 + }, + { + "epoch": 12.713981547196592, + "grad_norm": 0.07090606540441513, + "learning_rate": 8.729141234918383e-05, + "loss": 0.03372899293899536, + "step": 89570 + }, + { + "epoch": 12.71540099361249, + "grad_norm": 0.10601519793272018, + "learning_rate": 8.728999290276792e-05, + "loss": 0.022131067514419556, + "step": 89580 + }, + { + "epoch": 12.716820440028389, + "grad_norm": 0.08834227919578552, + "learning_rate": 8.728857345635203e-05, + "loss": 0.004504304751753807, + "step": 89590 + }, + { + "epoch": 12.718239886444287, + "grad_norm": 0.6667763590812683, 
+ "learning_rate": 8.728715400993613e-05, + "loss": 0.03264138400554657, + "step": 89600 + }, + { + "epoch": 12.719659332860184, + "grad_norm": 0.06539282947778702, + "learning_rate": 8.728573456352023e-05, + "loss": 0.027134037017822264, + "step": 89610 + }, + { + "epoch": 12.721078779276082, + "grad_norm": 9.912678718566895, + "learning_rate": 8.728431511710433e-05, + "loss": 0.016111087799072266, + "step": 89620 + }, + { + "epoch": 12.72249822569198, + "grad_norm": 3.6912591457366943, + "learning_rate": 8.728289567068844e-05, + "loss": 0.03288787305355072, + "step": 89630 + }, + { + "epoch": 12.723917672107877, + "grad_norm": 1.6711541414260864, + "learning_rate": 8.728147622427253e-05, + "loss": 0.01833134740591049, + "step": 89640 + }, + { + "epoch": 12.725337118523775, + "grad_norm": 0.7487699389457703, + "learning_rate": 8.728005677785665e-05, + "loss": 0.031482148170471194, + "step": 89650 + }, + { + "epoch": 12.726756564939674, + "grad_norm": 11.18490219116211, + "learning_rate": 8.727863733144074e-05, + "loss": 0.025566044449806213, + "step": 89660 + }, + { + "epoch": 12.728176011355572, + "grad_norm": 0.023123400285840034, + "learning_rate": 8.727721788502484e-05, + "loss": 0.026927375793457033, + "step": 89670 + }, + { + "epoch": 12.729595457771469, + "grad_norm": 2.2637369632720947, + "learning_rate": 8.727579843860895e-05, + "loss": 0.0070426806807518, + "step": 89680 + }, + { + "epoch": 12.731014904187367, + "grad_norm": 1.263985276222229, + "learning_rate": 8.727437899219305e-05, + "loss": 0.015499216318130494, + "step": 89690 + }, + { + "epoch": 12.732434350603265, + "grad_norm": 0.1351531594991684, + "learning_rate": 8.727295954577716e-05, + "loss": 0.0721098005771637, + "step": 89700 + }, + { + "epoch": 12.733853797019162, + "grad_norm": 0.5390442609786987, + "learning_rate": 8.727154009936124e-05, + "loss": 0.007533705979585648, + "step": 89710 + }, + { + "epoch": 12.73527324343506, + "grad_norm": 2.3574366569519043, + "learning_rate": 
8.727012065294535e-05, + "loss": 0.010569178313016892, + "step": 89720 + }, + { + "epoch": 12.736692689850958, + "grad_norm": 0.09657580405473709, + "learning_rate": 8.726870120652945e-05, + "loss": 0.029726698994636536, + "step": 89730 + }, + { + "epoch": 12.738112136266857, + "grad_norm": 9.368902206420898, + "learning_rate": 8.726728176011356e-05, + "loss": 0.026808175444602966, + "step": 89740 + }, + { + "epoch": 12.739531582682753, + "grad_norm": 0.05883624777197838, + "learning_rate": 8.726586231369766e-05, + "loss": 0.009979787468910217, + "step": 89750 + }, + { + "epoch": 12.740951029098651, + "grad_norm": 0.6890395879745483, + "learning_rate": 8.726444286728176e-05, + "loss": 0.057667660713195804, + "step": 89760 + }, + { + "epoch": 12.74237047551455, + "grad_norm": 0.05285236984491348, + "learning_rate": 8.726302342086587e-05, + "loss": 0.05786900520324707, + "step": 89770 + }, + { + "epoch": 12.743789921930446, + "grad_norm": 0.22699399292469025, + "learning_rate": 8.726160397444997e-05, + "loss": 0.027164924144744872, + "step": 89780 + }, + { + "epoch": 12.745209368346345, + "grad_norm": 7.963822841644287, + "learning_rate": 8.726018452803408e-05, + "loss": 0.03848668932914734, + "step": 89790 + }, + { + "epoch": 12.746628814762243, + "grad_norm": 0.9460397958755493, + "learning_rate": 8.725876508161817e-05, + "loss": 0.030627134442329406, + "step": 89800 + }, + { + "epoch": 12.748048261178141, + "grad_norm": 16.650693893432617, + "learning_rate": 8.725734563520227e-05, + "loss": 0.038635873794555665, + "step": 89810 + }, + { + "epoch": 12.749467707594038, + "grad_norm": 0.7919514179229736, + "learning_rate": 8.725592618878637e-05, + "loss": 0.02418453395366669, + "step": 89820 + }, + { + "epoch": 12.750887154009936, + "grad_norm": 0.04367893561720848, + "learning_rate": 8.725450674237048e-05, + "loss": 0.013546989858150482, + "step": 89830 + }, + { + "epoch": 12.752306600425834, + "grad_norm": 6.330883979797363, + "learning_rate": 
8.725308729595458e-05, + "loss": 0.05271314382553101, + "step": 89840 + }, + { + "epoch": 12.75372604684173, + "grad_norm": 8.027472496032715, + "learning_rate": 8.725166784953869e-05, + "loss": 0.031959670782089236, + "step": 89850 + }, + { + "epoch": 12.75514549325763, + "grad_norm": 0.03651771694421768, + "learning_rate": 8.72502484031228e-05, + "loss": 0.03219501376152038, + "step": 89860 + }, + { + "epoch": 12.756564939673527, + "grad_norm": 0.16615909337997437, + "learning_rate": 8.724882895670688e-05, + "loss": 0.019113722443580627, + "step": 89870 + }, + { + "epoch": 12.757984386089426, + "grad_norm": 6.589572906494141, + "learning_rate": 8.724740951029099e-05, + "loss": 0.06304314136505126, + "step": 89880 + }, + { + "epoch": 12.759403832505322, + "grad_norm": 0.18990400433540344, + "learning_rate": 8.724599006387509e-05, + "loss": 0.03654749989509583, + "step": 89890 + }, + { + "epoch": 12.76082327892122, + "grad_norm": 0.21855756640434265, + "learning_rate": 8.72445706174592e-05, + "loss": 0.018128784000873567, + "step": 89900 + }, + { + "epoch": 12.762242725337119, + "grad_norm": 0.07131622731685638, + "learning_rate": 8.72431511710433e-05, + "loss": 0.043461188673973083, + "step": 89910 + }, + { + "epoch": 12.763662171753015, + "grad_norm": 0.7148534655570984, + "learning_rate": 8.72417317246274e-05, + "loss": 0.04350023567676544, + "step": 89920 + }, + { + "epoch": 12.765081618168914, + "grad_norm": 0.6358012557029724, + "learning_rate": 8.72403122782115e-05, + "loss": 0.02536094784736633, + "step": 89930 + }, + { + "epoch": 12.766501064584812, + "grad_norm": 5.968013286590576, + "learning_rate": 8.72388928317956e-05, + "loss": 0.06381597518920898, + "step": 89940 + }, + { + "epoch": 12.76792051100071, + "grad_norm": 5.589460849761963, + "learning_rate": 8.723747338537972e-05, + "loss": 0.014945511519908906, + "step": 89950 + }, + { + "epoch": 12.769339957416607, + "grad_norm": 0.3752530813217163, + "learning_rate": 8.723605393896381e-05, + "loss": 
0.02037852108478546, + "step": 89960 + }, + { + "epoch": 12.770759403832505, + "grad_norm": 8.791796684265137, + "learning_rate": 8.723463449254791e-05, + "loss": 0.03790717720985413, + "step": 89970 + }, + { + "epoch": 12.772178850248403, + "grad_norm": 0.8387459516525269, + "learning_rate": 8.723321504613201e-05, + "loss": 0.03979597389698029, + "step": 89980 + }, + { + "epoch": 12.7735982966643, + "grad_norm": 0.14796994626522064, + "learning_rate": 8.723179559971612e-05, + "loss": 0.02478184700012207, + "step": 89990 + }, + { + "epoch": 12.775017743080198, + "grad_norm": 12.8732271194458, + "learning_rate": 8.723037615330022e-05, + "loss": 0.03733752369880676, + "step": 90000 + }, + { + "epoch": 12.775017743080198, + "eval_accuracy": 0.9837222610796719, + "eval_loss": 0.053042687475681305, + "eval_runtime": 32.4327, + "eval_samples_per_second": 484.912, + "eval_steps_per_second": 15.17, + "step": 90000 + }, + { + "epoch": 12.776437189496097, + "grad_norm": 6.415987968444824, + "learning_rate": 8.722895670688433e-05, + "loss": 0.061280882358551024, + "step": 90010 + }, + { + "epoch": 12.777856635911995, + "grad_norm": 0.22661128640174866, + "learning_rate": 8.722753726046841e-05, + "loss": 0.03263997435569763, + "step": 90020 + }, + { + "epoch": 12.779276082327891, + "grad_norm": 5.485864639282227, + "learning_rate": 8.722611781405252e-05, + "loss": 0.027382856607437132, + "step": 90030 + }, + { + "epoch": 12.78069552874379, + "grad_norm": 2.0795586109161377, + "learning_rate": 8.722469836763663e-05, + "loss": 0.06238746643066406, + "step": 90040 + }, + { + "epoch": 12.782114975159688, + "grad_norm": 4.153426647186279, + "learning_rate": 8.722327892122073e-05, + "loss": 0.07989214658737183, + "step": 90050 + }, + { + "epoch": 12.783534421575585, + "grad_norm": 2.226921558380127, + "learning_rate": 8.722185947480484e-05, + "loss": 0.034956902265548706, + "step": 90060 + }, + { + "epoch": 12.784953867991483, + "grad_norm": 6.154031753540039, + "learning_rate": 
8.722044002838892e-05, + "loss": 0.028828498721122742, + "step": 90070 + }, + { + "epoch": 12.786373314407381, + "grad_norm": 0.2934417724609375, + "learning_rate": 8.721902058197304e-05, + "loss": 0.04316558837890625, + "step": 90080 + }, + { + "epoch": 12.78779276082328, + "grad_norm": 0.5022813677787781, + "learning_rate": 8.721760113555713e-05, + "loss": 0.021439780294895173, + "step": 90090 + }, + { + "epoch": 12.789212207239176, + "grad_norm": 0.060481056571006775, + "learning_rate": 8.721618168914124e-05, + "loss": 0.03436042368412018, + "step": 90100 + }, + { + "epoch": 12.790631653655074, + "grad_norm": 8.783007621765137, + "learning_rate": 8.721476224272534e-05, + "loss": 0.023770597577095032, + "step": 90110 + }, + { + "epoch": 12.792051100070973, + "grad_norm": 3.9978301525115967, + "learning_rate": 8.721334279630944e-05, + "loss": 0.01868281066417694, + "step": 90120 + }, + { + "epoch": 12.79347054648687, + "grad_norm": 0.25623878836631775, + "learning_rate": 8.721192334989355e-05, + "loss": 0.018424060940742493, + "step": 90130 + }, + { + "epoch": 12.794889992902768, + "grad_norm": 5.259542465209961, + "learning_rate": 8.721050390347765e-05, + "loss": 0.0191888228058815, + "step": 90140 + }, + { + "epoch": 12.796309439318666, + "grad_norm": 6.814998149871826, + "learning_rate": 8.720908445706176e-05, + "loss": 0.044765892624855044, + "step": 90150 + }, + { + "epoch": 12.797728885734564, + "grad_norm": 7.195475101470947, + "learning_rate": 8.720766501064586e-05, + "loss": 0.03070288598537445, + "step": 90160 + }, + { + "epoch": 12.79914833215046, + "grad_norm": 0.6905927658081055, + "learning_rate": 8.720624556422995e-05, + "loss": 0.018119293451309203, + "step": 90170 + }, + { + "epoch": 12.800567778566359, + "grad_norm": 0.650661051273346, + "learning_rate": 8.720482611781405e-05, + "loss": 0.027859440445899962, + "step": 90180 + }, + { + "epoch": 12.801987224982257, + "grad_norm": 0.24278753995895386, + "learning_rate": 8.720340667139816e-05, + 
"loss": 0.01636711657047272, + "step": 90190 + }, + { + "epoch": 12.803406671398154, + "grad_norm": 3.6955575942993164, + "learning_rate": 8.720198722498226e-05, + "loss": 0.018730704486370087, + "step": 90200 + }, + { + "epoch": 12.804826117814052, + "grad_norm": 0.05204196646809578, + "learning_rate": 8.720056777856637e-05, + "loss": 0.012679100036621094, + "step": 90210 + }, + { + "epoch": 12.80624556422995, + "grad_norm": 5.993483066558838, + "learning_rate": 8.719914833215047e-05, + "loss": 0.08045894503593445, + "step": 90220 + }, + { + "epoch": 12.807665010645849, + "grad_norm": 0.022297637537121773, + "learning_rate": 8.719772888573456e-05, + "loss": 0.004308861494064331, + "step": 90230 + }, + { + "epoch": 12.809084457061745, + "grad_norm": 1.582815408706665, + "learning_rate": 8.719630943931867e-05, + "loss": 0.01453346610069275, + "step": 90240 + }, + { + "epoch": 12.810503903477644, + "grad_norm": 0.013253006152808666, + "learning_rate": 8.719488999290277e-05, + "loss": 0.007113112509250641, + "step": 90250 + }, + { + "epoch": 12.811923349893542, + "grad_norm": 10.942405700683594, + "learning_rate": 8.719347054648688e-05, + "loss": 0.04160553216934204, + "step": 90260 + }, + { + "epoch": 12.813342796309438, + "grad_norm": 1.6827470064163208, + "learning_rate": 8.719205110007098e-05, + "loss": 0.018131645023822786, + "step": 90270 + }, + { + "epoch": 12.814762242725337, + "grad_norm": 15.465213775634766, + "learning_rate": 8.719063165365508e-05, + "loss": 0.025337904691696167, + "step": 90280 + }, + { + "epoch": 12.816181689141235, + "grad_norm": 0.231109157204628, + "learning_rate": 8.718921220723918e-05, + "loss": 0.02845522463321686, + "step": 90290 + }, + { + "epoch": 12.817601135557133, + "grad_norm": 0.13257861137390137, + "learning_rate": 8.718779276082329e-05, + "loss": 0.016551059484481812, + "step": 90300 + }, + { + "epoch": 12.81902058197303, + "grad_norm": 0.6941499710083008, + "learning_rate": 8.718637331440738e-05, + "loss": 
0.03151951730251312, + "step": 90310 + }, + { + "epoch": 12.820440028388928, + "grad_norm": 10.343978881835938, + "learning_rate": 8.71849538679915e-05, + "loss": 0.02398531138896942, + "step": 90320 + }, + { + "epoch": 12.821859474804826, + "grad_norm": 14.737664222717285, + "learning_rate": 8.718353442157559e-05, + "loss": 0.028624552488327026, + "step": 90330 + }, + { + "epoch": 12.823278921220723, + "grad_norm": 2.1050493717193604, + "learning_rate": 8.718211497515969e-05, + "loss": 0.07168601155281067, + "step": 90340 + }, + { + "epoch": 12.824698367636621, + "grad_norm": 6.589676856994629, + "learning_rate": 8.71806955287438e-05, + "loss": 0.045221933722496034, + "step": 90350 + }, + { + "epoch": 12.82611781405252, + "grad_norm": 1.8379383087158203, + "learning_rate": 8.71792760823279e-05, + "loss": 0.05340749621391296, + "step": 90360 + }, + { + "epoch": 12.827537260468418, + "grad_norm": 0.2016642987728119, + "learning_rate": 8.717785663591201e-05, + "loss": 0.01270725429058075, + "step": 90370 + }, + { + "epoch": 12.828956706884314, + "grad_norm": 3.9885146617889404, + "learning_rate": 8.717643718949609e-05, + "loss": 0.03886641561985016, + "step": 90380 + }, + { + "epoch": 12.830376153300213, + "grad_norm": 14.107388496398926, + "learning_rate": 8.71750177430802e-05, + "loss": 0.05452651977539062, + "step": 90390 + }, + { + "epoch": 12.831795599716111, + "grad_norm": 1.8865777254104614, + "learning_rate": 8.71735982966643e-05, + "loss": 0.047424548864364625, + "step": 90400 + }, + { + "epoch": 12.833215046132008, + "grad_norm": 0.6092913150787354, + "learning_rate": 8.717217885024841e-05, + "loss": 0.06444382071495056, + "step": 90410 + }, + { + "epoch": 12.834634492547906, + "grad_norm": 4.197598934173584, + "learning_rate": 8.717075940383251e-05, + "loss": 0.03637649714946747, + "step": 90420 + }, + { + "epoch": 12.836053938963804, + "grad_norm": 0.08683058619499207, + "learning_rate": 8.71693399574166e-05, + "loss": 0.05653232932090759, + "step": 90430 
+ }, + { + "epoch": 12.837473385379703, + "grad_norm": 0.11898249387741089, + "learning_rate": 8.716792051100072e-05, + "loss": 0.006302830576896667, + "step": 90440 + }, + { + "epoch": 12.838892831795599, + "grad_norm": 6.252366065979004, + "learning_rate": 8.716650106458481e-05, + "loss": 0.0305631160736084, + "step": 90450 + }, + { + "epoch": 12.840312278211497, + "grad_norm": 0.15573939681053162, + "learning_rate": 8.716508161816893e-05, + "loss": 0.01531701534986496, + "step": 90460 + }, + { + "epoch": 12.841731724627396, + "grad_norm": 3.1314682960510254, + "learning_rate": 8.716366217175302e-05, + "loss": 0.013777315616607666, + "step": 90470 + }, + { + "epoch": 12.843151171043292, + "grad_norm": 0.027007605880498886, + "learning_rate": 8.716224272533712e-05, + "loss": 0.038998952507972716, + "step": 90480 + }, + { + "epoch": 12.84457061745919, + "grad_norm": 2.1043355464935303, + "learning_rate": 8.716082327892122e-05, + "loss": 0.025404781103134155, + "step": 90490 + }, + { + "epoch": 12.845990063875089, + "grad_norm": 0.07467706501483917, + "learning_rate": 8.715940383250533e-05, + "loss": 0.006243685632944107, + "step": 90500 + }, + { + "epoch": 12.845990063875089, + "eval_accuracy": 0.9862656577859732, + "eval_loss": 0.047130122780799866, + "eval_runtime": 33.3807, + "eval_samples_per_second": 471.14, + "eval_steps_per_second": 14.739, + "step": 90500 + }, + { + "epoch": 12.847409510290987, + "grad_norm": 1.8158254623413086, + "learning_rate": 8.715798438608943e-05, + "loss": 0.010209519416093826, + "step": 90510 + }, + { + "epoch": 12.848828956706884, + "grad_norm": 2.3448336124420166, + "learning_rate": 8.715656493967354e-05, + "loss": 0.05426924824714661, + "step": 90520 + }, + { + "epoch": 12.850248403122782, + "grad_norm": 11.133930206298828, + "learning_rate": 8.715514549325763e-05, + "loss": 0.07704846858978272, + "step": 90530 + }, + { + "epoch": 12.85166784953868, + "grad_norm": 5.328191757202148, + "learning_rate": 8.715372604684173e-05, + 
"loss": 0.04808947145938873, + "step": 90540 + }, + { + "epoch": 12.853087295954577, + "grad_norm": 0.2553696036338806, + "learning_rate": 8.715230660042584e-05, + "loss": 0.0963461935520172, + "step": 90550 + }, + { + "epoch": 12.854506742370475, + "grad_norm": 6.152266979217529, + "learning_rate": 8.715088715400994e-05, + "loss": 0.04504083395004273, + "step": 90560 + }, + { + "epoch": 12.855926188786373, + "grad_norm": 0.06309587508440018, + "learning_rate": 8.714946770759405e-05, + "loss": 0.005174351111054421, + "step": 90570 + }, + { + "epoch": 12.857345635202272, + "grad_norm": 0.1511959433555603, + "learning_rate": 8.714804826117815e-05, + "loss": 0.014291897416114807, + "step": 90580 + }, + { + "epoch": 12.858765081618168, + "grad_norm": 13.10915756225586, + "learning_rate": 8.714662881476225e-05, + "loss": 0.04409765303134918, + "step": 90590 + }, + { + "epoch": 12.860184528034067, + "grad_norm": 1.8174325227737427, + "learning_rate": 8.714520936834634e-05, + "loss": 0.011032898724079133, + "step": 90600 + }, + { + "epoch": 12.861603974449965, + "grad_norm": 0.5066937208175659, + "learning_rate": 8.714378992193045e-05, + "loss": 0.01093606948852539, + "step": 90610 + }, + { + "epoch": 12.863023420865863, + "grad_norm": 8.572830200195312, + "learning_rate": 8.714237047551455e-05, + "loss": 0.057040858268737796, + "step": 90620 + }, + { + "epoch": 12.86444286728176, + "grad_norm": 0.1343429684638977, + "learning_rate": 8.714109297374025e-05, + "loss": 0.024809937179088592, + "step": 90630 + }, + { + "epoch": 12.865862313697658, + "grad_norm": 1.4768531322479248, + "learning_rate": 8.713967352732435e-05, + "loss": 0.029907482862472533, + "step": 90640 + }, + { + "epoch": 12.867281760113556, + "grad_norm": 8.033143997192383, + "learning_rate": 8.713825408090846e-05, + "loss": 0.030059036612510682, + "step": 90650 + }, + { + "epoch": 12.868701206529453, + "grad_norm": 0.09025631844997406, + "learning_rate": 8.713683463449254e-05, + "loss": 0.01888950914144516, 
+ "step": 90660 + }, + { + "epoch": 12.870120652945351, + "grad_norm": 0.6663010716438293, + "learning_rate": 8.713541518807665e-05, + "loss": 0.018822093307971955, + "step": 90670 + }, + { + "epoch": 12.87154009936125, + "grad_norm": 0.09493912756443024, + "learning_rate": 8.713399574166075e-05, + "loss": 0.042925047874450686, + "step": 90680 + }, + { + "epoch": 12.872959545777148, + "grad_norm": 0.49431559443473816, + "learning_rate": 8.713257629524486e-05, + "loss": 0.041593000292778015, + "step": 90690 + }, + { + "epoch": 12.874378992193044, + "grad_norm": 1.4785473346710205, + "learning_rate": 8.713115684882897e-05, + "loss": 0.015245826542377472, + "step": 90700 + }, + { + "epoch": 12.875798438608943, + "grad_norm": 10.384889602661133, + "learning_rate": 8.712973740241306e-05, + "loss": 0.027532801032066345, + "step": 90710 + }, + { + "epoch": 12.87721788502484, + "grad_norm": 0.7916569113731384, + "learning_rate": 8.712831795599717e-05, + "loss": 0.020438848435878752, + "step": 90720 + }, + { + "epoch": 12.878637331440737, + "grad_norm": 0.20680011808872223, + "learning_rate": 8.712689850958126e-05, + "loss": 0.018569478392601015, + "step": 90730 + }, + { + "epoch": 12.880056777856636, + "grad_norm": 4.336511135101318, + "learning_rate": 8.712547906316538e-05, + "loss": 0.012251495569944381, + "step": 90740 + }, + { + "epoch": 12.881476224272534, + "grad_norm": 11.582043647766113, + "learning_rate": 8.712405961674947e-05, + "loss": 0.02740319073200226, + "step": 90750 + }, + { + "epoch": 12.882895670688432, + "grad_norm": 1.1124207973480225, + "learning_rate": 8.712264017033357e-05, + "loss": 0.027994123101234437, + "step": 90760 + }, + { + "epoch": 12.884315117104329, + "grad_norm": 4.376797676086426, + "learning_rate": 8.712122072391767e-05, + "loss": 0.017722992599010466, + "step": 90770 + }, + { + "epoch": 12.885734563520227, + "grad_norm": 1.3260844945907593, + "learning_rate": 8.711980127750178e-05, + "loss": 0.025655841827392577, + "step": 90780 + }, 
+ { + "epoch": 12.887154009936125, + "grad_norm": 0.5041131973266602, + "learning_rate": 8.711838183108589e-05, + "loss": 0.02144547700881958, + "step": 90790 + }, + { + "epoch": 12.888573456352022, + "grad_norm": 0.06130081042647362, + "learning_rate": 8.711696238466999e-05, + "loss": 0.004414296522736549, + "step": 90800 + }, + { + "epoch": 12.88999290276792, + "grad_norm": 0.38922789692878723, + "learning_rate": 8.711554293825408e-05, + "loss": 0.0188855916261673, + "step": 90810 + }, + { + "epoch": 12.891412349183819, + "grad_norm": 7.74660062789917, + "learning_rate": 8.711412349183818e-05, + "loss": 0.035584890842437746, + "step": 90820 + }, + { + "epoch": 12.892831795599717, + "grad_norm": 0.9597287774085999, + "learning_rate": 8.711270404542229e-05, + "loss": 0.04100579619407654, + "step": 90830 + }, + { + "epoch": 12.894251242015613, + "grad_norm": 8.280255317687988, + "learning_rate": 8.711128459900639e-05, + "loss": 0.022879604995250703, + "step": 90840 + }, + { + "epoch": 12.895670688431512, + "grad_norm": 0.06499747931957245, + "learning_rate": 8.71098651525905e-05, + "loss": 0.0381752610206604, + "step": 90850 + }, + { + "epoch": 12.89709013484741, + "grad_norm": 0.22900985181331635, + "learning_rate": 8.710844570617458e-05, + "loss": 0.01734771877527237, + "step": 90860 + }, + { + "epoch": 12.898509581263307, + "grad_norm": 2.9776017665863037, + "learning_rate": 8.71070262597587e-05, + "loss": 0.050455224514007566, + "step": 90870 + }, + { + "epoch": 12.899929027679205, + "grad_norm": 0.06448038667440414, + "learning_rate": 8.71056068133428e-05, + "loss": 0.03693827986717224, + "step": 90880 + }, + { + "epoch": 12.901348474095103, + "grad_norm": 0.2390090674161911, + "learning_rate": 8.71041873669269e-05, + "loss": 0.014416129887104034, + "step": 90890 + }, + { + "epoch": 12.902767920511002, + "grad_norm": 1.1335889101028442, + "learning_rate": 8.710276792051101e-05, + "loss": 0.08491934537887573, + "step": 90900 + }, + { + "epoch": 
12.904187366926898, + "grad_norm": 1.1026262044906616, + "learning_rate": 8.710134847409511e-05, + "loss": 0.01418365240097046, + "step": 90910 + }, + { + "epoch": 12.905606813342796, + "grad_norm": 1.0436301231384277, + "learning_rate": 8.709992902767921e-05, + "loss": 0.017918017506599427, + "step": 90920 + }, + { + "epoch": 12.907026259758695, + "grad_norm": 5.441536903381348, + "learning_rate": 8.70985095812633e-05, + "loss": 0.012655892968177795, + "step": 90930 + }, + { + "epoch": 12.908445706174591, + "grad_norm": 0.10456965863704681, + "learning_rate": 8.709709013484742e-05, + "loss": 0.015247131884098052, + "step": 90940 + }, + { + "epoch": 12.90986515259049, + "grad_norm": 0.014488224871456623, + "learning_rate": 8.709567068843151e-05, + "loss": 0.01566433012485504, + "step": 90950 + }, + { + "epoch": 12.911284599006388, + "grad_norm": 0.009824546054005623, + "learning_rate": 8.709425124201563e-05, + "loss": 0.003042099252343178, + "step": 90960 + }, + { + "epoch": 12.912704045422286, + "grad_norm": 0.9563096165657043, + "learning_rate": 8.709283179559972e-05, + "loss": 0.03195181488990784, + "step": 90970 + }, + { + "epoch": 12.914123491838183, + "grad_norm": 13.064339637756348, + "learning_rate": 8.709141234918382e-05, + "loss": 0.023437163233757018, + "step": 90980 + }, + { + "epoch": 12.915542938254081, + "grad_norm": 0.026108302175998688, + "learning_rate": 8.708999290276793e-05, + "loss": 0.007652238756418228, + "step": 90990 + }, + { + "epoch": 12.91696238466998, + "grad_norm": 0.28071707487106323, + "learning_rate": 8.708857345635203e-05, + "loss": 0.05530444383621216, + "step": 91000 + }, + { + "epoch": 12.91696238466998, + "eval_accuracy": 0.9828320722324665, + "eval_loss": 0.0642710030078888, + "eval_runtime": 32.7388, + "eval_samples_per_second": 480.377, + "eval_steps_per_second": 15.028, + "step": 91000 + }, + { + "epoch": 12.918381831085876, + "grad_norm": 15.131595611572266, + "learning_rate": 8.708715400993614e-05, + "loss": 
0.06838587522506714, + "step": 91010 + }, + { + "epoch": 12.919801277501774, + "grad_norm": 1.486463189125061, + "learning_rate": 8.708573456352022e-05, + "loss": 0.018643665313720702, + "step": 91020 + }, + { + "epoch": 12.921220723917672, + "grad_norm": 2.5784308910369873, + "learning_rate": 8.708431511710433e-05, + "loss": 0.019171588122844696, + "step": 91030 + }, + { + "epoch": 12.92264017033357, + "grad_norm": 0.051978398114442825, + "learning_rate": 8.708289567068843e-05, + "loss": 0.02243105471134186, + "step": 91040 + }, + { + "epoch": 12.924059616749467, + "grad_norm": 0.61635822057724, + "learning_rate": 8.708147622427254e-05, + "loss": 0.002178187668323517, + "step": 91050 + }, + { + "epoch": 12.925479063165366, + "grad_norm": 12.922770500183105, + "learning_rate": 8.708005677785664e-05, + "loss": 0.09026901125907898, + "step": 91060 + }, + { + "epoch": 12.926898509581264, + "grad_norm": 7.460283279418945, + "learning_rate": 8.707863733144074e-05, + "loss": 0.06489250659942628, + "step": 91070 + }, + { + "epoch": 12.92831795599716, + "grad_norm": 7.113394737243652, + "learning_rate": 8.707721788502485e-05, + "loss": 0.047018444538116454, + "step": 91080 + }, + { + "epoch": 12.929737402413059, + "grad_norm": 0.26584914326667786, + "learning_rate": 8.707579843860895e-05, + "loss": 0.028078389167785645, + "step": 91090 + }, + { + "epoch": 12.931156848828957, + "grad_norm": 0.032915230840444565, + "learning_rate": 8.707437899219306e-05, + "loss": 0.04310223460197449, + "step": 91100 + }, + { + "epoch": 12.932576295244855, + "grad_norm": 0.1749131828546524, + "learning_rate": 8.707295954577715e-05, + "loss": 0.02489803433418274, + "step": 91110 + }, + { + "epoch": 12.933995741660752, + "grad_norm": 0.0382566824555397, + "learning_rate": 8.707154009936125e-05, + "loss": 0.04579323828220368, + "step": 91120 + }, + { + "epoch": 12.93541518807665, + "grad_norm": 0.15481582283973694, + "learning_rate": 8.707012065294535e-05, + "loss": 0.02880704402923584, + 
"step": 91130 + }, + { + "epoch": 12.936834634492548, + "grad_norm": 0.1466173231601715, + "learning_rate": 8.706870120652946e-05, + "loss": 0.0475196361541748, + "step": 91140 + }, + { + "epoch": 12.938254080908445, + "grad_norm": 0.21036475896835327, + "learning_rate": 8.706728176011356e-05, + "loss": 0.020145165920257568, + "step": 91150 + }, + { + "epoch": 12.939673527324343, + "grad_norm": 6.4919657707214355, + "learning_rate": 8.706586231369767e-05, + "loss": 0.03438344597816467, + "step": 91160 + }, + { + "epoch": 12.941092973740242, + "grad_norm": 0.09811893850564957, + "learning_rate": 8.706444286728177e-05, + "loss": 0.003525005280971527, + "step": 91170 + }, + { + "epoch": 12.94251242015614, + "grad_norm": 8.104255676269531, + "learning_rate": 8.706302342086586e-05, + "loss": 0.0142546147108078, + "step": 91180 + }, + { + "epoch": 12.943931866572036, + "grad_norm": 2.091709613800049, + "learning_rate": 8.706160397444997e-05, + "loss": 0.00975591540336609, + "step": 91190 + }, + { + "epoch": 12.945351312987935, + "grad_norm": 0.7164672613143921, + "learning_rate": 8.706018452803407e-05, + "loss": 0.018023268878459932, + "step": 91200 + }, + { + "epoch": 12.946770759403833, + "grad_norm": 0.48031216859817505, + "learning_rate": 8.705876508161818e-05, + "loss": 0.02688279151916504, + "step": 91210 + }, + { + "epoch": 12.94819020581973, + "grad_norm": 0.22432000935077667, + "learning_rate": 8.705734563520227e-05, + "loss": 0.06052901148796082, + "step": 91220 + }, + { + "epoch": 12.949609652235628, + "grad_norm": 0.17801009118556976, + "learning_rate": 8.705592618878638e-05, + "loss": 0.024128471314907075, + "step": 91230 + }, + { + "epoch": 12.951029098651526, + "grad_norm": 0.3342018723487854, + "learning_rate": 8.705450674237047e-05, + "loss": 0.022981099784374237, + "step": 91240 + }, + { + "epoch": 12.952448545067424, + "grad_norm": 0.14481079578399658, + "learning_rate": 8.705308729595459e-05, + "loss": 0.01762939989566803, + "step": 91250 + }, + { + 
"epoch": 12.953867991483321, + "grad_norm": 1.8251750469207764, + "learning_rate": 8.705166784953868e-05, + "loss": 0.06004202365875244, + "step": 91260 + }, + { + "epoch": 12.95528743789922, + "grad_norm": 0.30406421422958374, + "learning_rate": 8.70502484031228e-05, + "loss": 0.017629969120025634, + "step": 91270 + }, + { + "epoch": 12.956706884315118, + "grad_norm": 0.018968552350997925, + "learning_rate": 8.704882895670689e-05, + "loss": 0.03772530853748322, + "step": 91280 + }, + { + "epoch": 12.958126330731014, + "grad_norm": 0.34433281421661377, + "learning_rate": 8.704740951029099e-05, + "loss": 0.010790017247200013, + "step": 91290 + }, + { + "epoch": 12.959545777146912, + "grad_norm": 7.834738731384277, + "learning_rate": 8.70459900638751e-05, + "loss": 0.02302001416683197, + "step": 91300 + }, + { + "epoch": 12.96096522356281, + "grad_norm": 1.474596619606018, + "learning_rate": 8.70445706174592e-05, + "loss": 0.020346474647521973, + "step": 91310 + }, + { + "epoch": 12.962384669978709, + "grad_norm": 0.15759314596652985, + "learning_rate": 8.704315117104331e-05, + "loss": 0.04993346631526947, + "step": 91320 + }, + { + "epoch": 12.963804116394606, + "grad_norm": 0.3664032518863678, + "learning_rate": 8.704173172462739e-05, + "loss": 0.023996689915657045, + "step": 91330 + }, + { + "epoch": 12.965223562810504, + "grad_norm": 2.838346004486084, + "learning_rate": 8.70403122782115e-05, + "loss": 0.031538930535316465, + "step": 91340 + }, + { + "epoch": 12.966643009226402, + "grad_norm": 6.921056270599365, + "learning_rate": 8.70388928317956e-05, + "loss": 0.08351738452911377, + "step": 91350 + }, + { + "epoch": 12.968062455642299, + "grad_norm": 7.643664360046387, + "learning_rate": 8.703747338537971e-05, + "loss": 0.03136724233627319, + "step": 91360 + }, + { + "epoch": 12.969481902058197, + "grad_norm": 0.6630023121833801, + "learning_rate": 8.703605393896381e-05, + "loss": 0.018689344823360442, + "step": 91370 + }, + { + "epoch": 12.970901348474095, + 
"grad_norm": 1.5902106761932373, + "learning_rate": 8.70346344925479e-05, + "loss": 0.06974080801010132, + "step": 91380 + }, + { + "epoch": 12.972320794889994, + "grad_norm": 0.12607437372207642, + "learning_rate": 8.703321504613202e-05, + "loss": 0.033649593591690063, + "step": 91390 + }, + { + "epoch": 12.97374024130589, + "grad_norm": 6.137654781341553, + "learning_rate": 8.703179559971611e-05, + "loss": 0.021195295453071594, + "step": 91400 + }, + { + "epoch": 12.975159687721789, + "grad_norm": 0.14958131313323975, + "learning_rate": 8.703037615330022e-05, + "loss": 0.05002725124359131, + "step": 91410 + }, + { + "epoch": 12.976579134137687, + "grad_norm": 2.645599365234375, + "learning_rate": 8.702895670688432e-05, + "loss": 0.01684072017669678, + "step": 91420 + }, + { + "epoch": 12.977998580553583, + "grad_norm": 3.953803062438965, + "learning_rate": 8.702753726046842e-05, + "loss": 0.058473384380340575, + "step": 91430 + }, + { + "epoch": 12.979418026969482, + "grad_norm": 0.7704017162322998, + "learning_rate": 8.702611781405252e-05, + "loss": 0.03435961604118347, + "step": 91440 + }, + { + "epoch": 12.98083747338538, + "grad_norm": 0.45102816820144653, + "learning_rate": 8.702469836763663e-05, + "loss": 0.02208094298839569, + "step": 91450 + }, + { + "epoch": 12.982256919801278, + "grad_norm": 0.9969324469566345, + "learning_rate": 8.702327892122072e-05, + "loss": 0.04234072864055634, + "step": 91460 + }, + { + "epoch": 12.983676366217175, + "grad_norm": 0.06572148203849792, + "learning_rate": 8.702185947480484e-05, + "loss": 0.04454489350318909, + "step": 91470 + }, + { + "epoch": 12.985095812633073, + "grad_norm": 4.023639678955078, + "learning_rate": 8.702044002838893e-05, + "loss": 0.034791293740272525, + "step": 91480 + }, + { + "epoch": 12.986515259048971, + "grad_norm": 4.597278118133545, + "learning_rate": 8.701902058197303e-05, + "loss": 0.015966880321502685, + "step": 91490 + }, + { + "epoch": 12.987934705464868, + "grad_norm": 
0.13566672801971436, + "learning_rate": 8.701760113555714e-05, + "loss": 0.02301221191883087, + "step": 91500 + }, + { + "epoch": 12.987934705464868, + "eval_accuracy": 0.9842309404209322, + "eval_loss": 0.052828311920166016, + "eval_runtime": 32.743, + "eval_samples_per_second": 480.316, + "eval_steps_per_second": 15.026, + "step": 91500 + }, + { + "epoch": 12.989354151880766, + "grad_norm": 0.3484897017478943, + "learning_rate": 8.701618168914124e-05, + "loss": 0.04626628756523132, + "step": 91510 + }, + { + "epoch": 12.990773598296665, + "grad_norm": 0.4147621989250183, + "learning_rate": 8.701476224272535e-05, + "loss": 0.008229909092187881, + "step": 91520 + }, + { + "epoch": 12.992193044712563, + "grad_norm": 0.07728651911020279, + "learning_rate": 8.701334279630943e-05, + "loss": 0.011543260514736175, + "step": 91530 + }, + { + "epoch": 12.99361249112846, + "grad_norm": 0.470676988363266, + "learning_rate": 8.701192334989354e-05, + "loss": 0.030454790592193602, + "step": 91540 + }, + { + "epoch": 12.995031937544358, + "grad_norm": 15.090569496154785, + "learning_rate": 8.701050390347764e-05, + "loss": 0.03786148726940155, + "step": 91550 + }, + { + "epoch": 12.996451383960256, + "grad_norm": 3.7643697261810303, + "learning_rate": 8.700908445706175e-05, + "loss": 0.01893087774515152, + "step": 91560 + }, + { + "epoch": 12.997870830376153, + "grad_norm": 0.3498643636703491, + "learning_rate": 8.700766501064585e-05, + "loss": 0.006908781081438065, + "step": 91570 + }, + { + "epoch": 12.99929027679205, + "grad_norm": 14.163825035095215, + "learning_rate": 8.700624556422995e-05, + "loss": 0.04151703715324402, + "step": 91580 + }, + { + "epoch": 13.00070972320795, + "grad_norm": 16.969526290893555, + "learning_rate": 8.700482611781406e-05, + "loss": 0.056925737857818605, + "step": 91590 + }, + { + "epoch": 13.002129169623847, + "grad_norm": 0.5581228137016296, + "learning_rate": 8.700340667139816e-05, + "loss": 0.008616255968809128, + "step": 91600 + }, + { + 
"epoch": 13.003548616039744, + "grad_norm": 1.8690402507781982, + "learning_rate": 8.700198722498227e-05, + "loss": 0.015176343917846679, + "step": 91610 + }, + { + "epoch": 13.004968062455642, + "grad_norm": 0.2172848880290985, + "learning_rate": 8.700056777856636e-05, + "loss": 0.012149479985237122, + "step": 91620 + }, + { + "epoch": 13.00638750887154, + "grad_norm": 0.11255660653114319, + "learning_rate": 8.699914833215048e-05, + "loss": 0.030363094806671143, + "step": 91630 + }, + { + "epoch": 13.007806955287437, + "grad_norm": 1.7092961072921753, + "learning_rate": 8.699772888573456e-05, + "loss": 0.010361893475055695, + "step": 91640 + }, + { + "epoch": 13.009226401703335, + "grad_norm": 0.0253172367811203, + "learning_rate": 8.699630943931867e-05, + "loss": 0.026502841711044313, + "step": 91650 + }, + { + "epoch": 13.010645848119234, + "grad_norm": 1.3576223850250244, + "learning_rate": 8.699488999290277e-05, + "loss": 0.019933232665061952, + "step": 91660 + }, + { + "epoch": 13.012065294535132, + "grad_norm": 0.20457276701927185, + "learning_rate": 8.699347054648688e-05, + "loss": 0.041373640298843384, + "step": 91670 + }, + { + "epoch": 13.013484740951029, + "grad_norm": 0.9398382306098938, + "learning_rate": 8.699205110007098e-05, + "loss": 0.029924097657203674, + "step": 91680 + }, + { + "epoch": 13.014904187366927, + "grad_norm": 0.24676869809627533, + "learning_rate": 8.699063165365507e-05, + "loss": 0.02562679350376129, + "step": 91690 + }, + { + "epoch": 13.016323633782825, + "grad_norm": 0.5171564221382141, + "learning_rate": 8.698921220723918e-05, + "loss": 0.0322356641292572, + "step": 91700 + }, + { + "epoch": 13.017743080198722, + "grad_norm": 3.6011831760406494, + "learning_rate": 8.698779276082328e-05, + "loss": 0.010308434069156647, + "step": 91710 + }, + { + "epoch": 13.01916252661462, + "grad_norm": 6.4173808097839355, + "learning_rate": 8.698637331440739e-05, + "loss": 0.031694459915161136, + "step": 91720 + }, + { + "epoch": 
13.020581973030518, + "grad_norm": 3.998835563659668, + "learning_rate": 8.698495386799149e-05, + "loss": 0.0390471339225769, + "step": 91730 + }, + { + "epoch": 13.022001419446417, + "grad_norm": 0.6538220643997192, + "learning_rate": 8.698353442157559e-05, + "loss": 0.02336165904998779, + "step": 91740 + }, + { + "epoch": 13.023420865862313, + "grad_norm": 0.06361285597085953, + "learning_rate": 8.698211497515968e-05, + "loss": 0.028032290935516357, + "step": 91750 + }, + { + "epoch": 13.024840312278211, + "grad_norm": 0.8600550889968872, + "learning_rate": 8.69806955287438e-05, + "loss": 0.03384045958518982, + "step": 91760 + }, + { + "epoch": 13.02625975869411, + "grad_norm": 5.134207248687744, + "learning_rate": 8.697927608232789e-05, + "loss": 0.014662571251392365, + "step": 91770 + }, + { + "epoch": 13.027679205110006, + "grad_norm": 3.7682700157165527, + "learning_rate": 8.6977856635912e-05, + "loss": 0.013019509613513947, + "step": 91780 + }, + { + "epoch": 13.029098651525905, + "grad_norm": 0.17727281153202057, + "learning_rate": 8.69764371894961e-05, + "loss": 0.008013653755187988, + "step": 91790 + }, + { + "epoch": 13.030518097941803, + "grad_norm": 0.04075111076235771, + "learning_rate": 8.69750177430802e-05, + "loss": 0.019158007204532625, + "step": 91800 + }, + { + "epoch": 13.031937544357701, + "grad_norm": 5.8046722412109375, + "learning_rate": 8.697359829666431e-05, + "loss": 0.03163691759109497, + "step": 91810 + }, + { + "epoch": 13.033356990773598, + "grad_norm": 6.866926193237305, + "learning_rate": 8.69721788502484e-05, + "loss": 0.016943201422691345, + "step": 91820 + }, + { + "epoch": 13.034776437189496, + "grad_norm": 1.013522744178772, + "learning_rate": 8.697075940383252e-05, + "loss": 0.019465875625610352, + "step": 91830 + }, + { + "epoch": 13.036195883605394, + "grad_norm": 4.783105373382568, + "learning_rate": 8.69693399574166e-05, + "loss": 0.026390090584754944, + "step": 91840 + }, + { + "epoch": 13.037615330021291, + "grad_norm": 
7.49722146987915, + "learning_rate": 8.696792051100071e-05, + "loss": 0.029819828271865845, + "step": 91850 + }, + { + "epoch": 13.03903477643719, + "grad_norm": 0.009413005784153938, + "learning_rate": 8.696650106458481e-05, + "loss": 0.013858243823051453, + "step": 91860 + }, + { + "epoch": 13.040454222853088, + "grad_norm": 9.378253936767578, + "learning_rate": 8.696508161816892e-05, + "loss": 0.02688908874988556, + "step": 91870 + }, + { + "epoch": 13.041873669268986, + "grad_norm": 1.6610782146453857, + "learning_rate": 8.696366217175302e-05, + "loss": 0.041138678789138794, + "step": 91880 + }, + { + "epoch": 13.043293115684882, + "grad_norm": 0.13649091124534607, + "learning_rate": 8.696224272533712e-05, + "loss": 0.009144684672355652, + "step": 91890 + }, + { + "epoch": 13.04471256210078, + "grad_norm": 0.15896634757518768, + "learning_rate": 8.696082327892123e-05, + "loss": 0.0353781670331955, + "step": 91900 + }, + { + "epoch": 13.046132008516679, + "grad_norm": 8.799978256225586, + "learning_rate": 8.695940383250532e-05, + "loss": 0.03496268391609192, + "step": 91910 + }, + { + "epoch": 13.047551454932576, + "grad_norm": 2.411184310913086, + "learning_rate": 8.695798438608943e-05, + "loss": 0.022661095857620238, + "step": 91920 + }, + { + "epoch": 13.048970901348474, + "grad_norm": 4.598940372467041, + "learning_rate": 8.695656493967353e-05, + "loss": 0.034996187686920165, + "step": 91930 + }, + { + "epoch": 13.050390347764372, + "grad_norm": 1.2868263721466064, + "learning_rate": 8.695514549325763e-05, + "loss": 0.06741084456443787, + "step": 91940 + }, + { + "epoch": 13.05180979418027, + "grad_norm": 5.6947832107543945, + "learning_rate": 8.695372604684173e-05, + "loss": 0.03682016432285309, + "step": 91950 + }, + { + "epoch": 13.053229240596167, + "grad_norm": 3.202509880065918, + "learning_rate": 8.695230660042584e-05, + "loss": 0.015489163994789123, + "step": 91960 + }, + { + "epoch": 13.054648687012065, + "grad_norm": 0.07919025421142578, + 
"learning_rate": 8.695088715400994e-05, + "loss": 0.0048958022147417065, + "step": 91970 + }, + { + "epoch": 13.056068133427964, + "grad_norm": 0.06812314689159393, + "learning_rate": 8.694946770759405e-05, + "loss": 0.02271898239850998, + "step": 91980 + }, + { + "epoch": 13.05748757984386, + "grad_norm": 1.773010015487671, + "learning_rate": 8.694804826117814e-05, + "loss": 0.024782709777355194, + "step": 91990 + }, + { + "epoch": 13.058907026259758, + "grad_norm": 0.07601115107536316, + "learning_rate": 8.694662881476224e-05, + "loss": 0.025223162770271302, + "step": 92000 + }, + { + "epoch": 13.058907026259758, + "eval_accuracy": 0.9814332040440008, + "eval_loss": 0.06536781042814255, + "eval_runtime": 31.0818, + "eval_samples_per_second": 505.988, + "eval_steps_per_second": 15.829, + "step": 92000 + }, + { + "epoch": 13.060326472675657, + "grad_norm": 0.6883729100227356, + "learning_rate": 8.694520936834635e-05, + "loss": 0.02039031982421875, + "step": 92010 + }, + { + "epoch": 13.061745919091555, + "grad_norm": 0.024349577724933624, + "learning_rate": 8.694378992193045e-05, + "loss": 0.016295403242111206, + "step": 92020 + }, + { + "epoch": 13.063165365507452, + "grad_norm": 0.008295398205518723, + "learning_rate": 8.694237047551456e-05, + "loss": 0.010328300297260284, + "step": 92030 + }, + { + "epoch": 13.06458481192335, + "grad_norm": 0.0963822677731514, + "learning_rate": 8.694095102909866e-05, + "loss": 0.03061448335647583, + "step": 92040 + }, + { + "epoch": 13.066004258339248, + "grad_norm": 5.504027843475342, + "learning_rate": 8.693953158268275e-05, + "loss": 0.10789980888366699, + "step": 92050 + }, + { + "epoch": 13.067423704755145, + "grad_norm": 7.110039710998535, + "learning_rate": 8.693811213626685e-05, + "loss": 0.03994630575180054, + "step": 92060 + }, + { + "epoch": 13.068843151171043, + "grad_norm": 0.028146283701062202, + "learning_rate": 8.693669268985096e-05, + "loss": 0.013843922317028046, + "step": 92070 + }, + { + "epoch": 
13.070262597586941, + "grad_norm": 0.307533860206604, + "learning_rate": 8.693527324343506e-05, + "loss": 0.010846273601055145, + "step": 92080 + }, + { + "epoch": 13.07168204400284, + "grad_norm": 0.11381033062934875, + "learning_rate": 8.693385379701917e-05, + "loss": 0.08111209273338318, + "step": 92090 + }, + { + "epoch": 13.073101490418736, + "grad_norm": 0.8268828988075256, + "learning_rate": 8.693243435060327e-05, + "loss": 0.0200673907995224, + "step": 92100 + }, + { + "epoch": 13.074520936834634, + "grad_norm": 0.2431095540523529, + "learning_rate": 8.693101490418737e-05, + "loss": 0.01655070036649704, + "step": 92110 + }, + { + "epoch": 13.075940383250533, + "grad_norm": 2.5469141006469727, + "learning_rate": 8.692959545777148e-05, + "loss": 0.014377766847610473, + "step": 92120 + }, + { + "epoch": 13.07735982966643, + "grad_norm": 5.483829975128174, + "learning_rate": 8.692817601135557e-05, + "loss": 0.05107632875442505, + "step": 92130 + }, + { + "epoch": 13.078779276082328, + "grad_norm": 1.2713391780853271, + "learning_rate": 8.692675656493969e-05, + "loss": 0.021682539582252504, + "step": 92140 + }, + { + "epoch": 13.080198722498226, + "grad_norm": 6.252452850341797, + "learning_rate": 8.692533711852377e-05, + "loss": 0.04136396050453186, + "step": 92150 + }, + { + "epoch": 13.081618168914124, + "grad_norm": 0.012155876494944096, + "learning_rate": 8.692391767210788e-05, + "loss": 0.029287290573120118, + "step": 92160 + }, + { + "epoch": 13.08303761533002, + "grad_norm": 0.014887611381709576, + "learning_rate": 8.692249822569198e-05, + "loss": 0.012705713510513306, + "step": 92170 + }, + { + "epoch": 13.084457061745919, + "grad_norm": 0.02529442124068737, + "learning_rate": 8.692107877927609e-05, + "loss": 0.020171231031417845, + "step": 92180 + }, + { + "epoch": 13.085876508161817, + "grad_norm": 1.0245634317398071, + "learning_rate": 8.69196593328602e-05, + "loss": 0.019703012704849244, + "step": 92190 + }, + { + "epoch": 13.087295954577714, + 
"grad_norm": 1.0482151508331299, + "learning_rate": 8.691823988644428e-05, + "loss": 0.05168004035949707, + "step": 92200 + }, + { + "epoch": 13.088715400993612, + "grad_norm": 0.7686527967453003, + "learning_rate": 8.69168204400284e-05, + "loss": 0.007350246608257294, + "step": 92210 + }, + { + "epoch": 13.09013484740951, + "grad_norm": 0.020817680284380913, + "learning_rate": 8.691540099361249e-05, + "loss": 0.038484251499176024, + "step": 92220 + }, + { + "epoch": 13.091554293825409, + "grad_norm": 11.199978828430176, + "learning_rate": 8.69139815471966e-05, + "loss": 0.04380442500114441, + "step": 92230 + }, + { + "epoch": 13.092973740241305, + "grad_norm": 0.44425156712532043, + "learning_rate": 8.69125621007807e-05, + "loss": 0.05576305985450745, + "step": 92240 + }, + { + "epoch": 13.094393186657204, + "grad_norm": 0.09422028064727783, + "learning_rate": 8.69111426543648e-05, + "loss": 0.01708393394947052, + "step": 92250 + }, + { + "epoch": 13.095812633073102, + "grad_norm": 0.2461843192577362, + "learning_rate": 8.69097232079489e-05, + "loss": 0.027886903285980223, + "step": 92260 + }, + { + "epoch": 13.097232079488998, + "grad_norm": 0.05524897575378418, + "learning_rate": 8.6908303761533e-05, + "loss": 0.01066754013299942, + "step": 92270 + }, + { + "epoch": 13.098651525904897, + "grad_norm": 0.015682321041822433, + "learning_rate": 8.690688431511712e-05, + "loss": 0.011989720910787583, + "step": 92280 + }, + { + "epoch": 13.100070972320795, + "grad_norm": 0.3898911774158478, + "learning_rate": 8.690546486870121e-05, + "loss": 0.0231645867228508, + "step": 92290 + }, + { + "epoch": 13.101490418736693, + "grad_norm": 0.054293807595968246, + "learning_rate": 8.690404542228532e-05, + "loss": 0.0053061418235301975, + "step": 92300 + }, + { + "epoch": 13.10290986515259, + "grad_norm": 0.1619403213262558, + "learning_rate": 8.690262597586941e-05, + "loss": 0.07225641012191772, + "step": 92310 + }, + { + "epoch": 13.104329311568488, + "grad_norm": 
0.12324342131614685, + "learning_rate": 8.690120652945352e-05, + "loss": 0.031182992458343505, + "step": 92320 + }, + { + "epoch": 13.105748757984387, + "grad_norm": 0.034161925315856934, + "learning_rate": 8.689978708303762e-05, + "loss": 0.03507125377655029, + "step": 92330 + }, + { + "epoch": 13.107168204400283, + "grad_norm": 0.6346161365509033, + "learning_rate": 8.689836763662173e-05, + "loss": 0.02153913825750351, + "step": 92340 + }, + { + "epoch": 13.108587650816181, + "grad_norm": 0.7318549156188965, + "learning_rate": 8.689694819020583e-05, + "loss": 0.014379370212554931, + "step": 92350 + }, + { + "epoch": 13.11000709723208, + "grad_norm": 2.773247241973877, + "learning_rate": 8.689552874378992e-05, + "loss": 0.03619228601455689, + "step": 92360 + }, + { + "epoch": 13.111426543647978, + "grad_norm": 4.637772083282471, + "learning_rate": 8.689410929737403e-05, + "loss": 0.038220956921577454, + "step": 92370 + }, + { + "epoch": 13.112845990063875, + "grad_norm": 0.9285563826560974, + "learning_rate": 8.689268985095813e-05, + "loss": 0.05401658415794373, + "step": 92380 + }, + { + "epoch": 13.114265436479773, + "grad_norm": 0.4850822389125824, + "learning_rate": 8.689127040454224e-05, + "loss": 0.030674123764038087, + "step": 92390 + }, + { + "epoch": 13.115684882895671, + "grad_norm": 0.06712858378887177, + "learning_rate": 8.688985095812634e-05, + "loss": 0.048642593622207644, + "step": 92400 + }, + { + "epoch": 13.117104329311568, + "grad_norm": 5.168540000915527, + "learning_rate": 8.688843151171044e-05, + "loss": 0.021758055686950682, + "step": 92410 + }, + { + "epoch": 13.118523775727466, + "grad_norm": 0.37047693133354187, + "learning_rate": 8.688701206529453e-05, + "loss": 0.03085559904575348, + "step": 92420 + }, + { + "epoch": 13.119943222143364, + "grad_norm": 0.7145349383354187, + "learning_rate": 8.688559261887864e-05, + "loss": 0.011856220662593842, + "step": 92430 + }, + { + "epoch": 13.121362668559263, + "grad_norm": 0.41536930203437805, + 
"learning_rate": 8.688417317246274e-05, + "loss": 0.11770002841949463, + "step": 92440 + }, + { + "epoch": 13.12278211497516, + "grad_norm": 1.966375708580017, + "learning_rate": 8.688275372604685e-05, + "loss": 0.010399091243743896, + "step": 92450 + }, + { + "epoch": 13.124201561391057, + "grad_norm": 1.3598144054412842, + "learning_rate": 8.688133427963095e-05, + "loss": 0.033311480283737184, + "step": 92460 + }, + { + "epoch": 13.125621007806956, + "grad_norm": 16.98819923400879, + "learning_rate": 8.687991483321505e-05, + "loss": 0.07230284214019775, + "step": 92470 + }, + { + "epoch": 13.127040454222852, + "grad_norm": 0.3322114646434784, + "learning_rate": 8.687849538679916e-05, + "loss": 0.03124167025089264, + "step": 92480 + }, + { + "epoch": 13.12845990063875, + "grad_norm": 0.3290840983390808, + "learning_rate": 8.687707594038326e-05, + "loss": 0.037420186400413516, + "step": 92490 + }, + { + "epoch": 13.129879347054649, + "grad_norm": 11.666303634643555, + "learning_rate": 8.687565649396737e-05, + "loss": 0.034588441252708435, + "step": 92500 + }, + { + "epoch": 13.129879347054649, + "eval_accuracy": 0.9861384879506581, + "eval_loss": 0.047509148716926575, + "eval_runtime": 31.0917, + "eval_samples_per_second": 505.826, + "eval_steps_per_second": 15.824, + "step": 92500 + }, + { + "epoch": 13.131298793470547, + "grad_norm": 0.64284348487854, + "learning_rate": 8.687423704755145e-05, + "loss": 0.023534731566905977, + "step": 92510 + }, + { + "epoch": 13.132718239886444, + "grad_norm": 1.2833255529403687, + "learning_rate": 8.687281760113556e-05, + "loss": 0.030319365859031677, + "step": 92520 + }, + { + "epoch": 13.134137686302342, + "grad_norm": 1.0851508378982544, + "learning_rate": 8.687139815471966e-05, + "loss": 0.01361302137374878, + "step": 92530 + }, + { + "epoch": 13.13555713271824, + "grad_norm": 0.20210212469100952, + "learning_rate": 8.686997870830377e-05, + "loss": 0.012545563280582428, + "step": 92540 + }, + { + "epoch": 13.136976579134137, 
+ "grad_norm": 0.15086351335048676, + "learning_rate": 8.686855926188787e-05, + "loss": 0.022943997383117677, + "step": 92550 + }, + { + "epoch": 13.138396025550035, + "grad_norm": 8.776838302612305, + "learning_rate": 8.686713981547196e-05, + "loss": 0.05501660704612732, + "step": 92560 + }, + { + "epoch": 13.139815471965933, + "grad_norm": 0.151241272687912, + "learning_rate": 8.686572036905608e-05, + "loss": 0.0084109365940094, + "step": 92570 + }, + { + "epoch": 13.141234918381832, + "grad_norm": 1.0454845428466797, + "learning_rate": 8.686430092264017e-05, + "loss": 0.013821640610694885, + "step": 92580 + }, + { + "epoch": 13.142654364797728, + "grad_norm": 1.373452067375183, + "learning_rate": 8.686288147622428e-05, + "loss": 0.023174571990966796, + "step": 92590 + }, + { + "epoch": 13.144073811213627, + "grad_norm": 0.09910205751657486, + "learning_rate": 8.686146202980838e-05, + "loss": 0.01848770081996918, + "step": 92600 + }, + { + "epoch": 13.145493257629525, + "grad_norm": 5.731260299682617, + "learning_rate": 8.686004258339248e-05, + "loss": 0.01092785894870758, + "step": 92610 + }, + { + "epoch": 13.146912704045421, + "grad_norm": 5.1644415855407715, + "learning_rate": 8.685862313697658e-05, + "loss": 0.034691983461380006, + "step": 92620 + }, + { + "epoch": 13.14833215046132, + "grad_norm": 3.380138397216797, + "learning_rate": 8.685720369056069e-05, + "loss": 0.013407303392887116, + "step": 92630 + }, + { + "epoch": 13.149751596877218, + "grad_norm": 1.7176895141601562, + "learning_rate": 8.685578424414478e-05, + "loss": 0.020493271946907043, + "step": 92640 + }, + { + "epoch": 13.151171043293116, + "grad_norm": 1.9893896579742432, + "learning_rate": 8.68543647977289e-05, + "loss": 0.018180091679096223, + "step": 92650 + }, + { + "epoch": 13.152590489709013, + "grad_norm": 2.009446859359741, + "learning_rate": 8.685294535131299e-05, + "loss": 0.03541998267173767, + "step": 92660 + }, + { + "epoch": 13.154009936124911, + "grad_norm": 
5.200847625732422, + "learning_rate": 8.685152590489709e-05, + "loss": 0.014680585265159607, + "step": 92670 + }, + { + "epoch": 13.15542938254081, + "grad_norm": 6.761829853057861, + "learning_rate": 8.68501064584812e-05, + "loss": 0.012983223795890808, + "step": 92680 + }, + { + "epoch": 13.156848828956706, + "grad_norm": 0.011524581350386143, + "learning_rate": 8.68486870120653e-05, + "loss": 0.008690010756254196, + "step": 92690 + }, + { + "epoch": 13.158268275372604, + "grad_norm": 0.10512061417102814, + "learning_rate": 8.684726756564941e-05, + "loss": 0.011312702298164367, + "step": 92700 + }, + { + "epoch": 13.159687721788503, + "grad_norm": 4.294369220733643, + "learning_rate": 8.68458481192335e-05, + "loss": 0.028549957275390624, + "step": 92710 + }, + { + "epoch": 13.161107168204401, + "grad_norm": 8.672225952148438, + "learning_rate": 8.68444286728176e-05, + "loss": 0.05349311828613281, + "step": 92720 + }, + { + "epoch": 13.162526614620297, + "grad_norm": 3.49619197845459, + "learning_rate": 8.68430092264017e-05, + "loss": 0.043832501769065856, + "step": 92730 + }, + { + "epoch": 13.163946061036196, + "grad_norm": 3.8927085399627686, + "learning_rate": 8.684158977998581e-05, + "loss": 0.024391159415245056, + "step": 92740 + }, + { + "epoch": 13.165365507452094, + "grad_norm": 0.07379254698753357, + "learning_rate": 8.684017033356991e-05, + "loss": 0.010248324275016785, + "step": 92750 + }, + { + "epoch": 13.16678495386799, + "grad_norm": 2.031379461288452, + "learning_rate": 8.683875088715402e-05, + "loss": 0.0023462004959583283, + "step": 92760 + }, + { + "epoch": 13.168204400283889, + "grad_norm": 0.2536148428916931, + "learning_rate": 8.683733144073812e-05, + "loss": 0.02251149117946625, + "step": 92770 + }, + { + "epoch": 13.169623846699787, + "grad_norm": 1.5524146556854248, + "learning_rate": 8.683591199432222e-05, + "loss": 0.013939085602760314, + "step": 92780 + }, + { + "epoch": 13.171043293115686, + "grad_norm": 0.209752157330513, + 
"learning_rate": 8.683449254790633e-05, + "loss": 0.03190666139125824, + "step": 92790 + }, + { + "epoch": 13.172462739531582, + "grad_norm": 0.20207785069942474, + "learning_rate": 8.683307310149042e-05, + "loss": 0.024697883427143096, + "step": 92800 + }, + { + "epoch": 13.17388218594748, + "grad_norm": 0.018222937360405922, + "learning_rate": 8.683165365507453e-05, + "loss": 0.020527932047843932, + "step": 92810 + }, + { + "epoch": 13.175301632363379, + "grad_norm": 0.008755608461797237, + "learning_rate": 8.683023420865862e-05, + "loss": 0.0073832511901855465, + "step": 92820 + }, + { + "epoch": 13.176721078779275, + "grad_norm": 4.745551109313965, + "learning_rate": 8.682881476224273e-05, + "loss": 0.012114915996789932, + "step": 92830 + }, + { + "epoch": 13.178140525195174, + "grad_norm": 0.12335074692964554, + "learning_rate": 8.682739531582683e-05, + "loss": 0.003640429675579071, + "step": 92840 + }, + { + "epoch": 13.179559971611072, + "grad_norm": 0.1021898165345192, + "learning_rate": 8.682597586941094e-05, + "loss": 0.0027994271367788315, + "step": 92850 + }, + { + "epoch": 13.18097941802697, + "grad_norm": 13.2909574508667, + "learning_rate": 8.682455642299504e-05, + "loss": 0.02914237380027771, + "step": 92860 + }, + { + "epoch": 13.182398864442867, + "grad_norm": 0.0916062444448471, + "learning_rate": 8.682313697657913e-05, + "loss": 0.026697584986686708, + "step": 92870 + }, + { + "epoch": 13.183818310858765, + "grad_norm": 0.274517297744751, + "learning_rate": 8.682171753016324e-05, + "loss": 0.008112631738185883, + "step": 92880 + }, + { + "epoch": 13.185237757274663, + "grad_norm": 1.534758448600769, + "learning_rate": 8.682029808374734e-05, + "loss": 0.04348610639572144, + "step": 92890 + }, + { + "epoch": 13.18665720369056, + "grad_norm": 0.2583373785018921, + "learning_rate": 8.681887863733145e-05, + "loss": 0.006182187795639038, + "step": 92900 + }, + { + "epoch": 13.188076650106458, + "grad_norm": 0.037416551262140274, + "learning_rate": 
8.681745919091555e-05, + "loss": 0.04729160368442535, + "step": 92910 + }, + { + "epoch": 13.189496096522356, + "grad_norm": 0.8491297960281372, + "learning_rate": 8.681603974449965e-05, + "loss": 0.005653556436300278, + "step": 92920 + }, + { + "epoch": 13.190915542938255, + "grad_norm": 0.0102075831964612, + "learning_rate": 8.681462029808374e-05, + "loss": 0.020110359787940978, + "step": 92930 + }, + { + "epoch": 13.192334989354151, + "grad_norm": 0.16059798002243042, + "learning_rate": 8.681320085166785e-05, + "loss": 0.013830339908599854, + "step": 92940 + }, + { + "epoch": 13.19375443577005, + "grad_norm": 5.683797836303711, + "learning_rate": 8.681192334989354e-05, + "loss": 0.046293017268180844, + "step": 92950 + }, + { + "epoch": 13.195173882185948, + "grad_norm": 10.07907772064209, + "learning_rate": 8.681050390347765e-05, + "loss": 0.028960409760475158, + "step": 92960 + }, + { + "epoch": 13.196593328601844, + "grad_norm": 8.026473045349121, + "learning_rate": 8.680908445706175e-05, + "loss": 0.024904248118400574, + "step": 92970 + }, + { + "epoch": 13.198012775017743, + "grad_norm": 1.5866436958312988, + "learning_rate": 8.680766501064586e-05, + "loss": 0.0035437196493148804, + "step": 92980 + }, + { + "epoch": 13.199432221433641, + "grad_norm": 0.333400160074234, + "learning_rate": 8.680624556422996e-05, + "loss": 0.0219614177942276, + "step": 92990 + }, + { + "epoch": 13.20085166784954, + "grad_norm": 2.6004791259765625, + "learning_rate": 8.680482611781405e-05, + "loss": 0.020504592359066008, + "step": 93000 + }, + { + "epoch": 13.20085166784954, + "eval_accuracy": 0.9883639600686717, + "eval_loss": 0.04106009751558304, + "eval_runtime": 31.3606, + "eval_samples_per_second": 501.489, + "eval_steps_per_second": 15.688, + "step": 93000 + }, + { + "epoch": 13.202271114265436, + "grad_norm": 3.023986577987671, + "learning_rate": 8.680340667139815e-05, + "loss": 0.02961946725845337, + "step": 93010 + }, + { + "epoch": 13.203690560681334, + "grad_norm": 
8.1243314743042, + "learning_rate": 8.680198722498226e-05, + "loss": 0.08623704910278321, + "step": 93020 + }, + { + "epoch": 13.205110007097232, + "grad_norm": 10.625955581665039, + "learning_rate": 8.680056777856637e-05, + "loss": 0.03337146937847137, + "step": 93030 + }, + { + "epoch": 13.206529453513129, + "grad_norm": 0.01585538126528263, + "learning_rate": 8.679914833215047e-05, + "loss": 0.04821697473526001, + "step": 93040 + }, + { + "epoch": 13.207948899929027, + "grad_norm": 7.857775688171387, + "learning_rate": 8.679772888573457e-05, + "loss": 0.024783408641815184, + "step": 93050 + }, + { + "epoch": 13.209368346344926, + "grad_norm": 1.573050618171692, + "learning_rate": 8.679630943931867e-05, + "loss": 0.012848149240016937, + "step": 93060 + }, + { + "epoch": 13.210787792760824, + "grad_norm": 8.90145206451416, + "learning_rate": 8.679488999290278e-05, + "loss": 0.04379624426364899, + "step": 93070 + }, + { + "epoch": 13.21220723917672, + "grad_norm": 4.176666259765625, + "learning_rate": 8.679347054648687e-05, + "loss": 0.022595225274562834, + "step": 93080 + }, + { + "epoch": 13.213626685592619, + "grad_norm": 3.151440143585205, + "learning_rate": 8.679205110007098e-05, + "loss": 0.013112765550613404, + "step": 93090 + }, + { + "epoch": 13.215046132008517, + "grad_norm": 0.012762556783854961, + "learning_rate": 8.679063165365507e-05, + "loss": 0.010376498848199845, + "step": 93100 + }, + { + "epoch": 13.216465578424414, + "grad_norm": 0.3497304618358612, + "learning_rate": 8.678921220723918e-05, + "loss": 0.022116436064243315, + "step": 93110 + }, + { + "epoch": 13.217885024840312, + "grad_norm": 8.522994995117188, + "learning_rate": 8.678779276082329e-05, + "loss": 0.0177663192152977, + "step": 93120 + }, + { + "epoch": 13.21930447125621, + "grad_norm": 13.146580696105957, + "learning_rate": 8.678637331440739e-05, + "loss": 0.014370155334472657, + "step": 93130 + }, + { + "epoch": 13.220723917672109, + "grad_norm": 0.05478169769048691, + 
"learning_rate": 8.67849538679915e-05, + "loss": 0.015843385457992555, + "step": 93140 + }, + { + "epoch": 13.222143364088005, + "grad_norm": 0.5907186269760132, + "learning_rate": 8.678353442157558e-05, + "loss": 0.014893335103988648, + "step": 93150 + }, + { + "epoch": 13.223562810503903, + "grad_norm": 0.2738039195537567, + "learning_rate": 8.678211497515969e-05, + "loss": 0.01633225381374359, + "step": 93160 + }, + { + "epoch": 13.224982256919802, + "grad_norm": 0.016736086457967758, + "learning_rate": 8.678069552874379e-05, + "loss": 0.008309248834848404, + "step": 93170 + }, + { + "epoch": 13.2264017033357, + "grad_norm": 0.06897296756505966, + "learning_rate": 8.67792760823279e-05, + "loss": 0.028543704748153688, + "step": 93180 + }, + { + "epoch": 13.227821149751597, + "grad_norm": 5.2853851318359375, + "learning_rate": 8.6777856635912e-05, + "loss": 0.015082527697086335, + "step": 93190 + }, + { + "epoch": 13.229240596167495, + "grad_norm": 15.268448829650879, + "learning_rate": 8.67764371894961e-05, + "loss": 0.01713217794895172, + "step": 93200 + }, + { + "epoch": 13.230660042583393, + "grad_norm": 1.7831941843032837, + "learning_rate": 8.677501774308021e-05, + "loss": 0.02639639675617218, + "step": 93210 + }, + { + "epoch": 13.23207948899929, + "grad_norm": 0.28602883219718933, + "learning_rate": 8.67735982966643e-05, + "loss": 0.031729042530059814, + "step": 93220 + }, + { + "epoch": 13.233498935415188, + "grad_norm": 0.7817912101745605, + "learning_rate": 8.677217885024842e-05, + "loss": 0.019844482839107513, + "step": 93230 + }, + { + "epoch": 13.234918381831086, + "grad_norm": 0.062219344079494476, + "learning_rate": 8.677075940383251e-05, + "loss": 0.04821741878986359, + "step": 93240 + }, + { + "epoch": 13.236337828246985, + "grad_norm": 0.8284728527069092, + "learning_rate": 8.676933995741661e-05, + "loss": 0.030682769417762757, + "step": 93250 + }, + { + "epoch": 13.237757274662881, + "grad_norm": 0.8677076101303101, + "learning_rate": 
8.676792051100071e-05, + "loss": 0.007623846083879471, + "step": 93260 + }, + { + "epoch": 13.23917672107878, + "grad_norm": 0.1573665589094162, + "learning_rate": 8.676650106458482e-05, + "loss": 0.04373520612716675, + "step": 93270 + }, + { + "epoch": 13.240596167494678, + "grad_norm": 0.17571642994880676, + "learning_rate": 8.676508161816892e-05, + "loss": 0.04299502968788147, + "step": 93280 + }, + { + "epoch": 13.242015613910574, + "grad_norm": 2.4545087814331055, + "learning_rate": 8.676366217175303e-05, + "loss": 0.03129469752311707, + "step": 93290 + }, + { + "epoch": 13.243435060326473, + "grad_norm": 0.0385487824678421, + "learning_rate": 8.676238466997871e-05, + "loss": 0.08912227153778077, + "step": 93300 + }, + { + "epoch": 13.24485450674237, + "grad_norm": 0.9710257649421692, + "learning_rate": 8.676096522356282e-05, + "loss": 0.020866674184799195, + "step": 93310 + }, + { + "epoch": 13.24627395315827, + "grad_norm": 9.935172080993652, + "learning_rate": 8.675954577714692e-05, + "loss": 0.013743373751640319, + "step": 93320 + }, + { + "epoch": 13.247693399574166, + "grad_norm": 0.5209399461746216, + "learning_rate": 8.675812633073102e-05, + "loss": 0.014395973086357117, + "step": 93330 + }, + { + "epoch": 13.249112845990064, + "grad_norm": 0.09892766177654266, + "learning_rate": 8.675670688431511e-05, + "loss": 0.027313077449798585, + "step": 93340 + }, + { + "epoch": 13.250532292405962, + "grad_norm": 0.27284589409828186, + "learning_rate": 8.675528743789923e-05, + "loss": 0.030682840943336488, + "step": 93350 + }, + { + "epoch": 13.251951738821859, + "grad_norm": 1.8673031330108643, + "learning_rate": 8.675386799148332e-05, + "loss": 0.02939329147338867, + "step": 93360 + }, + { + "epoch": 13.253371185237757, + "grad_norm": 0.022475466132164, + "learning_rate": 8.675244854506743e-05, + "loss": 0.02068745642900467, + "step": 93370 + }, + { + "epoch": 13.254790631653655, + "grad_norm": 0.3351653516292572, + "learning_rate": 8.675102909865153e-05, + 
"loss": 0.04050408899784088, + "step": 93380 + }, + { + "epoch": 13.256210078069554, + "grad_norm": 1.726203203201294, + "learning_rate": 8.674960965223563e-05, + "loss": 0.029091688990592956, + "step": 93390 + }, + { + "epoch": 13.25762952448545, + "grad_norm": 4.862390518188477, + "learning_rate": 8.674819020581974e-05, + "loss": 0.00784970447421074, + "step": 93400 + }, + { + "epoch": 13.259048970901349, + "grad_norm": 2.797797203063965, + "learning_rate": 8.674677075940384e-05, + "loss": 0.025647896528244018, + "step": 93410 + }, + { + "epoch": 13.260468417317247, + "grad_norm": 0.0536530502140522, + "learning_rate": 8.674535131298795e-05, + "loss": 0.026149827241897582, + "step": 93420 + }, + { + "epoch": 13.261887863733143, + "grad_norm": 0.2407999038696289, + "learning_rate": 8.674393186657203e-05, + "loss": 0.029965820908546447, + "step": 93430 + }, + { + "epoch": 13.263307310149042, + "grad_norm": 7.153860569000244, + "learning_rate": 8.674251242015614e-05, + "loss": 0.04421953558921814, + "step": 93440 + }, + { + "epoch": 13.26472675656494, + "grad_norm": 0.01831003837287426, + "learning_rate": 8.674109297374024e-05, + "loss": 0.007175080478191376, + "step": 93450 + }, + { + "epoch": 13.266146202980838, + "grad_norm": 0.4497462809085846, + "learning_rate": 8.673967352732435e-05, + "loss": 0.03221384286880493, + "step": 93460 + }, + { + "epoch": 13.267565649396735, + "grad_norm": 0.14525070786476135, + "learning_rate": 8.673825408090845e-05, + "loss": 0.015599586069583893, + "step": 93470 + }, + { + "epoch": 13.268985095812633, + "grad_norm": 0.11716824769973755, + "learning_rate": 8.673683463449255e-05, + "loss": 0.02416829764842987, + "step": 93480 + }, + { + "epoch": 13.270404542228531, + "grad_norm": 0.3827669322490692, + "learning_rate": 8.673541518807666e-05, + "loss": 0.019712349772453307, + "step": 93490 + }, + { + "epoch": 13.271823988644428, + "grad_norm": 0.3044072091579437, + "learning_rate": 8.673399574166075e-05, + "loss": 
0.01277415156364441, + "step": 93500 + }, + { + "epoch": 13.271823988644428, + "eval_accuracy": 0.9863292427036306, + "eval_loss": 0.04419074207544327, + "eval_runtime": 31.0429, + "eval_samples_per_second": 506.621, + "eval_steps_per_second": 15.849, + "step": 93500 + }, + { + "epoch": 13.273243435060326, + "grad_norm": 0.38797178864479065, + "learning_rate": 8.673257629524486e-05, + "loss": 0.03536054491996765, + "step": 93510 + }, + { + "epoch": 13.274662881476225, + "grad_norm": 0.00916266068816185, + "learning_rate": 8.673115684882896e-05, + "loss": 0.005046528205275536, + "step": 93520 + }, + { + "epoch": 13.276082327892123, + "grad_norm": 2.2260570526123047, + "learning_rate": 8.672973740241306e-05, + "loss": 0.06993077993392945, + "step": 93530 + }, + { + "epoch": 13.27750177430802, + "grad_norm": 0.5128903388977051, + "learning_rate": 8.672831795599716e-05, + "loss": 0.004808105900883675, + "step": 93540 + }, + { + "epoch": 13.278921220723918, + "grad_norm": 0.10632356256246567, + "learning_rate": 8.672689850958127e-05, + "loss": 0.027431467175483705, + "step": 93550 + }, + { + "epoch": 13.280340667139816, + "grad_norm": 0.857125461101532, + "learning_rate": 8.672547906316537e-05, + "loss": 0.0861474871635437, + "step": 93560 + }, + { + "epoch": 13.281760113555713, + "grad_norm": 0.12263695150613785, + "learning_rate": 8.672405961674948e-05, + "loss": 0.030454087257385253, + "step": 93570 + }, + { + "epoch": 13.283179559971611, + "grad_norm": 0.5237917900085449, + "learning_rate": 8.672264017033357e-05, + "loss": 0.004111305624246597, + "step": 93580 + }, + { + "epoch": 13.28459900638751, + "grad_norm": 0.19974131882190704, + "learning_rate": 8.672122072391767e-05, + "loss": 0.023677754402160644, + "step": 93590 + }, + { + "epoch": 13.286018452803408, + "grad_norm": 0.04327812418341637, + "learning_rate": 8.671980127750178e-05, + "loss": 0.00693313330411911, + "step": 93600 + }, + { + "epoch": 13.287437899219304, + "grad_norm": 0.00680148508399725, + 
"learning_rate": 8.671838183108588e-05, + "loss": 0.017087599635124205, + "step": 93610 + }, + { + "epoch": 13.288857345635202, + "grad_norm": 0.0410841628909111, + "learning_rate": 8.671696238466999e-05, + "loss": 0.010396718978881836, + "step": 93620 + }, + { + "epoch": 13.2902767920511, + "grad_norm": 1.3464971780776978, + "learning_rate": 8.671554293825407e-05, + "loss": 0.03353613018989563, + "step": 93630 + }, + { + "epoch": 13.291696238466997, + "grad_norm": 4.748608112335205, + "learning_rate": 8.671412349183819e-05, + "loss": 0.01375894695520401, + "step": 93640 + }, + { + "epoch": 13.293115684882896, + "grad_norm": 0.13896571099758148, + "learning_rate": 8.671270404542228e-05, + "loss": 0.030356216430664062, + "step": 93650 + }, + { + "epoch": 13.294535131298794, + "grad_norm": 0.026233388110995293, + "learning_rate": 8.67112845990064e-05, + "loss": 0.01820582151412964, + "step": 93660 + }, + { + "epoch": 13.295954577714692, + "grad_norm": 0.09100806713104248, + "learning_rate": 8.670986515259049e-05, + "loss": 0.022753316164016723, + "step": 93670 + }, + { + "epoch": 13.297374024130589, + "grad_norm": 0.013288362883031368, + "learning_rate": 8.67084457061746e-05, + "loss": 0.032885891199111936, + "step": 93680 + }, + { + "epoch": 13.298793470546487, + "grad_norm": 5.086958408355713, + "learning_rate": 8.67070262597587e-05, + "loss": 0.05920916795730591, + "step": 93690 + }, + { + "epoch": 13.300212916962385, + "grad_norm": 7.441182613372803, + "learning_rate": 8.67056068133428e-05, + "loss": 0.0234588697552681, + "step": 93700 + }, + { + "epoch": 13.301632363378282, + "grad_norm": 1.1889616250991821, + "learning_rate": 8.670418736692691e-05, + "loss": 0.01023445576429367, + "step": 93710 + }, + { + "epoch": 13.30305180979418, + "grad_norm": 0.1379549503326416, + "learning_rate": 8.6702767920511e-05, + "loss": 0.02517501711845398, + "step": 93720 + }, + { + "epoch": 13.304471256210078, + "grad_norm": 8.683314323425293, + "learning_rate": 
8.670134847409512e-05, + "loss": 0.04826049506664276, + "step": 93730 + }, + { + "epoch": 13.305890702625977, + "grad_norm": 12.325695037841797, + "learning_rate": 8.66999290276792e-05, + "loss": 0.01828770339488983, + "step": 93740 + }, + { + "epoch": 13.307310149041873, + "grad_norm": 0.6374432444572449, + "learning_rate": 8.669850958126331e-05, + "loss": 0.012936475872993469, + "step": 93750 + }, + { + "epoch": 13.308729595457772, + "grad_norm": 3.119178295135498, + "learning_rate": 8.669709013484741e-05, + "loss": 0.02451300173997879, + "step": 93760 + }, + { + "epoch": 13.31014904187367, + "grad_norm": 14.731306076049805, + "learning_rate": 8.669567068843152e-05, + "loss": 0.06916946768760682, + "step": 93770 + }, + { + "epoch": 13.311568488289566, + "grad_norm": 1.009304165840149, + "learning_rate": 8.669425124201562e-05, + "loss": 0.028861042857170106, + "step": 93780 + }, + { + "epoch": 13.312987934705465, + "grad_norm": 0.009057086892426014, + "learning_rate": 8.669283179559971e-05, + "loss": 0.035729244351387024, + "step": 93790 + }, + { + "epoch": 13.314407381121363, + "grad_norm": 4.11885929107666, + "learning_rate": 8.669141234918382e-05, + "loss": 0.013351409137248993, + "step": 93800 + }, + { + "epoch": 13.315826827537261, + "grad_norm": 11.137663841247559, + "learning_rate": 8.668999290276792e-05, + "loss": 0.0382326602935791, + "step": 93810 + }, + { + "epoch": 13.317246273953158, + "grad_norm": 0.0717635303735733, + "learning_rate": 8.668857345635203e-05, + "loss": 0.022524161636829375, + "step": 93820 + }, + { + "epoch": 13.318665720369056, + "grad_norm": 4.912062168121338, + "learning_rate": 8.668715400993613e-05, + "loss": 0.05251979231834412, + "step": 93830 + }, + { + "epoch": 13.320085166784954, + "grad_norm": 0.7680724859237671, + "learning_rate": 8.668573456352023e-05, + "loss": 0.012150004506111145, + "step": 93840 + }, + { + "epoch": 13.321504613200851, + "grad_norm": 0.008376783691346645, + "learning_rate": 8.668431511710432e-05, + 
"loss": 0.036633032560348514, + "step": 93850 + }, + { + "epoch": 13.32292405961675, + "grad_norm": 6.526537895202637, + "learning_rate": 8.668289567068844e-05, + "loss": 0.0329626202583313, + "step": 93860 + }, + { + "epoch": 13.324343506032648, + "grad_norm": 0.5995200276374817, + "learning_rate": 8.668147622427253e-05, + "loss": 0.02415698319673538, + "step": 93870 + }, + { + "epoch": 13.325762952448546, + "grad_norm": 0.16087555885314941, + "learning_rate": 8.668005677785664e-05, + "loss": 0.027556967735290528, + "step": 93880 + }, + { + "epoch": 13.327182398864442, + "grad_norm": 0.6079142093658447, + "learning_rate": 8.667863733144074e-05, + "loss": 0.03587366044521332, + "step": 93890 + }, + { + "epoch": 13.32860184528034, + "grad_norm": 0.10928544402122498, + "learning_rate": 8.667721788502484e-05, + "loss": 0.049797934293746945, + "step": 93900 + }, + { + "epoch": 13.330021291696239, + "grad_norm": 0.17567208409309387, + "learning_rate": 8.667579843860895e-05, + "loss": 0.021293030679225923, + "step": 93910 + }, + { + "epoch": 13.331440738112136, + "grad_norm": 0.13670562207698822, + "learning_rate": 8.667437899219305e-05, + "loss": 0.008856196701526643, + "step": 93920 + }, + { + "epoch": 13.332860184528034, + "grad_norm": 0.24669358134269714, + "learning_rate": 8.667295954577716e-05, + "loss": 0.02744440734386444, + "step": 93930 + }, + { + "epoch": 13.334279630943932, + "grad_norm": 0.15545465052127838, + "learning_rate": 8.667154009936124e-05, + "loss": 0.007605926692485809, + "step": 93940 + }, + { + "epoch": 13.33569907735983, + "grad_norm": 0.403153657913208, + "learning_rate": 8.667012065294535e-05, + "loss": 0.017333367466926576, + "step": 93950 + }, + { + "epoch": 13.337118523775727, + "grad_norm": 0.053572166711091995, + "learning_rate": 8.666870120652945e-05, + "loss": 0.020798361301422118, + "step": 93960 + }, + { + "epoch": 13.338537970191625, + "grad_norm": 2.455256938934326, + "learning_rate": 8.666728176011356e-05, + "loss": 
0.029119691252708434, + "step": 93970 + }, + { + "epoch": 13.339957416607524, + "grad_norm": 11.778632164001465, + "learning_rate": 8.666586231369767e-05, + "loss": 0.07675871849060059, + "step": 93980 + }, + { + "epoch": 13.34137686302342, + "grad_norm": 0.9911078214645386, + "learning_rate": 8.666444286728176e-05, + "loss": 0.005806304514408112, + "step": 93990 + }, + { + "epoch": 13.342796309439318, + "grad_norm": 0.21370607614517212, + "learning_rate": 8.666302342086587e-05, + "loss": 0.017598675191402437, + "step": 94000 + }, + { + "epoch": 13.342796309439318, + "eval_accuracy": 0.9859477331976855, + "eval_loss": 0.05328426882624626, + "eval_runtime": 31.1265, + "eval_samples_per_second": 505.261, + "eval_steps_per_second": 15.806, + "step": 94000 + }, + { + "epoch": 13.344215755855217, + "grad_norm": 0.42926645278930664, + "learning_rate": 8.666160397444996e-05, + "loss": 0.01731862425804138, + "step": 94010 + }, + { + "epoch": 13.345635202271115, + "grad_norm": 0.45466873049736023, + "learning_rate": 8.666018452803408e-05, + "loss": 0.014458924531936646, + "step": 94020 + }, + { + "epoch": 13.347054648687012, + "grad_norm": 0.20981672406196594, + "learning_rate": 8.665876508161817e-05, + "loss": 0.02389024496078491, + "step": 94030 + }, + { + "epoch": 13.34847409510291, + "grad_norm": 0.041170816868543625, + "learning_rate": 8.665734563520228e-05, + "loss": 0.005668449029326439, + "step": 94040 + }, + { + "epoch": 13.349893541518808, + "grad_norm": 0.0988631471991539, + "learning_rate": 8.665592618878637e-05, + "loss": 0.02667955160140991, + "step": 94050 + }, + { + "epoch": 13.351312987934705, + "grad_norm": 11.238661766052246, + "learning_rate": 8.665450674237048e-05, + "loss": 0.03834929466247559, + "step": 94060 + }, + { + "epoch": 13.352732434350603, + "grad_norm": 0.005214661359786987, + "learning_rate": 8.665308729595459e-05, + "loss": 0.023474456369876863, + "step": 94070 + }, + { + "epoch": 13.354151880766501, + "grad_norm": 0.0774008110165596, + 
"learning_rate": 8.665166784953869e-05, + "loss": 0.05068536996841431, + "step": 94080 + }, + { + "epoch": 13.3555713271824, + "grad_norm": 0.35609862208366394, + "learning_rate": 8.66502484031228e-05, + "loss": 0.02888200879096985, + "step": 94090 + }, + { + "epoch": 13.356990773598296, + "grad_norm": 0.4439794719219208, + "learning_rate": 8.664882895670688e-05, + "loss": 0.023132362961769105, + "step": 94100 + }, + { + "epoch": 13.358410220014195, + "grad_norm": 3.6220321655273438, + "learning_rate": 8.664740951029099e-05, + "loss": 0.04306410849094391, + "step": 94110 + }, + { + "epoch": 13.359829666430093, + "grad_norm": 1.7350175380706787, + "learning_rate": 8.664599006387509e-05, + "loss": 0.03134117722511291, + "step": 94120 + }, + { + "epoch": 13.36124911284599, + "grad_norm": 20.512210845947266, + "learning_rate": 8.66445706174592e-05, + "loss": 0.027336391806602477, + "step": 94130 + }, + { + "epoch": 13.362668559261888, + "grad_norm": 0.024507010355591774, + "learning_rate": 8.66431511710433e-05, + "loss": 0.023677150905132293, + "step": 94140 + }, + { + "epoch": 13.364088005677786, + "grad_norm": 0.743320643901825, + "learning_rate": 8.66417317246274e-05, + "loss": 0.02231661379337311, + "step": 94150 + }, + { + "epoch": 13.365507452093684, + "grad_norm": 2.4547150135040283, + "learning_rate": 8.66403122782115e-05, + "loss": 0.016991636157035826, + "step": 94160 + }, + { + "epoch": 13.36692689850958, + "grad_norm": 2.07804274559021, + "learning_rate": 8.66388928317956e-05, + "loss": 0.01103825718164444, + "step": 94170 + }, + { + "epoch": 13.36834634492548, + "grad_norm": 0.6872746348381042, + "learning_rate": 8.663747338537971e-05, + "loss": 0.01877520829439163, + "step": 94180 + }, + { + "epoch": 13.369765791341377, + "grad_norm": 0.15541785955429077, + "learning_rate": 8.663605393896381e-05, + "loss": 0.006646855175495148, + "step": 94190 + }, + { + "epoch": 13.371185237757274, + "grad_norm": 0.5737395286560059, + "learning_rate": 
8.663463449254791e-05, + "loss": 0.02360581010580063, + "step": 94200 + }, + { + "epoch": 13.372604684173172, + "grad_norm": 1.936894416809082, + "learning_rate": 8.6633215046132e-05, + "loss": 0.09669648408889771, + "step": 94210 + }, + { + "epoch": 13.37402413058907, + "grad_norm": 0.7344012260437012, + "learning_rate": 8.663179559971612e-05, + "loss": 0.00841248333454132, + "step": 94220 + }, + { + "epoch": 13.375443577004969, + "grad_norm": 1.448033094406128, + "learning_rate": 8.663037615330021e-05, + "loss": 0.016919110715389252, + "step": 94230 + }, + { + "epoch": 13.376863023420865, + "grad_norm": 0.5463376641273499, + "learning_rate": 8.662895670688433e-05, + "loss": 0.00848272666335106, + "step": 94240 + }, + { + "epoch": 13.378282469836764, + "grad_norm": 1.1138267517089844, + "learning_rate": 8.662753726046842e-05, + "loss": 0.019846946001052856, + "step": 94250 + }, + { + "epoch": 13.379701916252662, + "grad_norm": 3.6157217025756836, + "learning_rate": 8.662611781405252e-05, + "loss": 0.04727371633052826, + "step": 94260 + }, + { + "epoch": 13.381121362668559, + "grad_norm": 0.05200067535042763, + "learning_rate": 8.662469836763663e-05, + "loss": 0.05219040513038635, + "step": 94270 + }, + { + "epoch": 13.382540809084457, + "grad_norm": 0.8348428010940552, + "learning_rate": 8.662327892122073e-05, + "loss": 0.005376967415213585, + "step": 94280 + }, + { + "epoch": 13.383960255500355, + "grad_norm": 8.173595428466797, + "learning_rate": 8.662185947480484e-05, + "loss": 0.07077938914299012, + "step": 94290 + }, + { + "epoch": 13.385379701916253, + "grad_norm": 6.709331512451172, + "learning_rate": 8.662044002838892e-05, + "loss": 0.05493432879447937, + "step": 94300 + }, + { + "epoch": 13.38679914833215, + "grad_norm": 1.8870387077331543, + "learning_rate": 8.661902058197303e-05, + "loss": 0.044583475589752196, + "step": 94310 + }, + { + "epoch": 13.388218594748048, + "grad_norm": 1.3507314920425415, + "learning_rate": 8.661760113555713e-05, + "loss": 
0.015715819597244263, + "step": 94320 + }, + { + "epoch": 13.389638041163947, + "grad_norm": 0.0798988863825798, + "learning_rate": 8.661618168914124e-05, + "loss": 0.01828896105289459, + "step": 94330 + }, + { + "epoch": 13.391057487579843, + "grad_norm": 4.10715913772583, + "learning_rate": 8.661476224272534e-05, + "loss": 0.017719167470932006, + "step": 94340 + }, + { + "epoch": 13.392476933995741, + "grad_norm": 0.8093958497047424, + "learning_rate": 8.661334279630944e-05, + "loss": 0.04272227585315704, + "step": 94350 + }, + { + "epoch": 13.39389638041164, + "grad_norm": 0.11310017108917236, + "learning_rate": 8.661192334989355e-05, + "loss": 0.016726674139499666, + "step": 94360 + }, + { + "epoch": 13.395315826827538, + "grad_norm": 0.01135955099016428, + "learning_rate": 8.661050390347765e-05, + "loss": 0.013713881373405457, + "step": 94370 + }, + { + "epoch": 13.396735273243435, + "grad_norm": 1.1644693613052368, + "learning_rate": 8.660908445706176e-05, + "loss": 0.004590839147567749, + "step": 94380 + }, + { + "epoch": 13.398154719659333, + "grad_norm": 13.32076358795166, + "learning_rate": 8.660766501064585e-05, + "loss": 0.030149951577186584, + "step": 94390 + }, + { + "epoch": 13.399574166075231, + "grad_norm": 1.9010841846466064, + "learning_rate": 8.660624556422997e-05, + "loss": 0.03028929829597473, + "step": 94400 + }, + { + "epoch": 13.400993612491128, + "grad_norm": 15.902448654174805, + "learning_rate": 8.660482611781405e-05, + "loss": 0.07759106159210205, + "step": 94410 + }, + { + "epoch": 13.402413058907026, + "grad_norm": 4.793128490447998, + "learning_rate": 8.660340667139816e-05, + "loss": 0.05131710171699524, + "step": 94420 + }, + { + "epoch": 13.403832505322924, + "grad_norm": 2.036078929901123, + "learning_rate": 8.660198722498226e-05, + "loss": 0.004698502644896507, + "step": 94430 + }, + { + "epoch": 13.405251951738823, + "grad_norm": 0.16924896836280823, + "learning_rate": 8.660056777856637e-05, + "loss": 0.005888786166906357, + 
"step": 94440 + }, + { + "epoch": 13.40667139815472, + "grad_norm": 0.2803502380847931, + "learning_rate": 8.659914833215047e-05, + "loss": 0.02985817790031433, + "step": 94450 + }, + { + "epoch": 13.408090844570618, + "grad_norm": 10.484100341796875, + "learning_rate": 8.659772888573456e-05, + "loss": 0.0780340552330017, + "step": 94460 + }, + { + "epoch": 13.409510290986516, + "grad_norm": 0.08539588004350662, + "learning_rate": 8.659630943931867e-05, + "loss": 0.012192347645759582, + "step": 94470 + }, + { + "epoch": 13.410929737402412, + "grad_norm": 0.2572302222251892, + "learning_rate": 8.659488999290277e-05, + "loss": 0.008727478981018066, + "step": 94480 + }, + { + "epoch": 13.41234918381831, + "grad_norm": 0.186640664935112, + "learning_rate": 8.659347054648688e-05, + "loss": 0.058405518531799316, + "step": 94490 + }, + { + "epoch": 13.413768630234209, + "grad_norm": 0.7377075552940369, + "learning_rate": 8.659205110007098e-05, + "loss": 0.04591574370861053, + "step": 94500 + }, + { + "epoch": 13.413768630234209, + "eval_accuracy": 0.97990716602022, + "eval_loss": 0.06794610619544983, + "eval_runtime": 31.1242, + "eval_samples_per_second": 505.298, + "eval_steps_per_second": 15.808, + "step": 94500 + }, + { + "epoch": 13.415188076650107, + "grad_norm": 0.030627934262156487, + "learning_rate": 8.659063165365508e-05, + "loss": 0.022979114949703217, + "step": 94510 + }, + { + "epoch": 13.416607523066004, + "grad_norm": 3.4133455753326416, + "learning_rate": 8.658921220723917e-05, + "loss": 0.023091521859169007, + "step": 94520 + }, + { + "epoch": 13.418026969481902, + "grad_norm": 0.8833838701248169, + "learning_rate": 8.658779276082329e-05, + "loss": 0.015490736067295074, + "step": 94530 + }, + { + "epoch": 13.4194464158978, + "grad_norm": 0.6688404679298401, + "learning_rate": 8.658637331440738e-05, + "loss": 0.013762807846069336, + "step": 94540 + }, + { + "epoch": 13.420865862313697, + "grad_norm": 1.5333127975463867, + "learning_rate": 
8.65849538679915e-05, + "loss": 0.016186425089836122, + "step": 94550 + }, + { + "epoch": 13.422285308729595, + "grad_norm": 0.24415715038776398, + "learning_rate": 8.658353442157559e-05, + "loss": 0.014823892712593078, + "step": 94560 + }, + { + "epoch": 13.423704755145494, + "grad_norm": 7.537097930908203, + "learning_rate": 8.658211497515969e-05, + "loss": 0.06450334787368775, + "step": 94570 + }, + { + "epoch": 13.425124201561392, + "grad_norm": 0.8393383622169495, + "learning_rate": 8.65806955287438e-05, + "loss": 0.03933723270893097, + "step": 94580 + }, + { + "epoch": 13.426543647977288, + "grad_norm": 5.340954303741455, + "learning_rate": 8.65792760823279e-05, + "loss": 0.02385113388299942, + "step": 94590 + }, + { + "epoch": 13.427963094393187, + "grad_norm": 2.525688648223877, + "learning_rate": 8.657785663591201e-05, + "loss": 0.07299022674560547, + "step": 94600 + }, + { + "epoch": 13.429382540809085, + "grad_norm": 0.3474077582359314, + "learning_rate": 8.657643718949609e-05, + "loss": 0.03391514718532562, + "step": 94610 + }, + { + "epoch": 13.430801987224982, + "grad_norm": 0.13079656660556793, + "learning_rate": 8.65750177430802e-05, + "loss": 0.0268646240234375, + "step": 94620 + }, + { + "epoch": 13.43222143364088, + "grad_norm": 0.34679096937179565, + "learning_rate": 8.65735982966643e-05, + "loss": 0.021389296650886534, + "step": 94630 + }, + { + "epoch": 13.433640880056778, + "grad_norm": 0.4974942207336426, + "learning_rate": 8.657217885024841e-05, + "loss": 0.003970606997609138, + "step": 94640 + }, + { + "epoch": 13.435060326472676, + "grad_norm": 4.566431045532227, + "learning_rate": 8.657075940383251e-05, + "loss": 0.009872384369373322, + "step": 94650 + }, + { + "epoch": 13.436479772888573, + "grad_norm": 0.22306831181049347, + "learning_rate": 8.65693399574166e-05, + "loss": 0.01926995664834976, + "step": 94660 + }, + { + "epoch": 13.437899219304471, + "grad_norm": 0.18319228291511536, + "learning_rate": 8.656792051100072e-05, + "loss": 
0.005128199979662895, + "step": 94670 + }, + { + "epoch": 13.43931866572037, + "grad_norm": 1.5891705751419067, + "learning_rate": 8.656650106458481e-05, + "loss": 0.03338150680065155, + "step": 94680 + }, + { + "epoch": 13.440738112136266, + "grad_norm": 0.11429005116224289, + "learning_rate": 8.656508161816892e-05, + "loss": 0.0355482429265976, + "step": 94690 + }, + { + "epoch": 13.442157558552164, + "grad_norm": 0.0894903615117073, + "learning_rate": 8.656366217175302e-05, + "loss": 0.007524222135543823, + "step": 94700 + }, + { + "epoch": 13.443577004968063, + "grad_norm": 0.026772212237119675, + "learning_rate": 8.656224272533713e-05, + "loss": 0.024022915959358217, + "step": 94710 + }, + { + "epoch": 13.444996451383961, + "grad_norm": 3.8690943717956543, + "learning_rate": 8.656082327892122e-05, + "loss": 0.02425812780857086, + "step": 94720 + }, + { + "epoch": 13.446415897799858, + "grad_norm": 3.525021553039551, + "learning_rate": 8.655940383250533e-05, + "loss": 0.0785856544971466, + "step": 94730 + }, + { + "epoch": 13.447835344215756, + "grad_norm": 0.6416861414909363, + "learning_rate": 8.655798438608942e-05, + "loss": 0.018032850325107576, + "step": 94740 + }, + { + "epoch": 13.449254790631654, + "grad_norm": 0.18148157000541687, + "learning_rate": 8.655656493967354e-05, + "loss": 0.016194966435432435, + "step": 94750 + }, + { + "epoch": 13.45067423704755, + "grad_norm": 6.061134338378906, + "learning_rate": 8.655514549325763e-05, + "loss": 0.06400618553161622, + "step": 94760 + }, + { + "epoch": 13.452093683463449, + "grad_norm": 2.7746856212615967, + "learning_rate": 8.655372604684173e-05, + "loss": 0.017115673422813414, + "step": 94770 + }, + { + "epoch": 13.453513129879347, + "grad_norm": 0.9594934582710266, + "learning_rate": 8.655230660042584e-05, + "loss": 0.03562757968902588, + "step": 94780 + }, + { + "epoch": 13.454932576295246, + "grad_norm": 13.996777534484863, + "learning_rate": 8.655088715400994e-05, + "loss": 0.02590930163860321, + 
"step": 94790 + }, + { + "epoch": 13.456352022711142, + "grad_norm": 0.16600783169269562, + "learning_rate": 8.654946770759405e-05, + "loss": 0.018148021399974824, + "step": 94800 + }, + { + "epoch": 13.45777146912704, + "grad_norm": 1.3046753406524658, + "learning_rate": 8.654804826117815e-05, + "loss": 0.003467951714992523, + "step": 94810 + }, + { + "epoch": 13.459190915542939, + "grad_norm": 0.07563643157482147, + "learning_rate": 8.654662881476224e-05, + "loss": 0.016790592670440675, + "step": 94820 + }, + { + "epoch": 13.460610361958835, + "grad_norm": 0.4039790630340576, + "learning_rate": 8.654520936834634e-05, + "loss": 0.024634437263011934, + "step": 94830 + }, + { + "epoch": 13.462029808374734, + "grad_norm": 0.04403165355324745, + "learning_rate": 8.654378992193045e-05, + "loss": 0.02208777964115143, + "step": 94840 + }, + { + "epoch": 13.463449254790632, + "grad_norm": 4.115147113800049, + "learning_rate": 8.654237047551455e-05, + "loss": 0.008800669014453888, + "step": 94850 + }, + { + "epoch": 13.46486870120653, + "grad_norm": 0.6078055500984192, + "learning_rate": 8.654095102909866e-05, + "loss": 0.04899186193943024, + "step": 94860 + }, + { + "epoch": 13.466288147622427, + "grad_norm": 3.0943551063537598, + "learning_rate": 8.653953158268276e-05, + "loss": 0.04084070026874542, + "step": 94870 + }, + { + "epoch": 13.467707594038325, + "grad_norm": 0.12317100912332535, + "learning_rate": 8.653811213626686e-05, + "loss": 0.0065141826868057254, + "step": 94880 + }, + { + "epoch": 13.469127040454223, + "grad_norm": 1.6963871717453003, + "learning_rate": 8.653669268985097e-05, + "loss": 0.021698565781116487, + "step": 94890 + }, + { + "epoch": 13.47054648687012, + "grad_norm": 8.212491035461426, + "learning_rate": 8.653527324343506e-05, + "loss": 0.07611912488937378, + "step": 94900 + }, + { + "epoch": 13.471965933286018, + "grad_norm": 0.40329378843307495, + "learning_rate": 8.653385379701918e-05, + "loss": 0.020160472393035887, + "step": 94910 + }, + { 
+ "epoch": 13.473385379701917, + "grad_norm": 0.9991295337677002, + "learning_rate": 8.653243435060326e-05, + "loss": 0.0287177711725235, + "step": 94920 + }, + { + "epoch": 13.474804826117815, + "grad_norm": 8.761367797851562, + "learning_rate": 8.653101490418737e-05, + "loss": 0.05888093113899231, + "step": 94930 + }, + { + "epoch": 13.476224272533711, + "grad_norm": 0.6481591463088989, + "learning_rate": 8.652959545777147e-05, + "loss": 0.019050560891628265, + "step": 94940 + }, + { + "epoch": 13.47764371894961, + "grad_norm": 0.06678199023008347, + "learning_rate": 8.652817601135558e-05, + "loss": 0.052511191368103026, + "step": 94950 + }, + { + "epoch": 13.479063165365508, + "grad_norm": 0.018576741218566895, + "learning_rate": 8.652675656493968e-05, + "loss": 0.037995684146881106, + "step": 94960 + }, + { + "epoch": 13.480482611781405, + "grad_norm": 0.029905835166573524, + "learning_rate": 8.652533711852377e-05, + "loss": 0.03668951392173767, + "step": 94970 + }, + { + "epoch": 13.481902058197303, + "grad_norm": 7.639389991760254, + "learning_rate": 8.652391767210788e-05, + "loss": 0.08303702473640442, + "step": 94980 + }, + { + "epoch": 13.483321504613201, + "grad_norm": 0.044263552874326706, + "learning_rate": 8.652249822569198e-05, + "loss": 0.04615318775177002, + "step": 94990 + }, + { + "epoch": 13.4847409510291, + "grad_norm": 6.359818458557129, + "learning_rate": 8.652107877927609e-05, + "loss": 0.01875213086605072, + "step": 95000 + }, + { + "epoch": 13.4847409510291, + "eval_accuracy": 0.9851211292681376, + "eval_loss": 0.052980098873376846, + "eval_runtime": 32.4787, + "eval_samples_per_second": 484.225, + "eval_steps_per_second": 15.148, + "step": 95000 + }, + { + "epoch": 13.486160397444996, + "grad_norm": 4.879207134246826, + "learning_rate": 8.651965933286019e-05, + "loss": 0.05217199921607971, + "step": 95010 + }, + { + "epoch": 13.487579843860894, + "grad_norm": 1.1770670413970947, + "learning_rate": 8.651823988644429e-05, + "loss": 
0.012905190885066985, + "step": 95020 + }, + { + "epoch": 13.488999290276793, + "grad_norm": 0.13578958809375763, + "learning_rate": 8.651682044002838e-05, + "loss": 0.024967202544212343, + "step": 95030 + }, + { + "epoch": 13.490418736692689, + "grad_norm": 0.02280462346971035, + "learning_rate": 8.65154009936125e-05, + "loss": 0.00866934135556221, + "step": 95040 + }, + { + "epoch": 13.491838183108587, + "grad_norm": 1.2240324020385742, + "learning_rate": 8.651398154719659e-05, + "loss": 0.014538370072841644, + "step": 95050 + }, + { + "epoch": 13.493257629524486, + "grad_norm": 1.3486486673355103, + "learning_rate": 8.65125621007807e-05, + "loss": 0.035960334539413455, + "step": 95060 + }, + { + "epoch": 13.494677075940384, + "grad_norm": 0.38771259784698486, + "learning_rate": 8.65111426543648e-05, + "loss": 0.020747166872024537, + "step": 95070 + }, + { + "epoch": 13.49609652235628, + "grad_norm": 0.48206791281700134, + "learning_rate": 8.65097232079489e-05, + "loss": 0.037840792536735536, + "step": 95080 + }, + { + "epoch": 13.497515968772179, + "grad_norm": 8.286971092224121, + "learning_rate": 8.650830376153301e-05, + "loss": 0.0744866132736206, + "step": 95090 + }, + { + "epoch": 13.498935415188077, + "grad_norm": 4.623902797698975, + "learning_rate": 8.65068843151171e-05, + "loss": 0.027271512150764465, + "step": 95100 + }, + { + "epoch": 13.500354861603974, + "grad_norm": 0.2418900579214096, + "learning_rate": 8.650546486870122e-05, + "loss": 0.007887350022792816, + "step": 95110 + }, + { + "epoch": 13.501774308019872, + "grad_norm": 0.0539235882461071, + "learning_rate": 8.650404542228531e-05, + "loss": 0.011406297981739043, + "step": 95120 + }, + { + "epoch": 13.50319375443577, + "grad_norm": 0.07959360629320145, + "learning_rate": 8.650262597586941e-05, + "loss": 0.007848554849624633, + "step": 95130 + }, + { + "epoch": 13.504613200851669, + "grad_norm": 0.01729063130915165, + "learning_rate": 8.650120652945351e-05, + "loss": 0.028363698720932008, + 
"step": 95140 + }, + { + "epoch": 13.506032647267565, + "grad_norm": 1.3998548984527588, + "learning_rate": 8.649978708303762e-05, + "loss": 0.008569182455539703, + "step": 95150 + }, + { + "epoch": 13.507452093683463, + "grad_norm": 0.3003092408180237, + "learning_rate": 8.649836763662172e-05, + "loss": 0.02904551327228546, + "step": 95160 + }, + { + "epoch": 13.508871540099362, + "grad_norm": 7.425189971923828, + "learning_rate": 8.649694819020583e-05, + "loss": 0.018622586131095888, + "step": 95170 + }, + { + "epoch": 13.510290986515258, + "grad_norm": 0.015005892142653465, + "learning_rate": 8.649552874378993e-05, + "loss": 0.006870198249816895, + "step": 95180 + }, + { + "epoch": 13.511710432931157, + "grad_norm": 0.02674536220729351, + "learning_rate": 8.649410929737402e-05, + "loss": 0.0023563139140605925, + "step": 95190 + }, + { + "epoch": 13.513129879347055, + "grad_norm": 0.13404254615306854, + "learning_rate": 8.649268985095813e-05, + "loss": 0.009416007995605468, + "step": 95200 + }, + { + "epoch": 13.514549325762953, + "grad_norm": 0.2148085981607437, + "learning_rate": 8.649127040454223e-05, + "loss": 0.019373995065689088, + "step": 95210 + }, + { + "epoch": 13.51596877217885, + "grad_norm": 0.11902511119842529, + "learning_rate": 8.648985095812634e-05, + "loss": 0.02425927072763443, + "step": 95220 + }, + { + "epoch": 13.517388218594748, + "grad_norm": 0.9130202531814575, + "learning_rate": 8.648843151171043e-05, + "loss": 0.0049303267151117325, + "step": 95230 + }, + { + "epoch": 13.518807665010646, + "grad_norm": 0.2348577082157135, + "learning_rate": 8.648701206529454e-05, + "loss": 0.03883711695671081, + "step": 95240 + }, + { + "epoch": 13.520227111426543, + "grad_norm": 0.02371363900601864, + "learning_rate": 8.648559261887863e-05, + "loss": 0.049692931771278384, + "step": 95250 + }, + { + "epoch": 13.521646557842441, + "grad_norm": 1.7642638683319092, + "learning_rate": 8.648417317246275e-05, + "loss": 0.026677578687667847, + "step": 95260 + 
}, + { + "epoch": 13.52306600425834, + "grad_norm": 0.6842426657676697, + "learning_rate": 8.648275372604686e-05, + "loss": 0.0024492625147104264, + "step": 95270 + }, + { + "epoch": 13.524485450674238, + "grad_norm": 1.9417275190353394, + "learning_rate": 8.648133427963094e-05, + "loss": 0.005500277504324913, + "step": 95280 + }, + { + "epoch": 13.525904897090134, + "grad_norm": 0.02574915811419487, + "learning_rate": 8.647991483321505e-05, + "loss": 0.006903509795665741, + "step": 95290 + }, + { + "epoch": 13.527324343506033, + "grad_norm": 0.14615212380886078, + "learning_rate": 8.647849538679915e-05, + "loss": 0.014113113284111023, + "step": 95300 + }, + { + "epoch": 13.528743789921931, + "grad_norm": 0.8143966197967529, + "learning_rate": 8.647707594038326e-05, + "loss": 0.020416779816150664, + "step": 95310 + }, + { + "epoch": 13.530163236337827, + "grad_norm": 0.25222253799438477, + "learning_rate": 8.647565649396736e-05, + "loss": 0.08081262707710266, + "step": 95320 + }, + { + "epoch": 13.531582682753726, + "grad_norm": 2.65850830078125, + "learning_rate": 8.647423704755145e-05, + "loss": 0.009225034713745117, + "step": 95330 + }, + { + "epoch": 13.533002129169624, + "grad_norm": 0.4264056384563446, + "learning_rate": 8.647281760113555e-05, + "loss": 0.018319667875766756, + "step": 95340 + }, + { + "epoch": 13.534421575585522, + "grad_norm": 0.144210085272789, + "learning_rate": 8.647139815471966e-05, + "loss": 0.06945294141769409, + "step": 95350 + }, + { + "epoch": 13.535841022001419, + "grad_norm": 0.4107845723628998, + "learning_rate": 8.647012065294535e-05, + "loss": 0.038562634587287904, + "step": 95360 + }, + { + "epoch": 13.537260468417317, + "grad_norm": 6.5514912605285645, + "learning_rate": 8.646870120652946e-05, + "loss": 0.01930883377790451, + "step": 95370 + }, + { + "epoch": 13.538679914833216, + "grad_norm": 0.014782809652388096, + "learning_rate": 8.646728176011356e-05, + "loss": 0.06311725378036499, + "step": 95380 + }, + { + "epoch": 
13.540099361249112, + "grad_norm": 0.554907500743866, + "learning_rate": 8.646586231369767e-05, + "loss": 0.04180940389633179, + "step": 95390 + }, + { + "epoch": 13.54151880766501, + "grad_norm": 1.3440015316009521, + "learning_rate": 8.646444286728176e-05, + "loss": 0.04394156336784363, + "step": 95400 + }, + { + "epoch": 13.542938254080909, + "grad_norm": 10.791940689086914, + "learning_rate": 8.646302342086586e-05, + "loss": 0.05378941297531128, + "step": 95410 + }, + { + "epoch": 13.544357700496807, + "grad_norm": 0.39911729097366333, + "learning_rate": 8.646160397444997e-05, + "loss": 0.0075015932321548465, + "step": 95420 + }, + { + "epoch": 13.545777146912704, + "grad_norm": 0.045785821974277496, + "learning_rate": 8.646018452803407e-05, + "loss": 0.029005283117294313, + "step": 95430 + }, + { + "epoch": 13.547196593328602, + "grad_norm": 0.010025689378380775, + "learning_rate": 8.645876508161818e-05, + "loss": 0.026248654723167418, + "step": 95440 + }, + { + "epoch": 13.5486160397445, + "grad_norm": 2.720860004425049, + "learning_rate": 8.645734563520228e-05, + "loss": 0.004437939077615738, + "step": 95450 + }, + { + "epoch": 13.550035486160397, + "grad_norm": 4.066869735717773, + "learning_rate": 8.645592618878638e-05, + "loss": 0.01580238789319992, + "step": 95460 + }, + { + "epoch": 13.551454932576295, + "grad_norm": 0.23954829573631287, + "learning_rate": 8.645450674237047e-05, + "loss": 0.011441273987293244, + "step": 95470 + }, + { + "epoch": 13.552874378992193, + "grad_norm": 12.648468017578125, + "learning_rate": 8.645308729595458e-05, + "loss": 0.024931295216083525, + "step": 95480 + }, + { + "epoch": 13.554293825408092, + "grad_norm": 0.7105810046195984, + "learning_rate": 8.645166784953868e-05, + "loss": 0.029153740406036376, + "step": 95490 + }, + { + "epoch": 13.555713271823988, + "grad_norm": 0.045782580971717834, + "learning_rate": 8.645024840312279e-05, + "loss": 0.01563691794872284, + "step": 95500 + }, + { + "epoch": 13.555713271823988, + 
"eval_accuracy": 0.9883003751510142, + "eval_loss": 0.04102558270096779, + "eval_runtime": 31.3169, + "eval_samples_per_second": 502.189, + "eval_steps_per_second": 15.71, + "step": 95500 + }, + { + "epoch": 13.557132718239886, + "grad_norm": 1.4033604860305786, + "learning_rate": 8.644882895670689e-05, + "loss": 0.03285573124885559, + "step": 95510 + }, + { + "epoch": 13.558552164655785, + "grad_norm": 1.4092446565628052, + "learning_rate": 8.644740951029099e-05, + "loss": 0.005036625638604164, + "step": 95520 + }, + { + "epoch": 13.559971611071681, + "grad_norm": 0.03376542776823044, + "learning_rate": 8.64459900638751e-05, + "loss": 0.021902407705783843, + "step": 95530 + }, + { + "epoch": 13.56139105748758, + "grad_norm": 0.022708173841238022, + "learning_rate": 8.64445706174592e-05, + "loss": 0.022810864448547363, + "step": 95540 + }, + { + "epoch": 13.562810503903478, + "grad_norm": 1.8710218667984009, + "learning_rate": 8.64431511710433e-05, + "loss": 0.029252752661705017, + "step": 95550 + }, + { + "epoch": 13.564229950319376, + "grad_norm": 0.5325424075126648, + "learning_rate": 8.644173172462739e-05, + "loss": 0.04581056237220764, + "step": 95560 + }, + { + "epoch": 13.565649396735273, + "grad_norm": 12.632488250732422, + "learning_rate": 8.64403122782115e-05, + "loss": 0.04354757368564606, + "step": 95570 + }, + { + "epoch": 13.567068843151171, + "grad_norm": 1.2110655307769775, + "learning_rate": 8.64388928317956e-05, + "loss": 0.021378204226493835, + "step": 95580 + }, + { + "epoch": 13.56848828956707, + "grad_norm": 0.4826580882072449, + "learning_rate": 8.643747338537971e-05, + "loss": 0.019974629580974578, + "step": 95590 + }, + { + "epoch": 13.569907735982966, + "grad_norm": 0.2284783273935318, + "learning_rate": 8.643605393896381e-05, + "loss": 0.04820126593112946, + "step": 95600 + }, + { + "epoch": 13.571327182398864, + "grad_norm": 1.714429497718811, + "learning_rate": 8.64346344925479e-05, + "loss": 0.0038776598870754243, + "step": 95610 + }, 
+ { + "epoch": 13.572746628814762, + "grad_norm": 5.034719944000244, + "learning_rate": 8.643321504613202e-05, + "loss": 0.03729407787322998, + "step": 95620 + }, + { + "epoch": 13.57416607523066, + "grad_norm": 0.28962454199790955, + "learning_rate": 8.643179559971611e-05, + "loss": 0.01688399910926819, + "step": 95630 + }, + { + "epoch": 13.575585521646557, + "grad_norm": 0.36011746525764465, + "learning_rate": 8.643037615330022e-05, + "loss": 0.038062408566474915, + "step": 95640 + }, + { + "epoch": 13.577004968062456, + "grad_norm": 1.5610812902450562, + "learning_rate": 8.642895670688432e-05, + "loss": 0.06204630136489868, + "step": 95650 + }, + { + "epoch": 13.578424414478354, + "grad_norm": 0.0474555566906929, + "learning_rate": 8.642753726046842e-05, + "loss": 0.002244003117084503, + "step": 95660 + }, + { + "epoch": 13.57984386089425, + "grad_norm": 0.05775681510567665, + "learning_rate": 8.642611781405252e-05, + "loss": 0.023053470253944396, + "step": 95670 + }, + { + "epoch": 13.581263307310149, + "grad_norm": 0.07157592475414276, + "learning_rate": 8.642469836763663e-05, + "loss": 0.02459646463394165, + "step": 95680 + }, + { + "epoch": 13.582682753726047, + "grad_norm": 0.05639144778251648, + "learning_rate": 8.642327892122072e-05, + "loss": 0.009374721348285675, + "step": 95690 + }, + { + "epoch": 13.584102200141945, + "grad_norm": 0.32128065824508667, + "learning_rate": 8.642185947480483e-05, + "loss": 0.010814450681209564, + "step": 95700 + }, + { + "epoch": 13.585521646557842, + "grad_norm": 2.1439898014068604, + "learning_rate": 8.642044002838893e-05, + "loss": 0.03374948799610138, + "step": 95710 + }, + { + "epoch": 13.58694109297374, + "grad_norm": 0.643042266368866, + "learning_rate": 8.641902058197303e-05, + "loss": 0.017416924238204956, + "step": 95720 + }, + { + "epoch": 13.588360539389639, + "grad_norm": 0.3461391031742096, + "learning_rate": 8.641760113555714e-05, + "loss": 0.03184410333633423, + "step": 95730 + }, + { + "epoch": 
13.589779985805535, + "grad_norm": 0.12013855576515198, + "learning_rate": 8.641618168914124e-05, + "loss": 0.029846155643463136, + "step": 95740 + }, + { + "epoch": 13.591199432221433, + "grad_norm": 2.5400469303131104, + "learning_rate": 8.641476224272535e-05, + "loss": 0.07584856748580933, + "step": 95750 + }, + { + "epoch": 13.592618878637332, + "grad_norm": 2.2745046615600586, + "learning_rate": 8.641334279630945e-05, + "loss": 0.014420546591281891, + "step": 95760 + }, + { + "epoch": 13.59403832505323, + "grad_norm": 0.3573761284351349, + "learning_rate": 8.641192334989354e-05, + "loss": 0.04467960298061371, + "step": 95770 + }, + { + "epoch": 13.595457771469126, + "grad_norm": 2.67861270904541, + "learning_rate": 8.641050390347764e-05, + "loss": 0.018556292355060577, + "step": 95780 + }, + { + "epoch": 13.596877217885025, + "grad_norm": 0.11460601538419724, + "learning_rate": 8.640908445706175e-05, + "loss": 0.01947108209133148, + "step": 95790 + }, + { + "epoch": 13.598296664300923, + "grad_norm": 2.6254916191101074, + "learning_rate": 8.640766501064585e-05, + "loss": 0.09533782601356507, + "step": 95800 + }, + { + "epoch": 13.59971611071682, + "grad_norm": 0.07714466750621796, + "learning_rate": 8.640624556422996e-05, + "loss": 0.011156822741031646, + "step": 95810 + }, + { + "epoch": 13.601135557132718, + "grad_norm": 0.8347751498222351, + "learning_rate": 8.640482611781406e-05, + "loss": 0.01339704990386963, + "step": 95820 + }, + { + "epoch": 13.602555003548616, + "grad_norm": 0.10340499132871628, + "learning_rate": 8.640340667139815e-05, + "loss": 0.03355931043624878, + "step": 95830 + }, + { + "epoch": 13.603974449964515, + "grad_norm": 1.814827561378479, + "learning_rate": 8.640198722498227e-05, + "loss": 0.018925678730010987, + "step": 95840 + }, + { + "epoch": 13.605393896380411, + "grad_norm": 0.0566093884408474, + "learning_rate": 8.640056777856636e-05, + "loss": 0.023309621214866637, + "step": 95850 + }, + { + "epoch": 13.60681334279631, + 
"grad_norm": 10.556805610656738, + "learning_rate": 8.639914833215047e-05, + "loss": 0.03882618546485901, + "step": 95860 + }, + { + "epoch": 13.608232789212208, + "grad_norm": 0.1373138278722763, + "learning_rate": 8.639772888573456e-05, + "loss": 0.030927222967147828, + "step": 95870 + }, + { + "epoch": 13.609652235628104, + "grad_norm": 0.23542313277721405, + "learning_rate": 8.639630943931867e-05, + "loss": 0.008216166496276855, + "step": 95880 + }, + { + "epoch": 13.611071682044003, + "grad_norm": 0.5358021855354309, + "learning_rate": 8.639488999290277e-05, + "loss": 0.026577457785606384, + "step": 95890 + }, + { + "epoch": 13.6124911284599, + "grad_norm": 0.3513150215148926, + "learning_rate": 8.639347054648688e-05, + "loss": 0.03022733926773071, + "step": 95900 + }, + { + "epoch": 13.6139105748758, + "grad_norm": 3.255770683288574, + "learning_rate": 8.639205110007097e-05, + "loss": 0.04010620713233948, + "step": 95910 + }, + { + "epoch": 13.615330021291696, + "grad_norm": 0.07835977524518967, + "learning_rate": 8.639063165365507e-05, + "loss": 0.0524819552898407, + "step": 95920 + }, + { + "epoch": 13.616749467707594, + "grad_norm": 7.067140579223633, + "learning_rate": 8.638921220723918e-05, + "loss": 0.007290441542863846, + "step": 95930 + }, + { + "epoch": 13.618168914123492, + "grad_norm": 0.5542891621589661, + "learning_rate": 8.638779276082328e-05, + "loss": 0.034179630875587466, + "step": 95940 + }, + { + "epoch": 13.619588360539389, + "grad_norm": 0.25055670738220215, + "learning_rate": 8.638637331440739e-05, + "loss": 0.016138841211795808, + "step": 95950 + }, + { + "epoch": 13.621007806955287, + "grad_norm": 1.4587302207946777, + "learning_rate": 8.638495386799149e-05, + "loss": 0.06437674760818482, + "step": 95960 + }, + { + "epoch": 13.622427253371185, + "grad_norm": 0.1022627130150795, + "learning_rate": 8.638353442157559e-05, + "loss": 0.019828467071056365, + "step": 95970 + }, + { + "epoch": 13.623846699787084, + "grad_norm": 
4.383076190948486, + "learning_rate": 8.638211497515968e-05, + "loss": 0.045711484551429746, + "step": 95980 + }, + { + "epoch": 13.62526614620298, + "grad_norm": 0.9741289019584656, + "learning_rate": 8.63806955287438e-05, + "loss": 0.044488522410392764, + "step": 95990 + }, + { + "epoch": 13.626685592618879, + "grad_norm": 0.208095520734787, + "learning_rate": 8.637927608232789e-05, + "loss": 0.04469639658927917, + "step": 96000 + }, + { + "epoch": 13.626685592618879, + "eval_accuracy": 0.9734215044191518, + "eval_loss": 0.10497359931468964, + "eval_runtime": 30.8281, + "eval_samples_per_second": 510.151, + "eval_steps_per_second": 15.959, + "step": 96000 + }, + { + "epoch": 13.628105039034777, + "grad_norm": 6.797325611114502, + "learning_rate": 8.6377856635912e-05, + "loss": 0.036408495903015134, + "step": 96010 + }, + { + "epoch": 13.629524485450673, + "grad_norm": 1.130789875984192, + "learning_rate": 8.63764371894961e-05, + "loss": 0.017700037360191344, + "step": 96020 + }, + { + "epoch": 13.630943931866572, + "grad_norm": 3.6554105281829834, + "learning_rate": 8.63750177430802e-05, + "loss": 0.038382101058959964, + "step": 96030 + }, + { + "epoch": 13.63236337828247, + "grad_norm": 3.8934245109558105, + "learning_rate": 8.637359829666431e-05, + "loss": 0.07922664880752564, + "step": 96040 + }, + { + "epoch": 13.633782824698368, + "grad_norm": 0.1729537695646286, + "learning_rate": 8.63721788502484e-05, + "loss": 0.03476465344429016, + "step": 96050 + }, + { + "epoch": 13.635202271114265, + "grad_norm": 8.660470008850098, + "learning_rate": 8.637075940383252e-05, + "loss": 0.04056967496871948, + "step": 96060 + }, + { + "epoch": 13.636621717530163, + "grad_norm": 1.6571571826934814, + "learning_rate": 8.63693399574166e-05, + "loss": 0.01973864436149597, + "step": 96070 + }, + { + "epoch": 13.638041163946061, + "grad_norm": 0.2835308015346527, + "learning_rate": 8.636792051100071e-05, + "loss": 0.015158508718013764, + "step": 96080 + }, + { + "epoch": 
13.639460610361958, + "grad_norm": 0.2651826739311218, + "learning_rate": 8.636650106458481e-05, + "loss": 0.006721274554729461, + "step": 96090 + }, + { + "epoch": 13.640880056777856, + "grad_norm": 0.014933490194380283, + "learning_rate": 8.636508161816892e-05, + "loss": 0.027929207682609557, + "step": 96100 + }, + { + "epoch": 13.642299503193755, + "grad_norm": 0.16669167578220367, + "learning_rate": 8.636366217175302e-05, + "loss": 0.016313910484313965, + "step": 96110 + }, + { + "epoch": 13.643718949609653, + "grad_norm": 2.4867875576019287, + "learning_rate": 8.636224272533713e-05, + "loss": 0.02824159860610962, + "step": 96120 + }, + { + "epoch": 13.64513839602555, + "grad_norm": 0.00703821936622262, + "learning_rate": 8.636082327892123e-05, + "loss": 0.01784539073705673, + "step": 96130 + }, + { + "epoch": 13.646557842441448, + "grad_norm": 11.87582015991211, + "learning_rate": 8.635940383250532e-05, + "loss": 0.02379693239927292, + "step": 96140 + }, + { + "epoch": 13.647977288857346, + "grad_norm": 0.019314516335725784, + "learning_rate": 8.635798438608943e-05, + "loss": 0.012368345260620117, + "step": 96150 + }, + { + "epoch": 13.649396735273243, + "grad_norm": 5.645758152008057, + "learning_rate": 8.635656493967353e-05, + "loss": 0.03507188558578491, + "step": 96160 + }, + { + "epoch": 13.650816181689141, + "grad_norm": 5.865706443786621, + "learning_rate": 8.635514549325764e-05, + "loss": 0.057054382562637326, + "step": 96170 + }, + { + "epoch": 13.65223562810504, + "grad_norm": 0.0784079059958458, + "learning_rate": 8.635372604684173e-05, + "loss": 0.006159195676445961, + "step": 96180 + }, + { + "epoch": 13.653655074520938, + "grad_norm": 0.8433167338371277, + "learning_rate": 8.635230660042584e-05, + "loss": 0.011216971278190612, + "step": 96190 + }, + { + "epoch": 13.655074520936834, + "grad_norm": 0.18653298914432526, + "learning_rate": 8.635088715400993e-05, + "loss": 0.03910190463066101, + "step": 96200 + }, + { + "epoch": 13.656493967352732, + 
"grad_norm": 0.7993428111076355, + "learning_rate": 8.634946770759404e-05, + "loss": 0.004709036275744438, + "step": 96210 + }, + { + "epoch": 13.65791341376863, + "grad_norm": 0.08979519456624985, + "learning_rate": 8.634804826117816e-05, + "loss": 0.028900161385536194, + "step": 96220 + }, + { + "epoch": 13.659332860184527, + "grad_norm": 0.010041122324764729, + "learning_rate": 8.634662881476224e-05, + "loss": 0.02693893015384674, + "step": 96230 + }, + { + "epoch": 13.660752306600425, + "grad_norm": 0.7909981608390808, + "learning_rate": 8.634520936834635e-05, + "loss": 0.008231808245182038, + "step": 96240 + }, + { + "epoch": 13.662171753016324, + "grad_norm": 8.569683074951172, + "learning_rate": 8.634378992193045e-05, + "loss": 0.027000784873962402, + "step": 96250 + }, + { + "epoch": 13.663591199432222, + "grad_norm": 1.8035515546798706, + "learning_rate": 8.634237047551456e-05, + "loss": 0.013083310425281524, + "step": 96260 + }, + { + "epoch": 13.665010645848119, + "grad_norm": 6.626659393310547, + "learning_rate": 8.634095102909866e-05, + "loss": 0.02948010265827179, + "step": 96270 + }, + { + "epoch": 13.666430092264017, + "grad_norm": 7.713639259338379, + "learning_rate": 8.633953158268275e-05, + "loss": 0.018895019590854645, + "step": 96280 + }, + { + "epoch": 13.667849538679915, + "grad_norm": 0.03560718894004822, + "learning_rate": 8.633811213626685e-05, + "loss": 0.02730792760848999, + "step": 96290 + }, + { + "epoch": 13.669268985095812, + "grad_norm": 0.49820488691329956, + "learning_rate": 8.633669268985096e-05, + "loss": 0.04874304831027985, + "step": 96300 + }, + { + "epoch": 13.67068843151171, + "grad_norm": 0.038449477404356, + "learning_rate": 8.633527324343507e-05, + "loss": 0.050292950868606565, + "step": 96310 + }, + { + "epoch": 13.672107877927608, + "grad_norm": 10.892190933227539, + "learning_rate": 8.633385379701917e-05, + "loss": 0.05344282388687134, + "step": 96320 + }, + { + "epoch": 13.673527324343507, + "grad_norm": 
0.24301016330718994, + "learning_rate": 8.633243435060327e-05, + "loss": 0.024064372479915618, + "step": 96330 + }, + { + "epoch": 13.674946770759403, + "grad_norm": 0.02121180109679699, + "learning_rate": 8.633101490418737e-05, + "loss": 0.01656196266412735, + "step": 96340 + }, + { + "epoch": 13.676366217175302, + "grad_norm": 0.4311664402484894, + "learning_rate": 8.632959545777148e-05, + "loss": 0.01601491868495941, + "step": 96350 + }, + { + "epoch": 13.6777856635912, + "grad_norm": 0.7602726221084595, + "learning_rate": 8.632817601135557e-05, + "loss": 0.01864083707332611, + "step": 96360 + }, + { + "epoch": 13.679205110007096, + "grad_norm": 11.445473670959473, + "learning_rate": 8.632675656493968e-05, + "loss": 0.043167969584465025, + "step": 96370 + }, + { + "epoch": 13.680624556422995, + "grad_norm": 0.05061354488134384, + "learning_rate": 8.632533711852377e-05, + "loss": 0.012322446703910828, + "step": 96380 + }, + { + "epoch": 13.682044002838893, + "grad_norm": 0.009072613902390003, + "learning_rate": 8.632391767210788e-05, + "loss": 0.02431444972753525, + "step": 96390 + }, + { + "epoch": 13.683463449254791, + "grad_norm": 0.07909449934959412, + "learning_rate": 8.632249822569199e-05, + "loss": 0.01688341349363327, + "step": 96400 + }, + { + "epoch": 13.684882895670688, + "grad_norm": 9.46364974975586, + "learning_rate": 8.632107877927609e-05, + "loss": 0.028126460313796998, + "step": 96410 + }, + { + "epoch": 13.686302342086586, + "grad_norm": 7.244299411773682, + "learning_rate": 8.63196593328602e-05, + "loss": 0.03135330080986023, + "step": 96420 + }, + { + "epoch": 13.687721788502484, + "grad_norm": 7.555941104888916, + "learning_rate": 8.631823988644428e-05, + "loss": 0.040532243251800534, + "step": 96430 + }, + { + "epoch": 13.689141234918381, + "grad_norm": 1.6134663820266724, + "learning_rate": 8.631682044002839e-05, + "loss": 0.06105220913887024, + "step": 96440 + }, + { + "epoch": 13.69056068133428, + "grad_norm": 0.026344293728470802, + 
"learning_rate": 8.631540099361249e-05, + "loss": 0.03273046612739563, + "step": 96450 + }, + { + "epoch": 13.691980127750178, + "grad_norm": 3.042829990386963, + "learning_rate": 8.63139815471966e-05, + "loss": 0.012820084393024445, + "step": 96460 + }, + { + "epoch": 13.693399574166076, + "grad_norm": 0.07684643566608429, + "learning_rate": 8.63125621007807e-05, + "loss": 0.012618523836135865, + "step": 96470 + }, + { + "epoch": 13.694819020581972, + "grad_norm": 0.09263870865106583, + "learning_rate": 8.631114265436481e-05, + "loss": 0.01064542829990387, + "step": 96480 + }, + { + "epoch": 13.69623846699787, + "grad_norm": 5.022707939147949, + "learning_rate": 8.630972320794891e-05, + "loss": 0.01942266374826431, + "step": 96490 + }, + { + "epoch": 13.697657913413769, + "grad_norm": 1.1834547519683838, + "learning_rate": 8.6308303761533e-05, + "loss": 0.03992840051651001, + "step": 96500 + }, + { + "epoch": 13.697657913413769, + "eval_accuracy": 0.9791441470083296, + "eval_loss": 0.07206237316131592, + "eval_runtime": 32.6855, + "eval_samples_per_second": 481.161, + "eval_steps_per_second": 15.053, + "step": 96500 + }, + { + "epoch": 13.699077359829666, + "grad_norm": 0.13294564187526703, + "learning_rate": 8.630688431511712e-05, + "loss": 0.05398126244544983, + "step": 96510 + }, + { + "epoch": 13.700496806245564, + "grad_norm": 1.2916076183319092, + "learning_rate": 8.630546486870121e-05, + "loss": 0.03420543372631073, + "step": 96520 + }, + { + "epoch": 13.701916252661462, + "grad_norm": 0.049955952912569046, + "learning_rate": 8.630404542228532e-05, + "loss": 0.019472208619117738, + "step": 96530 + }, + { + "epoch": 13.70333569907736, + "grad_norm": 0.1284303069114685, + "learning_rate": 8.630262597586941e-05, + "loss": 0.02896394729614258, + "step": 96540 + }, + { + "epoch": 13.704755145493257, + "grad_norm": 0.5136931538581848, + "learning_rate": 8.630120652945352e-05, + "loss": 0.031944602727890015, + "step": 96550 + }, + { + "epoch": 13.706174591909155, 
+ "grad_norm": 1.1831598281860352, + "learning_rate": 8.629978708303762e-05, + "loss": 0.010644739121198654, + "step": 96560 + }, + { + "epoch": 13.707594038325054, + "grad_norm": 0.017192769795656204, + "learning_rate": 8.629836763662173e-05, + "loss": 0.013148699700832368, + "step": 96570 + }, + { + "epoch": 13.70901348474095, + "grad_norm": 0.10975632816553116, + "learning_rate": 8.629694819020582e-05, + "loss": 0.01649189442396164, + "step": 96580 + }, + { + "epoch": 13.710432931156848, + "grad_norm": 1.0271081924438477, + "learning_rate": 8.629552874378992e-05, + "loss": 0.0365587055683136, + "step": 96590 + }, + { + "epoch": 13.711852377572747, + "grad_norm": 6.755058765411377, + "learning_rate": 8.629410929737403e-05, + "loss": 0.02149779498577118, + "step": 96600 + }, + { + "epoch": 13.713271823988645, + "grad_norm": 0.44150310754776, + "learning_rate": 8.629268985095813e-05, + "loss": 0.016236330568790435, + "step": 96610 + }, + { + "epoch": 13.714691270404542, + "grad_norm": 0.09499189257621765, + "learning_rate": 8.629127040454224e-05, + "loss": 0.023830153048038483, + "step": 96620 + }, + { + "epoch": 13.71611071682044, + "grad_norm": 0.6429286003112793, + "learning_rate": 8.628985095812634e-05, + "loss": 0.024831396341323853, + "step": 96630 + }, + { + "epoch": 13.717530163236338, + "grad_norm": 0.37857896089553833, + "learning_rate": 8.628843151171044e-05, + "loss": 0.04653356075286865, + "step": 96640 + }, + { + "epoch": 13.718949609652235, + "grad_norm": 0.8553446531295776, + "learning_rate": 8.628701206529453e-05, + "loss": 0.016309410333633423, + "step": 96650 + }, + { + "epoch": 13.720369056068133, + "grad_norm": 12.546866416931152, + "learning_rate": 8.628559261887864e-05, + "loss": 0.04090455174446106, + "step": 96660 + }, + { + "epoch": 13.721788502484031, + "grad_norm": 0.5049958229064941, + "learning_rate": 8.628417317246274e-05, + "loss": 0.053654146194458005, + "step": 96670 + }, + { + "epoch": 13.72320794889993, + "grad_norm": 
0.32195183634757996, + "learning_rate": 8.628275372604685e-05, + "loss": 0.0933415949344635, + "step": 96680 + }, + { + "epoch": 13.724627395315826, + "grad_norm": 2.5842678546905518, + "learning_rate": 8.628133427963095e-05, + "loss": 0.028504377603530882, + "step": 96690 + }, + { + "epoch": 13.726046841731725, + "grad_norm": 0.4025324583053589, + "learning_rate": 8.627991483321505e-05, + "loss": 0.04696339964866638, + "step": 96700 + }, + { + "epoch": 13.727466288147623, + "grad_norm": 0.9358816742897034, + "learning_rate": 8.627849538679916e-05, + "loss": 0.02000337541103363, + "step": 96710 + }, + { + "epoch": 13.72888573456352, + "grad_norm": 0.13171012699604034, + "learning_rate": 8.627707594038326e-05, + "loss": 0.012757310271263122, + "step": 96720 + }, + { + "epoch": 13.730305180979418, + "grad_norm": 4.228630065917969, + "learning_rate": 8.627565649396737e-05, + "loss": 0.016465026140213012, + "step": 96730 + }, + { + "epoch": 13.731724627395316, + "grad_norm": 1.9195640087127686, + "learning_rate": 8.627423704755145e-05, + "loss": 0.06258389949798585, + "step": 96740 + }, + { + "epoch": 13.733144073811214, + "grad_norm": 0.1192382425069809, + "learning_rate": 8.627281760113556e-05, + "loss": 0.02845693230628967, + "step": 96750 + }, + { + "epoch": 13.73456352022711, + "grad_norm": 0.35611721873283386, + "learning_rate": 8.627139815471966e-05, + "loss": 0.024792948365211488, + "step": 96760 + }, + { + "epoch": 13.735982966643009, + "grad_norm": 0.27649229764938354, + "learning_rate": 8.626997870830377e-05, + "loss": 0.027475398778915406, + "step": 96770 + }, + { + "epoch": 13.737402413058907, + "grad_norm": 0.01736115850508213, + "learning_rate": 8.626855926188787e-05, + "loss": 0.0017727486789226531, + "step": 96780 + }, + { + "epoch": 13.738821859474804, + "grad_norm": 0.0943591445684433, + "learning_rate": 8.626713981547196e-05, + "loss": 0.053753846883773805, + "step": 96790 + }, + { + "epoch": 13.740241305890702, + "grad_norm": 5.441102981567383, + 
"learning_rate": 8.626572036905607e-05, + "loss": 0.018227586150169374, + "step": 96800 + }, + { + "epoch": 13.7416607523066, + "grad_norm": 0.41876062750816345, + "learning_rate": 8.626430092264017e-05, + "loss": 0.048694318532943724, + "step": 96810 + }, + { + "epoch": 13.743080198722499, + "grad_norm": 2.1366820335388184, + "learning_rate": 8.626288147622428e-05, + "loss": 0.020703762769699097, + "step": 96820 + }, + { + "epoch": 13.744499645138395, + "grad_norm": 0.06722243875265121, + "learning_rate": 8.626146202980838e-05, + "loss": 0.04667206406593323, + "step": 96830 + }, + { + "epoch": 13.745919091554294, + "grad_norm": 5.052031517028809, + "learning_rate": 8.626004258339249e-05, + "loss": 0.04078722596168518, + "step": 96840 + }, + { + "epoch": 13.747338537970192, + "grad_norm": 4.313608646392822, + "learning_rate": 8.625862313697658e-05, + "loss": 0.02702971696853638, + "step": 96850 + }, + { + "epoch": 13.748757984386089, + "grad_norm": 11.140143394470215, + "learning_rate": 8.625720369056069e-05, + "loss": 0.05737009048461914, + "step": 96860 + }, + { + "epoch": 13.750177430801987, + "grad_norm": 1.1045628786087036, + "learning_rate": 8.625578424414478e-05, + "loss": 0.017936806380748748, + "step": 96870 + }, + { + "epoch": 13.751596877217885, + "grad_norm": 0.9916929006576538, + "learning_rate": 8.62543647977289e-05, + "loss": 0.036751154065132144, + "step": 96880 + }, + { + "epoch": 13.753016323633783, + "grad_norm": 0.5851067900657654, + "learning_rate": 8.625294535131299e-05, + "loss": 0.06546493172645569, + "step": 96890 + }, + { + "epoch": 13.75443577004968, + "grad_norm": 2.6111655235290527, + "learning_rate": 8.625152590489709e-05, + "loss": 0.010462664812803269, + "step": 96900 + }, + { + "epoch": 13.755855216465578, + "grad_norm": 0.018237149342894554, + "learning_rate": 8.62501064584812e-05, + "loss": 0.01750764548778534, + "step": 96910 + }, + { + "epoch": 13.757274662881477, + "grad_norm": 0.44217807054519653, + "learning_rate": 
8.62486870120653e-05, + "loss": 0.02455083727836609, + "step": 96920 + }, + { + "epoch": 13.758694109297373, + "grad_norm": 0.14735212922096252, + "learning_rate": 8.624726756564941e-05, + "loss": 0.05003917217254639, + "step": 96930 + }, + { + "epoch": 13.760113555713271, + "grad_norm": 8.448341369628906, + "learning_rate": 8.62458481192335e-05, + "loss": 0.011260174214839935, + "step": 96940 + }, + { + "epoch": 13.76153300212917, + "grad_norm": 0.5719733834266663, + "learning_rate": 8.62444286728176e-05, + "loss": 0.05195282101631164, + "step": 96950 + }, + { + "epoch": 13.762952448545068, + "grad_norm": 0.014794730581343174, + "learning_rate": 8.62430092264017e-05, + "loss": 0.02123808115720749, + "step": 96960 + }, + { + "epoch": 13.764371894960965, + "grad_norm": 0.8494911789894104, + "learning_rate": 8.624158977998581e-05, + "loss": 0.00920439586043358, + "step": 96970 + }, + { + "epoch": 13.765791341376863, + "grad_norm": 0.963769793510437, + "learning_rate": 8.624017033356991e-05, + "loss": 0.03247123658657074, + "step": 96980 + }, + { + "epoch": 13.767210787792761, + "grad_norm": 0.07355058938264847, + "learning_rate": 8.623875088715402e-05, + "loss": 0.015551319718360901, + "step": 96990 + }, + { + "epoch": 13.768630234208658, + "grad_norm": 0.03591252863407135, + "learning_rate": 8.623733144073812e-05, + "loss": 0.014192771911621094, + "step": 97000 + }, + { + "epoch": 13.768630234208658, + "eval_accuracy": 0.9810516945380555, + "eval_loss": 0.06501225382089615, + "eval_runtime": 32.7725, + "eval_samples_per_second": 479.884, + "eval_steps_per_second": 15.013, + "step": 97000 + }, + { + "epoch": 13.770049680624556, + "grad_norm": 0.28048276901245117, + "learning_rate": 8.623591199432221e-05, + "loss": 0.04204888939857483, + "step": 97010 + }, + { + "epoch": 13.771469127040454, + "grad_norm": 0.03193013370037079, + "learning_rate": 8.623449254790633e-05, + "loss": 0.03201920092105866, + "step": 97020 + }, + { + "epoch": 13.772888573456353, + "grad_norm": 
0.08718361705541611, + "learning_rate": 8.623307310149042e-05, + "loss": 0.018675795197486876, + "step": 97030 + }, + { + "epoch": 13.77430801987225, + "grad_norm": 1.2164140939712524, + "learning_rate": 8.623165365507453e-05, + "loss": 0.028575024008750914, + "step": 97040 + }, + { + "epoch": 13.775727466288147, + "grad_norm": 12.365880966186523, + "learning_rate": 8.623023420865862e-05, + "loss": 0.0183505117893219, + "step": 97050 + }, + { + "epoch": 13.777146912704046, + "grad_norm": 4.400852680206299, + "learning_rate": 8.622881476224273e-05, + "loss": 0.00443883091211319, + "step": 97060 + }, + { + "epoch": 13.778566359119942, + "grad_norm": 0.044489286839962006, + "learning_rate": 8.622739531582683e-05, + "loss": 0.006819433718919754, + "step": 97070 + }, + { + "epoch": 13.77998580553584, + "grad_norm": 0.26168087124824524, + "learning_rate": 8.622597586941094e-05, + "loss": 0.031000277400016783, + "step": 97080 + }, + { + "epoch": 13.781405251951739, + "grad_norm": 0.05729484185576439, + "learning_rate": 8.622455642299503e-05, + "loss": 0.016029319167137145, + "step": 97090 + }, + { + "epoch": 13.782824698367637, + "grad_norm": 0.1679743528366089, + "learning_rate": 8.622313697657913e-05, + "loss": 0.007387077063322067, + "step": 97100 + }, + { + "epoch": 13.784244144783534, + "grad_norm": 1.0637929439544678, + "learning_rate": 8.622171753016324e-05, + "loss": 0.034793981909751893, + "step": 97110 + }, + { + "epoch": 13.785663591199432, + "grad_norm": 0.06055055558681488, + "learning_rate": 8.622029808374734e-05, + "loss": 0.0071873687207698825, + "step": 97120 + }, + { + "epoch": 13.78708303761533, + "grad_norm": 6.253574371337891, + "learning_rate": 8.621887863733145e-05, + "loss": 0.02324788421392441, + "step": 97130 + }, + { + "epoch": 13.788502484031227, + "grad_norm": 8.996084213256836, + "learning_rate": 8.621745919091555e-05, + "loss": 0.02676660418510437, + "step": 97140 + }, + { + "epoch": 13.789921930447125, + "grad_norm": 0.2644156217575073, + 
"learning_rate": 8.621603974449966e-05, + "loss": 0.00823134183883667, + "step": 97150 + }, + { + "epoch": 13.791341376863024, + "grad_norm": 7.25373649597168, + "learning_rate": 8.621462029808374e-05, + "loss": 0.02458227276802063, + "step": 97160 + }, + { + "epoch": 13.792760823278922, + "grad_norm": 13.660566329956055, + "learning_rate": 8.621320085166785e-05, + "loss": 0.027688038349151612, + "step": 97170 + }, + { + "epoch": 13.794180269694818, + "grad_norm": 0.9016245007514954, + "learning_rate": 8.621178140525195e-05, + "loss": 0.02340591847896576, + "step": 97180 + }, + { + "epoch": 13.795599716110717, + "grad_norm": 0.5798392295837402, + "learning_rate": 8.621036195883606e-05, + "loss": 0.024847133457660674, + "step": 97190 + }, + { + "epoch": 13.797019162526615, + "grad_norm": 0.1002761498093605, + "learning_rate": 8.620894251242016e-05, + "loss": 0.05150417685508728, + "step": 97200 + }, + { + "epoch": 13.798438608942512, + "grad_norm": 0.3393736481666565, + "learning_rate": 8.620752306600426e-05, + "loss": 0.07004774808883667, + "step": 97210 + }, + { + "epoch": 13.79985805535841, + "grad_norm": 7.55497407913208, + "learning_rate": 8.620610361958837e-05, + "loss": 0.008413270115852356, + "step": 97220 + }, + { + "epoch": 13.801277501774308, + "grad_norm": 1.928196668624878, + "learning_rate": 8.620468417317247e-05, + "loss": 0.02218780517578125, + "step": 97230 + }, + { + "epoch": 13.802696948190206, + "grad_norm": 2.2993032932281494, + "learning_rate": 8.620326472675658e-05, + "loss": 0.022586297988891602, + "step": 97240 + }, + { + "epoch": 13.804116394606103, + "grad_norm": 0.03581158444285393, + "learning_rate": 8.620184528034067e-05, + "loss": 0.00915946438908577, + "step": 97250 + }, + { + "epoch": 13.805535841022001, + "grad_norm": 0.1405903398990631, + "learning_rate": 8.620042583392477e-05, + "loss": 0.016153512895107268, + "step": 97260 + }, + { + "epoch": 13.8069552874379, + "grad_norm": 2.959599018096924, + "learning_rate": 
8.619900638750887e-05, + "loss": 0.02759789824485779, + "step": 97270 + }, + { + "epoch": 13.808374733853796, + "grad_norm": 0.08808876574039459, + "learning_rate": 8.619758694109298e-05, + "loss": 0.05625124573707581, + "step": 97280 + }, + { + "epoch": 13.809794180269694, + "grad_norm": 3.4986207485198975, + "learning_rate": 8.619616749467708e-05, + "loss": 0.009423717856407166, + "step": 97290 + }, + { + "epoch": 13.811213626685593, + "grad_norm": 0.2367706447839737, + "learning_rate": 8.619474804826119e-05, + "loss": 0.03007735013961792, + "step": 97300 + }, + { + "epoch": 13.812633073101491, + "grad_norm": 1.6623224020004272, + "learning_rate": 8.619332860184528e-05, + "loss": 0.0030352432280778886, + "step": 97310 + }, + { + "epoch": 13.814052519517388, + "grad_norm": 0.6825453639030457, + "learning_rate": 8.619190915542938e-05, + "loss": 0.015154258906841278, + "step": 97320 + }, + { + "epoch": 13.815471965933286, + "grad_norm": 0.1828623265028, + "learning_rate": 8.619048970901349e-05, + "loss": 0.0347985178232193, + "step": 97330 + }, + { + "epoch": 13.816891412349184, + "grad_norm": 0.72845458984375, + "learning_rate": 8.618907026259759e-05, + "loss": 0.016802479326725007, + "step": 97340 + }, + { + "epoch": 13.81831085876508, + "grad_norm": 0.04905217885971069, + "learning_rate": 8.61876508161817e-05, + "loss": 0.028523996472358704, + "step": 97350 + }, + { + "epoch": 13.819730305180979, + "grad_norm": 0.05003027990460396, + "learning_rate": 8.618623136976579e-05, + "loss": 0.03651518523693085, + "step": 97360 + }, + { + "epoch": 13.821149751596877, + "grad_norm": 0.8443085551261902, + "learning_rate": 8.61848119233499e-05, + "loss": 0.06068788170814514, + "step": 97370 + }, + { + "epoch": 13.822569198012776, + "grad_norm": 3.9706737995147705, + "learning_rate": 8.6183392476934e-05, + "loss": 0.009664274752140045, + "step": 97380 + }, + { + "epoch": 13.823988644428672, + "grad_norm": 7.973998546600342, + "learning_rate": 8.61819730305181e-05, + "loss": 
0.019925667345523833, + "step": 97390 + }, + { + "epoch": 13.82540809084457, + "grad_norm": 0.016353856772184372, + "learning_rate": 8.61805535841022e-05, + "loss": 0.012330979853868485, + "step": 97400 + }, + { + "epoch": 13.826827537260469, + "grad_norm": 0.08897315710783005, + "learning_rate": 8.61791341376863e-05, + "loss": 0.011504063010215759, + "step": 97410 + }, + { + "epoch": 13.828246983676365, + "grad_norm": 0.01945200376212597, + "learning_rate": 8.617771469127041e-05, + "loss": 0.04555101692676544, + "step": 97420 + }, + { + "epoch": 13.829666430092264, + "grad_norm": 10.132003784179688, + "learning_rate": 8.617629524485451e-05, + "loss": 0.02212253361940384, + "step": 97430 + }, + { + "epoch": 13.831085876508162, + "grad_norm": 0.14945967495441437, + "learning_rate": 8.617487579843862e-05, + "loss": 0.01767318695783615, + "step": 97440 + }, + { + "epoch": 13.83250532292406, + "grad_norm": 3.7053380012512207, + "learning_rate": 8.617345635202272e-05, + "loss": 0.020854970812797545, + "step": 97450 + }, + { + "epoch": 13.833924769339957, + "grad_norm": 5.3743109703063965, + "learning_rate": 8.617203690560681e-05, + "loss": 0.041898411512374875, + "step": 97460 + }, + { + "epoch": 13.835344215755855, + "grad_norm": 0.31361621618270874, + "learning_rate": 8.617061745919091e-05, + "loss": 0.010599453002214432, + "step": 97470 + }, + { + "epoch": 13.836763662171753, + "grad_norm": 0.018387148156762123, + "learning_rate": 8.616919801277502e-05, + "loss": 0.014518235623836518, + "step": 97480 + }, + { + "epoch": 13.83818310858765, + "grad_norm": 0.4590878188610077, + "learning_rate": 8.616777856635912e-05, + "loss": 0.032176035642623904, + "step": 97490 + }, + { + "epoch": 13.839602555003548, + "grad_norm": 0.02756396494805813, + "learning_rate": 8.616635911994323e-05, + "loss": 0.02949948012828827, + "step": 97500 + }, + { + "epoch": 13.839602555003548, + "eval_accuracy": 0.9776181089845488, + "eval_loss": 0.0747826024889946, + "eval_runtime": 30.7964, + 
"eval_samples_per_second": 510.676, + "eval_steps_per_second": 15.976, + "step": 97500 + }, + { + "epoch": 13.841022001419446, + "grad_norm": 0.6445552706718445, + "learning_rate": 8.616493967352734e-05, + "loss": 0.05971266627311707, + "step": 97510 + }, + { + "epoch": 13.842441447835345, + "grad_norm": 0.09805190563201904, + "learning_rate": 8.616352022711142e-05, + "loss": 0.024461531639099122, + "step": 97520 + }, + { + "epoch": 13.843860894251241, + "grad_norm": 6.539288520812988, + "learning_rate": 8.616210078069554e-05, + "loss": 0.019382362067699433, + "step": 97530 + }, + { + "epoch": 13.84528034066714, + "grad_norm": 1.607917070388794, + "learning_rate": 8.616068133427963e-05, + "loss": 0.011100460588932038, + "step": 97540 + }, + { + "epoch": 13.846699787083038, + "grad_norm": 0.9553514719009399, + "learning_rate": 8.615926188786374e-05, + "loss": 0.041931447386741635, + "step": 97550 + }, + { + "epoch": 13.848119233498934, + "grad_norm": 0.7937570214271545, + "learning_rate": 8.615784244144784e-05, + "loss": 0.026648005843162535, + "step": 97560 + }, + { + "epoch": 13.849538679914833, + "grad_norm": 0.1172422394156456, + "learning_rate": 8.615642299503194e-05, + "loss": 0.019187642633914946, + "step": 97570 + }, + { + "epoch": 13.850958126330731, + "grad_norm": 0.2581722140312195, + "learning_rate": 8.615500354861604e-05, + "loss": 0.021943846344947816, + "step": 97580 + }, + { + "epoch": 13.85237757274663, + "grad_norm": 1.9560277462005615, + "learning_rate": 8.615358410220015e-05, + "loss": 0.025108674168586732, + "step": 97590 + }, + { + "epoch": 13.853797019162526, + "grad_norm": 0.5822705626487732, + "learning_rate": 8.615216465578424e-05, + "loss": 0.03498307466506958, + "step": 97600 + }, + { + "epoch": 13.855216465578424, + "grad_norm": 3.110405921936035, + "learning_rate": 8.615074520936836e-05, + "loss": 0.030993205308914185, + "step": 97610 + }, + { + "epoch": 13.856635911994323, + "grad_norm": 5.9054741859436035, + "learning_rate": 
8.614932576295245e-05, + "loss": 0.028361022472381592, + "step": 97620 + }, + { + "epoch": 13.858055358410219, + "grad_norm": 0.049585871398448944, + "learning_rate": 8.614790631653655e-05, + "loss": 0.016482987999916078, + "step": 97630 + }, + { + "epoch": 13.859474804826117, + "grad_norm": 2.339905261993408, + "learning_rate": 8.614648687012066e-05, + "loss": 0.03827682137489319, + "step": 97640 + }, + { + "epoch": 13.860894251242016, + "grad_norm": 0.04161804914474487, + "learning_rate": 8.614506742370476e-05, + "loss": 0.02619813084602356, + "step": 97650 + }, + { + "epoch": 13.862313697657914, + "grad_norm": 0.275062620639801, + "learning_rate": 8.614364797728887e-05, + "loss": 0.03152703642845154, + "step": 97660 + }, + { + "epoch": 13.86373314407381, + "grad_norm": 2.660701036453247, + "learning_rate": 8.614222853087295e-05, + "loss": 0.024106189608573914, + "step": 97670 + }, + { + "epoch": 13.865152590489709, + "grad_norm": 5.393669605255127, + "learning_rate": 8.614080908445706e-05, + "loss": 0.029433247447013856, + "step": 97680 + }, + { + "epoch": 13.866572036905607, + "grad_norm": 2.5080268383026123, + "learning_rate": 8.613938963804116e-05, + "loss": 0.048150715231895444, + "step": 97690 + }, + { + "epoch": 13.867991483321505, + "grad_norm": 1.032168984413147, + "learning_rate": 8.613797019162527e-05, + "loss": 0.00849594920873642, + "step": 97700 + }, + { + "epoch": 13.869410929737402, + "grad_norm": 0.4703840911388397, + "learning_rate": 8.613655074520938e-05, + "loss": 0.022085925936698912, + "step": 97710 + }, + { + "epoch": 13.8708303761533, + "grad_norm": 0.8159363865852356, + "learning_rate": 8.613513129879347e-05, + "loss": 0.023894134163856506, + "step": 97720 + }, + { + "epoch": 13.872249822569199, + "grad_norm": 1.7675553560256958, + "learning_rate": 8.613371185237758e-05, + "loss": 0.03863615989685058, + "step": 97730 + }, + { + "epoch": 13.873669268985095, + "grad_norm": 0.1795162409543991, + "learning_rate": 8.613229240596168e-05, + 
"loss": 0.047685247659683225, + "step": 97740 + }, + { + "epoch": 13.875088715400993, + "grad_norm": 0.5392060875892639, + "learning_rate": 8.613087295954579e-05, + "loss": 0.031848755478858945, + "step": 97750 + }, + { + "epoch": 13.876508161816892, + "grad_norm": 0.1371711939573288, + "learning_rate": 8.612945351312988e-05, + "loss": 0.003922409191727638, + "step": 97760 + }, + { + "epoch": 13.87792760823279, + "grad_norm": 13.345478057861328, + "learning_rate": 8.612803406671398e-05, + "loss": 0.026858839392662048, + "step": 97770 + }, + { + "epoch": 13.879347054648687, + "grad_norm": 1.7686020135879517, + "learning_rate": 8.612661462029808e-05, + "loss": 0.028010132908821105, + "step": 97780 + }, + { + "epoch": 13.880766501064585, + "grad_norm": 11.41087532043457, + "learning_rate": 8.612519517388219e-05, + "loss": 0.039809000492095944, + "step": 97790 + }, + { + "epoch": 13.882185947480483, + "grad_norm": 1.6744792461395264, + "learning_rate": 8.61237757274663e-05, + "loss": 0.0045928433537483215, + "step": 97800 + }, + { + "epoch": 13.88360539389638, + "grad_norm": 0.4395209848880768, + "learning_rate": 8.61223562810504e-05, + "loss": 0.01665394753217697, + "step": 97810 + }, + { + "epoch": 13.885024840312278, + "grad_norm": 1.1791696548461914, + "learning_rate": 8.61209368346345e-05, + "loss": 0.016723376512527467, + "step": 97820 + }, + { + "epoch": 13.886444286728176, + "grad_norm": 0.04103924706578255, + "learning_rate": 8.611951738821859e-05, + "loss": 0.03735288977622986, + "step": 97830 + }, + { + "epoch": 13.887863733144075, + "grad_norm": 0.374519407749176, + "learning_rate": 8.61180979418027e-05, + "loss": 0.03749838471412659, + "step": 97840 + }, + { + "epoch": 13.889283179559971, + "grad_norm": 3.321499824523926, + "learning_rate": 8.61166784953868e-05, + "loss": 0.014933030307292938, + "step": 97850 + }, + { + "epoch": 13.89070262597587, + "grad_norm": 1.2417290210723877, + "learning_rate": 8.611525904897091e-05, + "loss": 0.02480054199695587, + 
"step": 97860 + }, + { + "epoch": 13.892122072391768, + "grad_norm": 8.301517486572266, + "learning_rate": 8.611383960255501e-05, + "loss": 0.03353300094604492, + "step": 97870 + }, + { + "epoch": 13.893541518807664, + "grad_norm": 1.6440887451171875, + "learning_rate": 8.61124201561391e-05, + "loss": 0.029333028197288512, + "step": 97880 + }, + { + "epoch": 13.894960965223563, + "grad_norm": 0.038577500730752945, + "learning_rate": 8.611100070972322e-05, + "loss": 0.008193275332450867, + "step": 97890 + }, + { + "epoch": 13.896380411639461, + "grad_norm": 11.59971809387207, + "learning_rate": 8.610958126330731e-05, + "loss": 0.01720527410507202, + "step": 97900 + }, + { + "epoch": 13.89779985805536, + "grad_norm": 0.11012128740549088, + "learning_rate": 8.610816181689143e-05, + "loss": 0.028680214285850526, + "step": 97910 + }, + { + "epoch": 13.899219304471256, + "grad_norm": 0.0679960697889328, + "learning_rate": 8.610674237047552e-05, + "loss": 0.015310463309288026, + "step": 97920 + }, + { + "epoch": 13.900638750887154, + "grad_norm": 1.3655829429626465, + "learning_rate": 8.610532292405962e-05, + "loss": 0.03698239922523498, + "step": 97930 + }, + { + "epoch": 13.902058197303052, + "grad_norm": 0.7326348423957825, + "learning_rate": 8.610390347764372e-05, + "loss": 0.011951953172683716, + "step": 97940 + }, + { + "epoch": 13.903477643718949, + "grad_norm": 0.3693626821041107, + "learning_rate": 8.610248403122783e-05, + "loss": 0.02224213778972626, + "step": 97950 + }, + { + "epoch": 13.904897090134847, + "grad_norm": 0.18457281589508057, + "learning_rate": 8.610106458481193e-05, + "loss": 0.04400915801525116, + "step": 97960 + }, + { + "epoch": 13.906316536550746, + "grad_norm": 0.31119829416275024, + "learning_rate": 8.609964513839604e-05, + "loss": 0.025654134154319764, + "step": 97970 + }, + { + "epoch": 13.907735982966644, + "grad_norm": 0.9079119563102722, + "learning_rate": 8.609822569198013e-05, + "loss": 0.020143988728523254, + "step": 97980 + }, + { 
+ "epoch": 13.90915542938254, + "grad_norm": 14.945606231689453, + "learning_rate": 8.609680624556423e-05, + "loss": 0.028208765387535095, + "step": 97990 + }, + { + "epoch": 13.910574875798439, + "grad_norm": 0.9307637214660645, + "learning_rate": 8.609538679914834e-05, + "loss": 0.0067070737481117245, + "step": 98000 + }, + { + "epoch": 13.910574875798439, + "eval_accuracy": 0.9830864119030965, + "eval_loss": 0.06170034408569336, + "eval_runtime": 32.3419, + "eval_samples_per_second": 486.273, + "eval_steps_per_second": 15.212, + "step": 98000 + }, + { + "epoch": 13.911994322214337, + "grad_norm": 0.025303415954113007, + "learning_rate": 8.609396735273244e-05, + "loss": 0.027243292331695555, + "step": 98010 + }, + { + "epoch": 13.913413768630233, + "grad_norm": 2.292128086090088, + "learning_rate": 8.609254790631655e-05, + "loss": 0.025096246600151063, + "step": 98020 + }, + { + "epoch": 13.914833215046132, + "grad_norm": 1.6361472606658936, + "learning_rate": 8.609112845990063e-05, + "loss": 0.014097224175930022, + "step": 98030 + }, + { + "epoch": 13.91625266146203, + "grad_norm": 0.0516996905207634, + "learning_rate": 8.608970901348475e-05, + "loss": 0.036923548579216, + "step": 98040 + }, + { + "epoch": 13.917672107877928, + "grad_norm": 2.1145389080047607, + "learning_rate": 8.608828956706884e-05, + "loss": 0.019678601622581483, + "step": 98050 + }, + { + "epoch": 13.919091554293825, + "grad_norm": 6.481072425842285, + "learning_rate": 8.608687012065295e-05, + "loss": 0.01554270088672638, + "step": 98060 + }, + { + "epoch": 13.920511000709723, + "grad_norm": 0.5275385975837708, + "learning_rate": 8.608545067423705e-05, + "loss": 0.05277801156044006, + "step": 98070 + }, + { + "epoch": 13.921930447125622, + "grad_norm": 0.02168644405901432, + "learning_rate": 8.608403122782115e-05, + "loss": 0.019489049911499023, + "step": 98080 + }, + { + "epoch": 13.923349893541518, + "grad_norm": 0.02501189522445202, + "learning_rate": 8.608261178140526e-05, + "loss": 
0.08635119199752808, + "step": 98090 + }, + { + "epoch": 13.924769339957416, + "grad_norm": 0.07429813593626022, + "learning_rate": 8.608119233498936e-05, + "loss": 0.025414294004440306, + "step": 98100 + }, + { + "epoch": 13.926188786373315, + "grad_norm": 13.504375457763672, + "learning_rate": 8.607977288857347e-05, + "loss": 0.04383018016815186, + "step": 98110 + }, + { + "epoch": 13.927608232789213, + "grad_norm": 0.5328422784805298, + "learning_rate": 8.607835344215757e-05, + "loss": 0.015777355432510375, + "step": 98120 + }, + { + "epoch": 13.92902767920511, + "grad_norm": 0.07831300050020218, + "learning_rate": 8.607693399574166e-05, + "loss": 0.027874544262886047, + "step": 98130 + }, + { + "epoch": 13.930447125621008, + "grad_norm": 1.3268852233886719, + "learning_rate": 8.607551454932576e-05, + "loss": 0.04068158566951752, + "step": 98140 + }, + { + "epoch": 13.931866572036906, + "grad_norm": 3.3505053520202637, + "learning_rate": 8.607409510290987e-05, + "loss": 0.013634760677814484, + "step": 98150 + }, + { + "epoch": 13.933286018452803, + "grad_norm": 0.1177496388554573, + "learning_rate": 8.607267565649397e-05, + "loss": 0.012508463859558106, + "step": 98160 + }, + { + "epoch": 13.934705464868701, + "grad_norm": 0.4416254162788391, + "learning_rate": 8.607125621007808e-05, + "loss": 0.025961068272590638, + "step": 98170 + }, + { + "epoch": 13.9361249112846, + "grad_norm": 7.925416946411133, + "learning_rate": 8.606983676366218e-05, + "loss": 0.03427475094795227, + "step": 98180 + }, + { + "epoch": 13.937544357700498, + "grad_norm": 0.23578715324401855, + "learning_rate": 8.606841731724627e-05, + "loss": 0.02209036499261856, + "step": 98190 + }, + { + "epoch": 13.938963804116394, + "grad_norm": 0.14721371233463287, + "learning_rate": 8.606699787083038e-05, + "loss": 0.10272301435470581, + "step": 98200 + }, + { + "epoch": 13.940383250532292, + "grad_norm": 3.374873161315918, + "learning_rate": 8.606557842441448e-05, + "loss": 0.020183584094047545, + 
"step": 98210 + }, + { + "epoch": 13.94180269694819, + "grad_norm": 8.487064361572266, + "learning_rate": 8.606415897799859e-05, + "loss": 0.032809144258499144, + "step": 98220 + }, + { + "epoch": 13.943222143364087, + "grad_norm": 1.1708842515945435, + "learning_rate": 8.606273953158269e-05, + "loss": 0.053074246644973753, + "step": 98230 + }, + { + "epoch": 13.944641589779986, + "grad_norm": 21.59556007385254, + "learning_rate": 8.606132008516679e-05, + "loss": 0.10838912725448609, + "step": 98240 + }, + { + "epoch": 13.946061036195884, + "grad_norm": 0.26949456334114075, + "learning_rate": 8.605990063875089e-05, + "loss": 0.020542636513710022, + "step": 98250 + }, + { + "epoch": 13.947480482611782, + "grad_norm": 1.2219308614730835, + "learning_rate": 8.6058481192335e-05, + "loss": 0.022973744571208952, + "step": 98260 + }, + { + "epoch": 13.948899929027679, + "grad_norm": 2.3675146102905273, + "learning_rate": 8.60570617459191e-05, + "loss": 0.03723432123661041, + "step": 98270 + }, + { + "epoch": 13.950319375443577, + "grad_norm": 0.1154785230755806, + "learning_rate": 8.60556422995032e-05, + "loss": 0.03642845153808594, + "step": 98280 + }, + { + "epoch": 13.951738821859475, + "grad_norm": 1.0712108612060547, + "learning_rate": 8.60542228530873e-05, + "loss": 0.03615443110466003, + "step": 98290 + }, + { + "epoch": 13.953158268275372, + "grad_norm": 0.4232095777988434, + "learning_rate": 8.60528034066714e-05, + "loss": 0.02514898478984833, + "step": 98300 + }, + { + "epoch": 13.95457771469127, + "grad_norm": 0.08924500644207001, + "learning_rate": 8.605138396025551e-05, + "loss": 0.02497561275959015, + "step": 98310 + }, + { + "epoch": 13.955997161107168, + "grad_norm": 0.053555138409137726, + "learning_rate": 8.604996451383961e-05, + "loss": 0.010297687351703643, + "step": 98320 + }, + { + "epoch": 13.957416607523067, + "grad_norm": 0.3748902380466461, + "learning_rate": 8.604854506742372e-05, + "loss": 0.007654424756765366, + "step": 98330 + }, + { + 
"epoch": 13.958836053938963, + "grad_norm": 8.75135326385498, + "learning_rate": 8.60471256210078e-05, + "loss": 0.03046306371688843, + "step": 98340 + }, + { + "epoch": 13.960255500354862, + "grad_norm": 0.07814659923315048, + "learning_rate": 8.604570617459191e-05, + "loss": 0.006948675215244293, + "step": 98350 + }, + { + "epoch": 13.96167494677076, + "grad_norm": 0.058386798948049545, + "learning_rate": 8.604428672817601e-05, + "loss": 0.04257642328739166, + "step": 98360 + }, + { + "epoch": 13.963094393186656, + "grad_norm": 1.3675897121429443, + "learning_rate": 8.604286728176012e-05, + "loss": 0.03919885754585266, + "step": 98370 + }, + { + "epoch": 13.964513839602555, + "grad_norm": 10.515870094299316, + "learning_rate": 8.604144783534422e-05, + "loss": 0.03455580770969391, + "step": 98380 + }, + { + "epoch": 13.965933286018453, + "grad_norm": 6.172956466674805, + "learning_rate": 8.604002838892832e-05, + "loss": 0.03574661910533905, + "step": 98390 + }, + { + "epoch": 13.967352732434351, + "grad_norm": 0.2484154850244522, + "learning_rate": 8.603860894251243e-05, + "loss": 0.01784258782863617, + "step": 98400 + }, + { + "epoch": 13.968772178850248, + "grad_norm": 0.031984057277441025, + "learning_rate": 8.603718949609652e-05, + "loss": 0.010730892419815063, + "step": 98410 + }, + { + "epoch": 13.970191625266146, + "grad_norm": 0.11616003513336182, + "learning_rate": 8.603577004968064e-05, + "loss": 0.014920552074909211, + "step": 98420 + }, + { + "epoch": 13.971611071682045, + "grad_norm": 0.31973591446876526, + "learning_rate": 8.603435060326473e-05, + "loss": 0.022470778226852416, + "step": 98430 + }, + { + "epoch": 13.973030518097941, + "grad_norm": 0.07785550504922867, + "learning_rate": 8.603293115684883e-05, + "loss": 0.022737446427345275, + "step": 98440 + }, + { + "epoch": 13.97444996451384, + "grad_norm": 9.522161483764648, + "learning_rate": 8.603151171043293e-05, + "loss": 0.05277225971221924, + "step": 98450 + }, + { + "epoch": 
13.975869410929738, + "grad_norm": 0.9376153349876404, + "learning_rate": 8.603009226401704e-05, + "loss": 0.03830648064613342, + "step": 98460 + }, + { + "epoch": 13.977288857345636, + "grad_norm": 0.26860079169273376, + "learning_rate": 8.602867281760114e-05, + "loss": 0.004900941625237465, + "step": 98470 + }, + { + "epoch": 13.978708303761533, + "grad_norm": 0.4131862223148346, + "learning_rate": 8.602725337118525e-05, + "loss": 0.022539208829402923, + "step": 98480 + }, + { + "epoch": 13.98012775017743, + "grad_norm": 0.473401814699173, + "learning_rate": 8.602583392476934e-05, + "loss": 0.02035791575908661, + "step": 98490 + }, + { + "epoch": 13.98154719659333, + "grad_norm": 0.4627954661846161, + "learning_rate": 8.602441447835344e-05, + "loss": 0.019416412711143492, + "step": 98500 + }, + { + "epoch": 13.98154719659333, + "eval_accuracy": 0.9834679214090418, + "eval_loss": 0.05638109892606735, + "eval_runtime": 31.4517, + "eval_samples_per_second": 500.036, + "eval_steps_per_second": 15.643, + "step": 98500 + }, + { + "epoch": 13.982966643009226, + "grad_norm": 0.14699269831180573, + "learning_rate": 8.602299503193755e-05, + "loss": 0.013596628606319428, + "step": 98510 + }, + { + "epoch": 13.984386089425124, + "grad_norm": 0.6649786233901978, + "learning_rate": 8.602157558552165e-05, + "loss": 0.02714385688304901, + "step": 98520 + }, + { + "epoch": 13.985805535841022, + "grad_norm": 4.457527160644531, + "learning_rate": 8.602015613910576e-05, + "loss": 0.03538078665733337, + "step": 98530 + }, + { + "epoch": 13.98722498225692, + "grad_norm": 12.919422149658203, + "learning_rate": 8.601873669268984e-05, + "loss": 0.04029761552810669, + "step": 98540 + }, + { + "epoch": 13.988644428672817, + "grad_norm": 0.2152722179889679, + "learning_rate": 8.601731724627396e-05, + "loss": 0.0575924813747406, + "step": 98550 + }, + { + "epoch": 13.990063875088715, + "grad_norm": 4.616553783416748, + "learning_rate": 8.601589779985805e-05, + "loss": 0.018635259568691255, + 
"step": 98560 + }, + { + "epoch": 13.991483321504614, + "grad_norm": 6.969879150390625, + "learning_rate": 8.601447835344216e-05, + "loss": 0.018483972549438475, + "step": 98570 + }, + { + "epoch": 13.99290276792051, + "grad_norm": 0.6723334789276123, + "learning_rate": 8.601305890702626e-05, + "loss": 0.00979367196559906, + "step": 98580 + }, + { + "epoch": 13.994322214336409, + "grad_norm": 1.566240906715393, + "learning_rate": 8.601163946061037e-05, + "loss": 0.007228370010852814, + "step": 98590 + }, + { + "epoch": 13.995741660752307, + "grad_norm": 1.199708104133606, + "learning_rate": 8.601022001419447e-05, + "loss": 0.016369998455047607, + "step": 98600 + }, + { + "epoch": 13.997161107168205, + "grad_norm": 0.20838220417499542, + "learning_rate": 8.600880056777857e-05, + "loss": 0.023891144990921022, + "step": 98610 + }, + { + "epoch": 13.998580553584102, + "grad_norm": 0.1336381584405899, + "learning_rate": 8.600738112136268e-05, + "loss": 0.028340649604797364, + "step": 98620 + }, + { + "epoch": 14.0, + "grad_norm": 0.5767505168914795, + "learning_rate": 8.600596167494678e-05, + "loss": 0.00616534873843193, + "step": 98630 + }, + { + "epoch": 14.001419446415898, + "grad_norm": 7.575362205505371, + "learning_rate": 8.600454222853089e-05, + "loss": 0.044139009714126584, + "step": 98640 + }, + { + "epoch": 14.002838892831795, + "grad_norm": 0.02623675763607025, + "learning_rate": 8.600312278211497e-05, + "loss": 0.017933820188045502, + "step": 98650 + }, + { + "epoch": 14.004258339247693, + "grad_norm": 0.4991348087787628, + "learning_rate": 8.600170333569908e-05, + "loss": 0.02468074709177017, + "step": 98660 + }, + { + "epoch": 14.005677785663591, + "grad_norm": 0.03932322561740875, + "learning_rate": 8.600028388928318e-05, + "loss": 0.04342496395111084, + "step": 98670 + }, + { + "epoch": 14.00709723207949, + "grad_norm": 7.684366226196289, + "learning_rate": 8.599886444286729e-05, + "loss": 0.05863423347473144, + "step": 98680 + }, + { + "epoch": 
14.008516678495386, + "grad_norm": 0.6191020607948303, + "learning_rate": 8.599744499645139e-05, + "loss": 0.03286792933940887, + "step": 98690 + }, + { + "epoch": 14.009936124911285, + "grad_norm": 12.611638069152832, + "learning_rate": 8.599602555003548e-05, + "loss": 0.046319156885147095, + "step": 98700 + }, + { + "epoch": 14.011355571327183, + "grad_norm": 0.5372052192687988, + "learning_rate": 8.59946061036196e-05, + "loss": 0.01128637120127678, + "step": 98710 + }, + { + "epoch": 14.01277501774308, + "grad_norm": 1.9387632608413696, + "learning_rate": 8.599318665720369e-05, + "loss": 0.025460246205329894, + "step": 98720 + }, + { + "epoch": 14.014194464158978, + "grad_norm": 6.6452555656433105, + "learning_rate": 8.59917672107878e-05, + "loss": 0.015543276071548462, + "step": 98730 + }, + { + "epoch": 14.015613910574876, + "grad_norm": 0.17176109552383423, + "learning_rate": 8.59903477643719e-05, + "loss": 0.013405351340770722, + "step": 98740 + }, + { + "epoch": 14.017033356990774, + "grad_norm": 0.30507367849349976, + "learning_rate": 8.5988928317956e-05, + "loss": 0.02677640914916992, + "step": 98750 + }, + { + "epoch": 14.01845280340667, + "grad_norm": 0.07649233937263489, + "learning_rate": 8.59875088715401e-05, + "loss": 0.004966056346893311, + "step": 98760 + }, + { + "epoch": 14.01987224982257, + "grad_norm": 0.1671622395515442, + "learning_rate": 8.59860894251242e-05, + "loss": 0.06767297387123108, + "step": 98770 + }, + { + "epoch": 14.021291696238467, + "grad_norm": 2.377171516418457, + "learning_rate": 8.59846699787083e-05, + "loss": 0.027815097570419313, + "step": 98780 + }, + { + "epoch": 14.022711142654364, + "grad_norm": 0.3002987802028656, + "learning_rate": 8.598325053229241e-05, + "loss": 0.040025681257247925, + "step": 98790 + }, + { + "epoch": 14.024130589070262, + "grad_norm": 0.2978041470050812, + "learning_rate": 8.598183108587651e-05, + "loss": 0.005738424882292747, + "step": 98800 + }, + { + "epoch": 14.02555003548616, + 
"grad_norm": 0.4054115116596222, + "learning_rate": 8.598041163946061e-05, + "loss": 0.057332354784011844, + "step": 98810 + }, + { + "epoch": 14.026969481902059, + "grad_norm": 6.931030750274658, + "learning_rate": 8.597899219304472e-05, + "loss": 0.004104747623205185, + "step": 98820 + }, + { + "epoch": 14.028388928317955, + "grad_norm": 0.2317405343055725, + "learning_rate": 8.597757274662882e-05, + "loss": 0.00666595995426178, + "step": 98830 + }, + { + "epoch": 14.029808374733854, + "grad_norm": 0.5171235799789429, + "learning_rate": 8.597615330021293e-05, + "loss": 0.04026064872741699, + "step": 98840 + }, + { + "epoch": 14.031227821149752, + "grad_norm": 0.126663938164711, + "learning_rate": 8.597473385379701e-05, + "loss": 0.012223343551158904, + "step": 98850 + }, + { + "epoch": 14.032647267565649, + "grad_norm": 11.934538841247559, + "learning_rate": 8.597331440738112e-05, + "loss": 0.02199336588382721, + "step": 98860 + }, + { + "epoch": 14.034066713981547, + "grad_norm": 3.310514450073242, + "learning_rate": 8.597189496096522e-05, + "loss": 0.01725810319185257, + "step": 98870 + }, + { + "epoch": 14.035486160397445, + "grad_norm": 6.979939937591553, + "learning_rate": 8.597047551454933e-05, + "loss": 0.04737822115421295, + "step": 98880 + }, + { + "epoch": 14.036905606813344, + "grad_norm": 0.2535474896430969, + "learning_rate": 8.596905606813343e-05, + "loss": 0.026235219836235047, + "step": 98890 + }, + { + "epoch": 14.03832505322924, + "grad_norm": 0.12438727170228958, + "learning_rate": 8.596763662171753e-05, + "loss": 0.054426532983779904, + "step": 98900 + }, + { + "epoch": 14.039744499645138, + "grad_norm": 13.687928199768066, + "learning_rate": 8.596621717530164e-05, + "loss": 0.05139861702919006, + "step": 98910 + }, + { + "epoch": 14.041163946061037, + "grad_norm": 0.013612110167741776, + "learning_rate": 8.596479772888573e-05, + "loss": 0.047477427124977115, + "step": 98920 + }, + { + "epoch": 14.042583392476933, + "grad_norm": 
3.528958559036255, + "learning_rate": 8.596337828246985e-05, + "loss": 0.0062147751450538635, + "step": 98930 + }, + { + "epoch": 14.044002838892832, + "grad_norm": 8.854484558105469, + "learning_rate": 8.596195883605394e-05, + "loss": 0.021570898592472076, + "step": 98940 + }, + { + "epoch": 14.04542228530873, + "grad_norm": 3.4759981632232666, + "learning_rate": 8.596053938963805e-05, + "loss": 0.07649534344673156, + "step": 98950 + }, + { + "epoch": 14.046841731724628, + "grad_norm": 1.6767079830169678, + "learning_rate": 8.595911994322214e-05, + "loss": 0.03111596405506134, + "step": 98960 + }, + { + "epoch": 14.048261178140525, + "grad_norm": 0.6731406450271606, + "learning_rate": 8.595770049680625e-05, + "loss": 0.006647625565528869, + "step": 98970 + }, + { + "epoch": 14.049680624556423, + "grad_norm": 1.234702229499817, + "learning_rate": 8.595628105039035e-05, + "loss": 0.09047362208366394, + "step": 98980 + }, + { + "epoch": 14.051100070972321, + "grad_norm": 2.347928524017334, + "learning_rate": 8.595486160397446e-05, + "loss": 0.02055363208055496, + "step": 98990 + }, + { + "epoch": 14.052519517388218, + "grad_norm": 0.2850717604160309, + "learning_rate": 8.595344215755857e-05, + "loss": 0.008768963813781738, + "step": 99000 + }, + { + "epoch": 14.052519517388218, + "eval_accuracy": 0.9847396197621924, + "eval_loss": 0.054401837289333344, + "eval_runtime": 30.7732, + "eval_samples_per_second": 511.061, + "eval_steps_per_second": 15.988, + "step": 99000 + }, + { + "epoch": 14.053938963804116, + "grad_norm": 8.854166984558105, + "learning_rate": 8.595202271114265e-05, + "loss": 0.01207282543182373, + "step": 99010 + }, + { + "epoch": 14.055358410220014, + "grad_norm": 0.08972510695457458, + "learning_rate": 8.595060326472676e-05, + "loss": 0.026438263058662415, + "step": 99020 + }, + { + "epoch": 14.056777856635913, + "grad_norm": 0.09185374528169632, + "learning_rate": 8.594918381831086e-05, + "loss": 0.006578221917152405, + "step": 99030 + }, + { + 
"epoch": 14.05819730305181, + "grad_norm": 0.0555814653635025, + "learning_rate": 8.594776437189497e-05, + "loss": 0.021403425931930543, + "step": 99040 + }, + { + "epoch": 14.059616749467708, + "grad_norm": 1.687256932258606, + "learning_rate": 8.594634492547907e-05, + "loss": 0.008669185638427734, + "step": 99050 + }, + { + "epoch": 14.061036195883606, + "grad_norm": 0.04107533022761345, + "learning_rate": 8.594492547906317e-05, + "loss": 0.06898521780967712, + "step": 99060 + }, + { + "epoch": 14.062455642299502, + "grad_norm": 2.3766849040985107, + "learning_rate": 8.594350603264726e-05, + "loss": 0.03251543939113617, + "step": 99070 + }, + { + "epoch": 14.0638750887154, + "grad_norm": 0.07620040327310562, + "learning_rate": 8.594208658623137e-05, + "loss": 0.027515420317649843, + "step": 99080 + }, + { + "epoch": 14.065294535131299, + "grad_norm": 0.17723844945430756, + "learning_rate": 8.594066713981549e-05, + "loss": 0.0391448974609375, + "step": 99090 + }, + { + "epoch": 14.066713981547197, + "grad_norm": 0.40072786808013916, + "learning_rate": 8.593924769339958e-05, + "loss": 0.016369281709194182, + "step": 99100 + }, + { + "epoch": 14.068133427963094, + "grad_norm": 20.97338104248047, + "learning_rate": 8.593782824698368e-05, + "loss": 0.03046499490737915, + "step": 99110 + }, + { + "epoch": 14.069552874378992, + "grad_norm": 0.13278134167194366, + "learning_rate": 8.593640880056778e-05, + "loss": 0.0066719576716423035, + "step": 99120 + }, + { + "epoch": 14.07097232079489, + "grad_norm": 1.1554104089736938, + "learning_rate": 8.593498935415189e-05, + "loss": 0.028399959206581116, + "step": 99130 + }, + { + "epoch": 14.072391767210787, + "grad_norm": 10.434669494628906, + "learning_rate": 8.593356990773599e-05, + "loss": 0.01027916967868805, + "step": 99140 + }, + { + "epoch": 14.073811213626685, + "grad_norm": 0.4504542946815491, + "learning_rate": 8.59321504613201e-05, + "loss": 0.00998721569776535, + "step": 99150 + }, + { + "epoch": 
14.075230660042584, + "grad_norm": 2.134850263595581, + "learning_rate": 8.593073101490418e-05, + "loss": 0.044223248958587646, + "step": 99160 + }, + { + "epoch": 14.076650106458482, + "grad_norm": 0.6842259168624878, + "learning_rate": 8.592931156848829e-05, + "loss": 0.022297856211662293, + "step": 99170 + }, + { + "epoch": 14.078069552874378, + "grad_norm": 0.15943333506584167, + "learning_rate": 8.59278921220724e-05, + "loss": 0.0330827534198761, + "step": 99180 + }, + { + "epoch": 14.079488999290277, + "grad_norm": 0.26099467277526855, + "learning_rate": 8.59264726756565e-05, + "loss": 0.015528751909732819, + "step": 99190 + }, + { + "epoch": 14.080908445706175, + "grad_norm": 7.665553092956543, + "learning_rate": 8.592505322924061e-05, + "loss": 0.01843828409910202, + "step": 99200 + }, + { + "epoch": 14.082327892122072, + "grad_norm": 0.1282263994216919, + "learning_rate": 8.59236337828247e-05, + "loss": 0.005658290535211563, + "step": 99210 + }, + { + "epoch": 14.08374733853797, + "grad_norm": 3.389772653579712, + "learning_rate": 8.59222143364088e-05, + "loss": 0.009044589102268219, + "step": 99220 + }, + { + "epoch": 14.085166784953868, + "grad_norm": 1.2471179962158203, + "learning_rate": 8.59207948899929e-05, + "loss": 0.03725117146968841, + "step": 99230 + }, + { + "epoch": 14.086586231369767, + "grad_norm": 0.07664031535387039, + "learning_rate": 8.591937544357701e-05, + "loss": 0.014812260866165161, + "step": 99240 + }, + { + "epoch": 14.088005677785663, + "grad_norm": 0.05161131173372269, + "learning_rate": 8.591795599716111e-05, + "loss": 0.022272004187107085, + "step": 99250 + }, + { + "epoch": 14.089425124201561, + "grad_norm": 0.9902065992355347, + "learning_rate": 8.591653655074521e-05, + "loss": 0.03187122941017151, + "step": 99260 + }, + { + "epoch": 14.09084457061746, + "grad_norm": 6.050073623657227, + "learning_rate": 8.591511710432932e-05, + "loss": 0.011229197680950164, + "step": 99270 + }, + { + "epoch": 14.092264017033356, + 
"grad_norm": 2.0064609050750732, + "learning_rate": 8.591369765791342e-05, + "loss": 0.007860420644283295, + "step": 99280 + }, + { + "epoch": 14.093683463449254, + "grad_norm": 0.3100337088108063, + "learning_rate": 8.591227821149753e-05, + "loss": 0.008608365058898925, + "step": 99290 + }, + { + "epoch": 14.095102909865153, + "grad_norm": 0.5801976919174194, + "learning_rate": 8.591085876508162e-05, + "loss": 0.044814455509185794, + "step": 99300 + }, + { + "epoch": 14.096522356281051, + "grad_norm": 0.1426689773797989, + "learning_rate": 8.590943931866574e-05, + "loss": 0.013057531416416168, + "step": 99310 + }, + { + "epoch": 14.097941802696948, + "grad_norm": 0.8306183218955994, + "learning_rate": 8.590801987224982e-05, + "loss": 0.02446906715631485, + "step": 99320 + }, + { + "epoch": 14.099361249112846, + "grad_norm": 0.06384455412626266, + "learning_rate": 8.590660042583393e-05, + "loss": 0.012030948698520661, + "step": 99330 + }, + { + "epoch": 14.100780695528744, + "grad_norm": 1.4948430061340332, + "learning_rate": 8.590518097941803e-05, + "loss": 0.03669027090072632, + "step": 99340 + }, + { + "epoch": 14.10220014194464, + "grad_norm": 2.81807279586792, + "learning_rate": 8.590376153300214e-05, + "loss": 0.0058967161923646925, + "step": 99350 + }, + { + "epoch": 14.103619588360539, + "grad_norm": 0.1990390419960022, + "learning_rate": 8.590234208658624e-05, + "loss": 0.01164540946483612, + "step": 99360 + }, + { + "epoch": 14.105039034776437, + "grad_norm": 0.062245313078165054, + "learning_rate": 8.590092264017033e-05, + "loss": 0.006242537871003151, + "step": 99370 + }, + { + "epoch": 14.106458481192336, + "grad_norm": 6.959029197692871, + "learning_rate": 8.589950319375444e-05, + "loss": 0.03449139297008515, + "step": 99380 + }, + { + "epoch": 14.107877927608232, + "grad_norm": 0.16301681101322174, + "learning_rate": 8.589808374733854e-05, + "loss": 0.022822481393814088, + "step": 99390 + }, + { + "epoch": 14.10929737402413, + "grad_norm": 
0.8691009283065796, + "learning_rate": 8.589666430092265e-05, + "loss": 0.01241927444934845, + "step": 99400 + }, + { + "epoch": 14.110716820440029, + "grad_norm": 2.5208559036254883, + "learning_rate": 8.589524485450675e-05, + "loss": 0.005818301066756249, + "step": 99410 + }, + { + "epoch": 14.112136266855925, + "grad_norm": 0.32502809166908264, + "learning_rate": 8.589382540809085e-05, + "loss": 0.006208596378564834, + "step": 99420 + }, + { + "epoch": 14.113555713271824, + "grad_norm": 0.0438520573079586, + "learning_rate": 8.589254790631655e-05, + "loss": 0.034111449122428895, + "step": 99430 + }, + { + "epoch": 14.114975159687722, + "grad_norm": 6.138270854949951, + "learning_rate": 8.589112845990064e-05, + "loss": 0.0335618793964386, + "step": 99440 + }, + { + "epoch": 14.11639460610362, + "grad_norm": 0.47917288541793823, + "learning_rate": 8.588970901348474e-05, + "loss": 0.006820973753929138, + "step": 99450 + }, + { + "epoch": 14.117814052519517, + "grad_norm": 0.24871699512004852, + "learning_rate": 8.588828956706885e-05, + "loss": 0.010762445628643036, + "step": 99460 + }, + { + "epoch": 14.119233498935415, + "grad_norm": 0.26667341589927673, + "learning_rate": 8.588687012065295e-05, + "loss": 0.031206348538398744, + "step": 99470 + }, + { + "epoch": 14.120652945351313, + "grad_norm": 4.056185245513916, + "learning_rate": 8.588545067423706e-05, + "loss": 0.018537884950637816, + "step": 99480 + }, + { + "epoch": 14.12207239176721, + "grad_norm": 0.356323778629303, + "learning_rate": 8.588403122782114e-05, + "loss": 0.025170964002609254, + "step": 99490 + }, + { + "epoch": 14.123491838183108, + "grad_norm": 0.5766083002090454, + "learning_rate": 8.588261178140525e-05, + "loss": 0.07454286813735962, + "step": 99500 + }, + { + "epoch": 14.123491838183108, + "eval_accuracy": 0.9858205633623705, + "eval_loss": 0.05155513063073158, + "eval_runtime": 31.5063, + "eval_samples_per_second": 499.169, + "eval_steps_per_second": 15.616, + "step": 99500 + }, + { + 
"epoch": 14.124911284599007, + "grad_norm": 0.03237629681825638, + "learning_rate": 8.588119233498935e-05, + "loss": 0.009366624057292938, + "step": 99510 + }, + { + "epoch": 14.126330731014905, + "grad_norm": 0.24052318930625916, + "learning_rate": 8.587977288857346e-05, + "loss": 0.00688902735710144, + "step": 99520 + }, + { + "epoch": 14.127750177430801, + "grad_norm": 3.427706480026245, + "learning_rate": 8.587835344215756e-05, + "loss": 0.010054501891136169, + "step": 99530 + }, + { + "epoch": 14.1291696238467, + "grad_norm": 0.6839547157287598, + "learning_rate": 8.587693399574166e-05, + "loss": 0.005801101773977279, + "step": 99540 + }, + { + "epoch": 14.130589070262598, + "grad_norm": 3.000164270401001, + "learning_rate": 8.587551454932577e-05, + "loss": 0.0038481026887893675, + "step": 99550 + }, + { + "epoch": 14.132008516678495, + "grad_norm": 1.2292991876602173, + "learning_rate": 8.587409510290987e-05, + "loss": 0.052852863073349, + "step": 99560 + }, + { + "epoch": 14.133427963094393, + "grad_norm": 0.055449653416872025, + "learning_rate": 8.587267565649398e-05, + "loss": 0.003483012318611145, + "step": 99570 + }, + { + "epoch": 14.134847409510291, + "grad_norm": 0.10466715693473816, + "learning_rate": 8.587125621007807e-05, + "loss": 0.03381909728050232, + "step": 99580 + }, + { + "epoch": 14.13626685592619, + "grad_norm": 0.04983215406537056, + "learning_rate": 8.586983676366217e-05, + "loss": 0.013138096034526824, + "step": 99590 + }, + { + "epoch": 14.137686302342086, + "grad_norm": 0.3967151939868927, + "learning_rate": 8.586841731724627e-05, + "loss": 0.01961733549833298, + "step": 99600 + }, + { + "epoch": 14.139105748757984, + "grad_norm": 0.10708726942539215, + "learning_rate": 8.586699787083038e-05, + "loss": 0.03929960429668426, + "step": 99610 + }, + { + "epoch": 14.140525195173883, + "grad_norm": 0.630052387714386, + "learning_rate": 8.586557842441448e-05, + "loss": 0.029604125022888183, + "step": 99620 + }, + { + "epoch": 
14.14194464158978, + "grad_norm": 0.006481459829956293, + "learning_rate": 8.586415897799859e-05, + "loss": 0.044744950532913205, + "step": 99630 + }, + { + "epoch": 14.143364088005677, + "grad_norm": 0.10336804389953613, + "learning_rate": 8.586273953158269e-05, + "loss": 0.022768531739711762, + "step": 99640 + }, + { + "epoch": 14.144783534421576, + "grad_norm": 4.770381450653076, + "learning_rate": 8.586132008516678e-05, + "loss": 0.015540549159049987, + "step": 99650 + }, + { + "epoch": 14.146202980837474, + "grad_norm": 0.225839301943779, + "learning_rate": 8.58599006387509e-05, + "loss": 0.007052184641361236, + "step": 99660 + }, + { + "epoch": 14.14762242725337, + "grad_norm": 13.46019172668457, + "learning_rate": 8.585848119233499e-05, + "loss": 0.031240320205688475, + "step": 99670 + }, + { + "epoch": 14.149041873669269, + "grad_norm": 0.5248197317123413, + "learning_rate": 8.58570617459191e-05, + "loss": 0.06163159608840942, + "step": 99680 + }, + { + "epoch": 14.150461320085167, + "grad_norm": 0.07162128388881683, + "learning_rate": 8.58556422995032e-05, + "loss": 0.027028301358222963, + "step": 99690 + }, + { + "epoch": 14.151880766501064, + "grad_norm": 0.01644066721200943, + "learning_rate": 8.58542228530873e-05, + "loss": 0.01212691217660904, + "step": 99700 + }, + { + "epoch": 14.153300212916962, + "grad_norm": 0.15020127594470978, + "learning_rate": 8.58528034066714e-05, + "loss": 0.009765591472387314, + "step": 99710 + }, + { + "epoch": 14.15471965933286, + "grad_norm": 2.6016457080841064, + "learning_rate": 8.58513839602555e-05, + "loss": 0.010732583701610565, + "step": 99720 + }, + { + "epoch": 14.156139105748759, + "grad_norm": 0.014076477847993374, + "learning_rate": 8.58499645138396e-05, + "loss": 0.013157817721366882, + "step": 99730 + }, + { + "epoch": 14.157558552164655, + "grad_norm": 0.020325670018792152, + "learning_rate": 8.584854506742371e-05, + "loss": 0.007947267591953277, + "step": 99740 + }, + { + "epoch": 14.158977998580554, + 
"grad_norm": 4.29239559173584, + "learning_rate": 8.584712562100781e-05, + "loss": 0.04670752882957459, + "step": 99750 + }, + { + "epoch": 14.160397444996452, + "grad_norm": 0.648195743560791, + "learning_rate": 8.584570617459191e-05, + "loss": 0.03870112299919128, + "step": 99760 + }, + { + "epoch": 14.161816891412348, + "grad_norm": 6.478837966918945, + "learning_rate": 8.584428672817602e-05, + "loss": 0.029291576147079466, + "step": 99770 + }, + { + "epoch": 14.163236337828247, + "grad_norm": 0.534260630607605, + "learning_rate": 8.584286728176012e-05, + "loss": 0.010391275584697723, + "step": 99780 + }, + { + "epoch": 14.164655784244145, + "grad_norm": 0.26018503308296204, + "learning_rate": 8.584144783534423e-05, + "loss": 0.00734531432390213, + "step": 99790 + }, + { + "epoch": 14.166075230660043, + "grad_norm": 0.3219590187072754, + "learning_rate": 8.584002838892831e-05, + "loss": 0.004718105867505073, + "step": 99800 + }, + { + "epoch": 14.16749467707594, + "grad_norm": 0.05225837603211403, + "learning_rate": 8.583860894251242e-05, + "loss": 0.005737151578068733, + "step": 99810 + }, + { + "epoch": 14.168914123491838, + "grad_norm": 2.183525562286377, + "learning_rate": 8.583718949609652e-05, + "loss": 0.01032693013548851, + "step": 99820 + }, + { + "epoch": 14.170333569907736, + "grad_norm": 0.5399150252342224, + "learning_rate": 8.583577004968063e-05, + "loss": 0.006520085036754608, + "step": 99830 + }, + { + "epoch": 14.171753016323633, + "grad_norm": 0.18672649562358856, + "learning_rate": 8.583435060326473e-05, + "loss": 0.005176125466823578, + "step": 99840 + }, + { + "epoch": 14.173172462739531, + "grad_norm": 0.05119100213050842, + "learning_rate": 8.583293115684883e-05, + "loss": 0.023728413879871367, + "step": 99850 + }, + { + "epoch": 14.17459190915543, + "grad_norm": 0.018034903332591057, + "learning_rate": 8.583151171043294e-05, + "loss": 0.016659101843833922, + "step": 99860 + }, + { + "epoch": 14.176011355571328, + "grad_norm": 
2.1384027004241943, + "learning_rate": 8.583009226401703e-05, + "loss": 0.037726446986198425, + "step": 99870 + }, + { + "epoch": 14.177430801987224, + "grad_norm": 2.5266449451446533, + "learning_rate": 8.582867281760114e-05, + "loss": 0.025884675979614257, + "step": 99880 + }, + { + "epoch": 14.178850248403123, + "grad_norm": 0.28664296865463257, + "learning_rate": 8.582725337118524e-05, + "loss": 0.026016849279403686, + "step": 99890 + }, + { + "epoch": 14.180269694819021, + "grad_norm": 0.27895665168762207, + "learning_rate": 8.582583392476934e-05, + "loss": 0.0053959134966135025, + "step": 99900 + }, + { + "epoch": 14.181689141234918, + "grad_norm": 0.005411416757851839, + "learning_rate": 8.582441447835344e-05, + "loss": 0.010221479833126068, + "step": 99910 + }, + { + "epoch": 14.183108587650816, + "grad_norm": 0.020072845742106438, + "learning_rate": 8.582299503193755e-05, + "loss": 0.020439541339874266, + "step": 99920 + }, + { + "epoch": 14.184528034066714, + "grad_norm": 0.9487782716751099, + "learning_rate": 8.582157558552165e-05, + "loss": 0.03342333137989044, + "step": 99930 + }, + { + "epoch": 14.185947480482612, + "grad_norm": 3.2585058212280273, + "learning_rate": 8.582015613910576e-05, + "loss": 0.009980223327875137, + "step": 99940 + }, + { + "epoch": 14.187366926898509, + "grad_norm": 2.015963315963745, + "learning_rate": 8.581873669268987e-05, + "loss": 0.039945772290229796, + "step": 99950 + }, + { + "epoch": 14.188786373314407, + "grad_norm": 4.911747932434082, + "learning_rate": 8.581731724627395e-05, + "loss": 0.04019148051738739, + "step": 99960 + }, + { + "epoch": 14.190205819730306, + "grad_norm": 1.3588132858276367, + "learning_rate": 8.581589779985806e-05, + "loss": 0.03294050395488739, + "step": 99970 + }, + { + "epoch": 14.191625266146202, + "grad_norm": 8.521455764770508, + "learning_rate": 8.581447835344216e-05, + "loss": 0.026378729939460756, + "step": 99980 + }, + { + "epoch": 14.1930447125621, + "grad_norm": 1.8677093982696533, 
+ "learning_rate": 8.581305890702627e-05, + "loss": 0.01372207999229431, + "step": 99990 + }, + { + "epoch": 14.194464158977999, + "grad_norm": 0.10951168835163116, + "learning_rate": 8.581163946061037e-05, + "loss": 0.012156614661216735, + "step": 100000 + }, + { + "epoch": 14.194464158977999, + "eval_accuracy": 0.9814967889616583, + "eval_loss": 0.07002508640289307, + "eval_runtime": 30.1341, + "eval_samples_per_second": 521.9, + "eval_steps_per_second": 16.327, + "step": 100000 + }, + { + "epoch": 14.195883605393897, + "grad_norm": 4.687601566314697, + "learning_rate": 8.581022001419446e-05, + "loss": 0.08939755558967591, + "step": 100010 + }, + { + "epoch": 14.197303051809794, + "grad_norm": 14.153443336486816, + "learning_rate": 8.580880056777856e-05, + "loss": 0.0787558138370514, + "step": 100020 + }, + { + "epoch": 14.198722498225692, + "grad_norm": 0.00876574032008648, + "learning_rate": 8.580738112136267e-05, + "loss": 0.020130300521850587, + "step": 100030 + }, + { + "epoch": 14.20014194464159, + "grad_norm": 13.658935546875, + "learning_rate": 8.580596167494678e-05, + "loss": 0.01373588889837265, + "step": 100040 + }, + { + "epoch": 14.201561391057487, + "grad_norm": 0.07329968363046646, + "learning_rate": 8.580454222853088e-05, + "loss": 0.0472787082195282, + "step": 100050 + }, + { + "epoch": 14.202980837473385, + "grad_norm": 4.477980613708496, + "learning_rate": 8.580312278211498e-05, + "loss": 0.05152733325958252, + "step": 100060 + }, + { + "epoch": 14.204400283889283, + "grad_norm": 0.26643145084381104, + "learning_rate": 8.580170333569908e-05, + "loss": 0.06452604532241821, + "step": 100070 + }, + { + "epoch": 14.205819730305182, + "grad_norm": 0.09822005778551102, + "learning_rate": 8.580028388928319e-05, + "loss": 0.017590297758579253, + "step": 100080 + }, + { + "epoch": 14.207239176721078, + "grad_norm": 0.04513048753142357, + "learning_rate": 8.579886444286728e-05, + "loss": 0.04057992696762085, + "step": 100090 + }, + { + "epoch": 
14.208658623136976, + "grad_norm": 1.4454002380371094, + "learning_rate": 8.57974449964514e-05, + "loss": 0.01286405324935913, + "step": 100100 + }, + { + "epoch": 14.210078069552875, + "grad_norm": 0.030584536492824554, + "learning_rate": 8.579602555003548e-05, + "loss": 0.02225019484758377, + "step": 100110 + }, + { + "epoch": 14.211497515968771, + "grad_norm": 4.225281715393066, + "learning_rate": 8.579460610361959e-05, + "loss": 0.008702501654624939, + "step": 100120 + }, + { + "epoch": 14.21291696238467, + "grad_norm": 0.37821707129478455, + "learning_rate": 8.57931866572037e-05, + "loss": 0.024403400719165802, + "step": 100130 + }, + { + "epoch": 14.214336408800568, + "grad_norm": 13.270866394042969, + "learning_rate": 8.57917672107878e-05, + "loss": 0.035308724641799925, + "step": 100140 + }, + { + "epoch": 14.215755855216466, + "grad_norm": 0.15924280881881714, + "learning_rate": 8.579034776437191e-05, + "loss": 0.00895916372537613, + "step": 100150 + }, + { + "epoch": 14.217175301632363, + "grad_norm": 0.23295794427394867, + "learning_rate": 8.578892831795599e-05, + "loss": 0.04624794721603394, + "step": 100160 + }, + { + "epoch": 14.218594748048261, + "grad_norm": 0.36088430881500244, + "learning_rate": 8.57875088715401e-05, + "loss": 0.011353875696659087, + "step": 100170 + }, + { + "epoch": 14.22001419446416, + "grad_norm": 0.03815864771604538, + "learning_rate": 8.57860894251242e-05, + "loss": 0.05593507289886475, + "step": 100180 + }, + { + "epoch": 14.221433640880056, + "grad_norm": 0.9573842287063599, + "learning_rate": 8.578466997870831e-05, + "loss": 0.041238024830818176, + "step": 100190 + }, + { + "epoch": 14.222853087295954, + "grad_norm": 2.3269762992858887, + "learning_rate": 8.578325053229241e-05, + "loss": 0.04610580801963806, + "step": 100200 + }, + { + "epoch": 14.224272533711853, + "grad_norm": 3.7672483921051025, + "learning_rate": 8.578183108587651e-05, + "loss": 0.01693085432052612, + "step": 100210 + }, + { + "epoch": 
14.22569198012775, + "grad_norm": 16.042234420776367, + "learning_rate": 8.578041163946062e-05, + "loss": 0.04853768944740296, + "step": 100220 + }, + { + "epoch": 14.227111426543647, + "grad_norm": 9.958683967590332, + "learning_rate": 8.577899219304472e-05, + "loss": 0.012343405932188033, + "step": 100230 + }, + { + "epoch": 14.228530872959546, + "grad_norm": 0.010709248483181, + "learning_rate": 8.577757274662883e-05, + "loss": 0.028152650594711302, + "step": 100240 + }, + { + "epoch": 14.229950319375444, + "grad_norm": 6.041823863983154, + "learning_rate": 8.577615330021292e-05, + "loss": 0.0360468327999115, + "step": 100250 + }, + { + "epoch": 14.231369765791342, + "grad_norm": 1.348669409751892, + "learning_rate": 8.577473385379702e-05, + "loss": 0.011957320570945739, + "step": 100260 + }, + { + "epoch": 14.232789212207239, + "grad_norm": 2.1172597408294678, + "learning_rate": 8.577331440738112e-05, + "loss": 0.010528762638568879, + "step": 100270 + }, + { + "epoch": 14.234208658623137, + "grad_norm": 3.0857114791870117, + "learning_rate": 8.577189496096523e-05, + "loss": 0.032012763619422915, + "step": 100280 + }, + { + "epoch": 14.235628105039035, + "grad_norm": 0.1845846027135849, + "learning_rate": 8.577047551454933e-05, + "loss": 0.04174632728099823, + "step": 100290 + }, + { + "epoch": 14.237047551454932, + "grad_norm": 1.4138368368148804, + "learning_rate": 8.576905606813344e-05, + "loss": 0.00865376740694046, + "step": 100300 + }, + { + "epoch": 14.23846699787083, + "grad_norm": 0.2461751401424408, + "learning_rate": 8.576763662171754e-05, + "loss": 0.005132092162966728, + "step": 100310 + }, + { + "epoch": 14.239886444286729, + "grad_norm": 1.0818413496017456, + "learning_rate": 8.576621717530163e-05, + "loss": 0.027364933490753175, + "step": 100320 + }, + { + "epoch": 14.241305890702627, + "grad_norm": 0.22002390027046204, + "learning_rate": 8.576479772888574e-05, + "loss": 0.026398837566375732, + "step": 100330 + }, + { + "epoch": 
14.242725337118523, + "grad_norm": 1.0248297452926636, + "learning_rate": 8.576337828246984e-05, + "loss": 0.005221531540155411, + "step": 100340 + }, + { + "epoch": 14.244144783534422, + "grad_norm": 2.140570878982544, + "learning_rate": 8.576195883605395e-05, + "loss": 0.041142240166664124, + "step": 100350 + }, + { + "epoch": 14.24556422995032, + "grad_norm": 0.22018738090991974, + "learning_rate": 8.576053938963805e-05, + "loss": 0.018037812411785127, + "step": 100360 + }, + { + "epoch": 14.246983676366217, + "grad_norm": 0.2746393084526062, + "learning_rate": 8.575911994322215e-05, + "loss": 0.016584034264087676, + "step": 100370 + }, + { + "epoch": 14.248403122782115, + "grad_norm": 1.3459296226501465, + "learning_rate": 8.575770049680624e-05, + "loss": 0.010618263483047485, + "step": 100380 + }, + { + "epoch": 14.249822569198013, + "grad_norm": 0.019148923456668854, + "learning_rate": 8.575628105039035e-05, + "loss": 0.06375048160552979, + "step": 100390 + }, + { + "epoch": 14.251242015613911, + "grad_norm": 0.5293266177177429, + "learning_rate": 8.575486160397445e-05, + "loss": 0.03687954843044281, + "step": 100400 + }, + { + "epoch": 14.252661462029808, + "grad_norm": 1.7792558670043945, + "learning_rate": 8.575344215755856e-05, + "loss": 0.011453892290592193, + "step": 100410 + }, + { + "epoch": 14.254080908445706, + "grad_norm": 0.006813056766986847, + "learning_rate": 8.575202271114266e-05, + "loss": 0.005444584414362907, + "step": 100420 + }, + { + "epoch": 14.255500354861605, + "grad_norm": 2.6219913959503174, + "learning_rate": 8.575060326472676e-05, + "loss": 0.021393463015556335, + "step": 100430 + }, + { + "epoch": 14.256919801277501, + "grad_norm": 6.097723007202148, + "learning_rate": 8.574918381831087e-05, + "loss": 0.027876609563827516, + "step": 100440 + }, + { + "epoch": 14.2583392476934, + "grad_norm": 0.08926290273666382, + "learning_rate": 8.574776437189497e-05, + "loss": 0.004753031581640243, + "step": 100450 + }, + { + "epoch": 
14.259758694109298, + "grad_norm": 0.09094270318746567, + "learning_rate": 8.574634492547908e-05, + "loss": 0.013828447461128235, + "step": 100460 + }, + { + "epoch": 14.261178140525196, + "grad_norm": 6.245047569274902, + "learning_rate": 8.574492547906316e-05, + "loss": 0.04343651533126831, + "step": 100470 + }, + { + "epoch": 14.262597586941093, + "grad_norm": 0.009788050316274166, + "learning_rate": 8.574350603264727e-05, + "loss": 0.0223904013633728, + "step": 100480 + }, + { + "epoch": 14.264017033356991, + "grad_norm": 0.4701679050922394, + "learning_rate": 8.574208658623137e-05, + "loss": 0.008665598928928375, + "step": 100490 + }, + { + "epoch": 14.26543647977289, + "grad_norm": 1.7598905563354492, + "learning_rate": 8.574066713981548e-05, + "loss": 0.018955098092556, + "step": 100500 + }, + { + "epoch": 14.26543647977289, + "eval_accuracy": 0.986837922044891, + "eval_loss": 0.04708363860845566, + "eval_runtime": 30.9519, + "eval_samples_per_second": 508.111, + "eval_steps_per_second": 15.896, + "step": 100500 + }, + { + "epoch": 14.266855926188786, + "grad_norm": 0.8167769908905029, + "learning_rate": 8.573924769339958e-05, + "loss": 0.006450720131397247, + "step": 100510 + }, + { + "epoch": 14.268275372604684, + "grad_norm": 6.334622859954834, + "learning_rate": 8.573782824698367e-05, + "loss": 0.01875331699848175, + "step": 100520 + }, + { + "epoch": 14.269694819020582, + "grad_norm": 0.37761762738227844, + "learning_rate": 8.573640880056779e-05, + "loss": 0.035563099384307864, + "step": 100530 + }, + { + "epoch": 14.27111426543648, + "grad_norm": 7.297554016113281, + "learning_rate": 8.573498935415188e-05, + "loss": 0.027794861793518068, + "step": 100540 + }, + { + "epoch": 14.272533711852377, + "grad_norm": 0.4307068884372711, + "learning_rate": 8.5733569907736e-05, + "loss": 0.012988004088401794, + "step": 100550 + }, + { + "epoch": 14.273953158268275, + "grad_norm": 1.8834869861602783, + "learning_rate": 8.573215046132009e-05, + "loss": 
0.01791456192731857, + "step": 100560 + }, + { + "epoch": 14.275372604684174, + "grad_norm": 0.041106026619672775, + "learning_rate": 8.573073101490419e-05, + "loss": 0.003982153534889221, + "step": 100570 + }, + { + "epoch": 14.27679205110007, + "grad_norm": 3.069298267364502, + "learning_rate": 8.572931156848829e-05, + "loss": 0.014299359917640687, + "step": 100580 + }, + { + "epoch": 14.278211497515969, + "grad_norm": 0.061462994664907455, + "learning_rate": 8.57278921220724e-05, + "loss": 0.056102311611175536, + "step": 100590 + }, + { + "epoch": 14.279630943931867, + "grad_norm": 9.913806915283203, + "learning_rate": 8.57264726756565e-05, + "loss": 0.035783016681671144, + "step": 100600 + }, + { + "epoch": 14.281050390347765, + "grad_norm": 0.4509928524494171, + "learning_rate": 8.57250532292406e-05, + "loss": 0.007971011102199554, + "step": 100610 + }, + { + "epoch": 14.282469836763662, + "grad_norm": 0.8236384987831116, + "learning_rate": 8.57236337828247e-05, + "loss": 0.013647985458374024, + "step": 100620 + }, + { + "epoch": 14.28388928317956, + "grad_norm": 1.0165517330169678, + "learning_rate": 8.57222143364088e-05, + "loss": 0.012246866524219514, + "step": 100630 + }, + { + "epoch": 14.285308729595458, + "grad_norm": 4.174709320068359, + "learning_rate": 8.572079488999291e-05, + "loss": 0.0925281286239624, + "step": 100640 + }, + { + "epoch": 14.286728176011355, + "grad_norm": 0.11360863596200943, + "learning_rate": 8.571937544357701e-05, + "loss": 0.004844916984438896, + "step": 100650 + }, + { + "epoch": 14.288147622427253, + "grad_norm": 16.6639347076416, + "learning_rate": 8.571795599716112e-05, + "loss": 0.037261354923248294, + "step": 100660 + }, + { + "epoch": 14.289567068843152, + "grad_norm": 0.04352438077330589, + "learning_rate": 8.571653655074522e-05, + "loss": 0.015262427926063537, + "step": 100670 + }, + { + "epoch": 14.29098651525905, + "grad_norm": 1.37468421459198, + "learning_rate": 8.571511710432931e-05, + "loss": 
0.017841906845569612, + "step": 100680 + }, + { + "epoch": 14.292405961674946, + "grad_norm": 0.07316924631595612, + "learning_rate": 8.571369765791341e-05, + "loss": 0.02864176332950592, + "step": 100690 + }, + { + "epoch": 14.293825408090845, + "grad_norm": 1.4973715543746948, + "learning_rate": 8.571227821149752e-05, + "loss": 0.029614627361297607, + "step": 100700 + }, + { + "epoch": 14.295244854506743, + "grad_norm": 0.6571174263954163, + "learning_rate": 8.571085876508162e-05, + "loss": 0.00811554342508316, + "step": 100710 + }, + { + "epoch": 14.29666430092264, + "grad_norm": 0.6326042413711548, + "learning_rate": 8.570943931866573e-05, + "loss": 0.012401780486106873, + "step": 100720 + }, + { + "epoch": 14.298083747338538, + "grad_norm": 10.682669639587402, + "learning_rate": 8.570801987224983e-05, + "loss": 0.04365946650505066, + "step": 100730 + }, + { + "epoch": 14.299503193754436, + "grad_norm": 0.2900744676589966, + "learning_rate": 8.570660042583393e-05, + "loss": 0.022507643699645995, + "step": 100740 + }, + { + "epoch": 14.300922640170334, + "grad_norm": 1.0781408548355103, + "learning_rate": 8.570518097941804e-05, + "loss": 0.036583822965621945, + "step": 100750 + }, + { + "epoch": 14.302342086586231, + "grad_norm": 3.8918097019195557, + "learning_rate": 8.570376153300213e-05, + "loss": 0.048116791248321536, + "step": 100760 + }, + { + "epoch": 14.30376153300213, + "grad_norm": 12.145566940307617, + "learning_rate": 8.570234208658624e-05, + "loss": 0.037739336490631104, + "step": 100770 + }, + { + "epoch": 14.305180979418028, + "grad_norm": 5.226789951324463, + "learning_rate": 8.570092264017033e-05, + "loss": 0.011161120235919952, + "step": 100780 + }, + { + "epoch": 14.306600425833924, + "grad_norm": 7.265553951263428, + "learning_rate": 8.569950319375444e-05, + "loss": 0.039224272966384886, + "step": 100790 + }, + { + "epoch": 14.308019872249822, + "grad_norm": 0.03872114047408104, + "learning_rate": 8.569808374733854e-05, + "loss": 
0.03458241820335388, + "step": 100800 + }, + { + "epoch": 14.30943931866572, + "grad_norm": 0.05727313831448555, + "learning_rate": 8.569666430092265e-05, + "loss": 0.060028254985809326, + "step": 100810 + }, + { + "epoch": 14.310858765081619, + "grad_norm": 5.03977632522583, + "learning_rate": 8.569524485450675e-05, + "loss": 0.018303254246711732, + "step": 100820 + }, + { + "epoch": 14.312278211497516, + "grad_norm": 0.09965986013412476, + "learning_rate": 8.569382540809084e-05, + "loss": 0.059241455793380735, + "step": 100830 + }, + { + "epoch": 14.313697657913414, + "grad_norm": 0.1746777594089508, + "learning_rate": 8.569240596167495e-05, + "loss": 0.025620871782302858, + "step": 100840 + }, + { + "epoch": 14.315117104329312, + "grad_norm": 0.008593921549618244, + "learning_rate": 8.569098651525905e-05, + "loss": 0.01247217133641243, + "step": 100850 + }, + { + "epoch": 14.316536550745209, + "grad_norm": 1.2999850511550903, + "learning_rate": 8.568956706884316e-05, + "loss": 0.02095559537410736, + "step": 100860 + }, + { + "epoch": 14.317955997161107, + "grad_norm": 0.6758670806884766, + "learning_rate": 8.568814762242726e-05, + "loss": 0.02123214602470398, + "step": 100870 + }, + { + "epoch": 14.319375443577005, + "grad_norm": 12.219331741333008, + "learning_rate": 8.568672817601136e-05, + "loss": 0.010732834041118623, + "step": 100880 + }, + { + "epoch": 14.320794889992904, + "grad_norm": 2.747171640396118, + "learning_rate": 8.568530872959545e-05, + "loss": 0.008057169616222382, + "step": 100890 + }, + { + "epoch": 14.3222143364088, + "grad_norm": 13.715052604675293, + "learning_rate": 8.568388928317956e-05, + "loss": 0.035181736946105956, + "step": 100900 + }, + { + "epoch": 14.323633782824698, + "grad_norm": 4.738697528839111, + "learning_rate": 8.568246983676366e-05, + "loss": 0.04868959188461304, + "step": 100910 + }, + { + "epoch": 14.325053229240597, + "grad_norm": 0.3947387635707855, + "learning_rate": 8.568105039034777e-05, + "loss": 
0.0034671925008296967, + "step": 100920 + }, + { + "epoch": 14.326472675656493, + "grad_norm": 1.7262924909591675, + "learning_rate": 8.567963094393187e-05, + "loss": 0.03679071366786957, + "step": 100930 + }, + { + "epoch": 14.327892122072392, + "grad_norm": 5.729528427124023, + "learning_rate": 8.567821149751597e-05, + "loss": 0.03186109662055969, + "step": 100940 + }, + { + "epoch": 14.32931156848829, + "grad_norm": 10.659774780273438, + "learning_rate": 8.567679205110008e-05, + "loss": 0.05918008089065552, + "step": 100950 + }, + { + "epoch": 14.330731014904188, + "grad_norm": 0.5046432614326477, + "learning_rate": 8.567537260468418e-05, + "loss": 0.04549593031406403, + "step": 100960 + }, + { + "epoch": 14.332150461320085, + "grad_norm": 21.737239837646484, + "learning_rate": 8.567395315826829e-05, + "loss": 0.06181545257568359, + "step": 100970 + }, + { + "epoch": 14.333569907735983, + "grad_norm": 0.727480411529541, + "learning_rate": 8.567253371185237e-05, + "loss": 0.0723923921585083, + "step": 100980 + }, + { + "epoch": 14.334989354151881, + "grad_norm": 8.854576110839844, + "learning_rate": 8.567111426543648e-05, + "loss": 0.02641754150390625, + "step": 100990 + }, + { + "epoch": 14.336408800567778, + "grad_norm": 10.398995399475098, + "learning_rate": 8.566969481902058e-05, + "loss": 0.008688996732234954, + "step": 101000 + }, + { + "epoch": 14.336408800567778, + "eval_accuracy": 0.9741209385133847, + "eval_loss": 0.10290984809398651, + "eval_runtime": 30.9993, + "eval_samples_per_second": 507.334, + "eval_steps_per_second": 15.871, + "step": 101000 + }, + { + "epoch": 14.337828246983676, + "grad_norm": 2.215137243270874, + "learning_rate": 8.566827537260469e-05, + "loss": 0.024131688475608825, + "step": 101010 + }, + { + "epoch": 14.339247693399575, + "grad_norm": 0.03819451108574867, + "learning_rate": 8.566685592618879e-05, + "loss": 0.06235557794570923, + "step": 101020 + }, + { + "epoch": 14.340667139815473, + "grad_norm": 4.079460144042969, + 
"learning_rate": 8.56654364797729e-05, + "loss": 0.009826949238777161, + "step": 101030 + }, + { + "epoch": 14.34208658623137, + "grad_norm": 2.0592312812805176, + "learning_rate": 8.5664017033357e-05, + "loss": 0.022684365510940552, + "step": 101040 + }, + { + "epoch": 14.343506032647268, + "grad_norm": 0.500636637210846, + "learning_rate": 8.56625975869411e-05, + "loss": 0.0208365797996521, + "step": 101050 + }, + { + "epoch": 14.344925479063166, + "grad_norm": 0.7281819581985474, + "learning_rate": 8.56611781405252e-05, + "loss": 0.009615353494882583, + "step": 101060 + }, + { + "epoch": 14.346344925479062, + "grad_norm": 1.5502413511276245, + "learning_rate": 8.56597586941093e-05, + "loss": 0.08351185917854309, + "step": 101070 + }, + { + "epoch": 14.34776437189496, + "grad_norm": 0.9243219494819641, + "learning_rate": 8.565833924769341e-05, + "loss": 0.0019005615264177322, + "step": 101080 + }, + { + "epoch": 14.349183818310859, + "grad_norm": 0.04442301765084267, + "learning_rate": 8.56569198012775e-05, + "loss": 0.013065469264984132, + "step": 101090 + }, + { + "epoch": 14.350603264726757, + "grad_norm": 0.8222534656524658, + "learning_rate": 8.565550035486161e-05, + "loss": 0.019597794115543365, + "step": 101100 + }, + { + "epoch": 14.352022711142654, + "grad_norm": 1.4117416143417358, + "learning_rate": 8.56540809084457e-05, + "loss": 0.02556995749473572, + "step": 101110 + }, + { + "epoch": 14.353442157558552, + "grad_norm": 1.0706210136413574, + "learning_rate": 8.565266146202982e-05, + "loss": 0.025179722905158998, + "step": 101120 + }, + { + "epoch": 14.35486160397445, + "grad_norm": 0.07932776212692261, + "learning_rate": 8.565124201561391e-05, + "loss": 0.04019379019737244, + "step": 101130 + }, + { + "epoch": 14.356281050390347, + "grad_norm": 0.0066423784010112286, + "learning_rate": 8.564982256919801e-05, + "loss": 0.004692386835813522, + "step": 101140 + }, + { + "epoch": 14.357700496806245, + "grad_norm": 0.3343643844127655, + "learning_rate": 
8.564840312278212e-05, + "loss": 0.01938415616750717, + "step": 101150 + }, + { + "epoch": 14.359119943222144, + "grad_norm": 0.09557344764471054, + "learning_rate": 8.564698367636622e-05, + "loss": 0.010768187046051026, + "step": 101160 + }, + { + "epoch": 14.360539389638042, + "grad_norm": 0.10270483791828156, + "learning_rate": 8.564556422995033e-05, + "loss": 0.028739473223686217, + "step": 101170 + }, + { + "epoch": 14.361958836053939, + "grad_norm": 2.6416287422180176, + "learning_rate": 8.564414478353443e-05, + "loss": 0.03519602417945862, + "step": 101180 + }, + { + "epoch": 14.363378282469837, + "grad_norm": 4.812742710113525, + "learning_rate": 8.564272533711852e-05, + "loss": 0.042847877740859984, + "step": 101190 + }, + { + "epoch": 14.364797728885735, + "grad_norm": 0.6029514670372009, + "learning_rate": 8.564130589070262e-05, + "loss": 0.05594496130943298, + "step": 101200 + }, + { + "epoch": 14.366217175301632, + "grad_norm": 5.7831196784973145, + "learning_rate": 8.563988644428673e-05, + "loss": 0.03804347813129425, + "step": 101210 + }, + { + "epoch": 14.36763662171753, + "grad_norm": 0.9964185357093811, + "learning_rate": 8.563846699787083e-05, + "loss": 0.007220058888196945, + "step": 101220 + }, + { + "epoch": 14.369056068133428, + "grad_norm": 0.31065285205841064, + "learning_rate": 8.563704755145494e-05, + "loss": 0.03414974212646484, + "step": 101230 + }, + { + "epoch": 14.370475514549327, + "grad_norm": 0.042580220848321915, + "learning_rate": 8.563562810503904e-05, + "loss": 0.028570058941841125, + "step": 101240 + }, + { + "epoch": 14.371894960965223, + "grad_norm": 0.6354950666427612, + "learning_rate": 8.563420865862314e-05, + "loss": 0.02657265365123749, + "step": 101250 + }, + { + "epoch": 14.373314407381121, + "grad_norm": 0.13414810597896576, + "learning_rate": 8.563278921220725e-05, + "loss": 0.0045540835708379745, + "step": 101260 + }, + { + "epoch": 14.37473385379702, + "grad_norm": 2.9977593421936035, + "learning_rate": 
8.563136976579134e-05, + "loss": 0.028857874870300292, + "step": 101270 + }, + { + "epoch": 14.376153300212916, + "grad_norm": 0.01566135697066784, + "learning_rate": 8.562995031937545e-05, + "loss": 0.005179446935653686, + "step": 101280 + }, + { + "epoch": 14.377572746628815, + "grad_norm": 0.35850459337234497, + "learning_rate": 8.562853087295954e-05, + "loss": 0.03362755179405212, + "step": 101290 + }, + { + "epoch": 14.378992193044713, + "grad_norm": 0.2453339397907257, + "learning_rate": 8.562711142654365e-05, + "loss": 0.008520130068063736, + "step": 101300 + }, + { + "epoch": 14.380411639460611, + "grad_norm": 0.022847548127174377, + "learning_rate": 8.562569198012775e-05, + "loss": 0.03245726227760315, + "step": 101310 + }, + { + "epoch": 14.381831085876508, + "grad_norm": 0.7326244711875916, + "learning_rate": 8.562427253371186e-05, + "loss": 0.015918411314487457, + "step": 101320 + }, + { + "epoch": 14.383250532292406, + "grad_norm": 2.888026714324951, + "learning_rate": 8.562285308729597e-05, + "loss": 0.022091734409332275, + "step": 101330 + }, + { + "epoch": 14.384669978708304, + "grad_norm": 0.3161379098892212, + "learning_rate": 8.562143364088005e-05, + "loss": 0.019298197329044343, + "step": 101340 + }, + { + "epoch": 14.3860894251242, + "grad_norm": 0.21936295926570892, + "learning_rate": 8.562001419446416e-05, + "loss": 0.011969022452831268, + "step": 101350 + }, + { + "epoch": 14.3875088715401, + "grad_norm": 4.343912601470947, + "learning_rate": 8.561859474804826e-05, + "loss": 0.018465761840343476, + "step": 101360 + }, + { + "epoch": 14.388928317955997, + "grad_norm": 1.221403956413269, + "learning_rate": 8.561717530163237e-05, + "loss": 0.005646177381277084, + "step": 101370 + }, + { + "epoch": 14.390347764371896, + "grad_norm": 0.12888500094413757, + "learning_rate": 8.561575585521647e-05, + "loss": 0.06626540422439575, + "step": 101380 + }, + { + "epoch": 14.391767210787792, + "grad_norm": 0.5282251834869385, + "learning_rate": 
8.561433640880058e-05, + "loss": 0.01441122591495514, + "step": 101390 + }, + { + "epoch": 14.39318665720369, + "grad_norm": 3.577329397201538, + "learning_rate": 8.561291696238466e-05, + "loss": 0.0062760643661022185, + "step": 101400 + }, + { + "epoch": 14.394606103619589, + "grad_norm": 1.0349695682525635, + "learning_rate": 8.561149751596878e-05, + "loss": 0.01954812407493591, + "step": 101410 + }, + { + "epoch": 14.396025550035485, + "grad_norm": 3.2017345428466797, + "learning_rate": 8.561007806955289e-05, + "loss": 0.014151862263679505, + "step": 101420 + }, + { + "epoch": 14.397444996451384, + "grad_norm": 1.0517817735671997, + "learning_rate": 8.560865862313698e-05, + "loss": 0.03300471901893616, + "step": 101430 + }, + { + "epoch": 14.398864442867282, + "grad_norm": 0.044288262724876404, + "learning_rate": 8.56072391767211e-05, + "loss": 0.004322785884141922, + "step": 101440 + }, + { + "epoch": 14.40028388928318, + "grad_norm": 1.9268205165863037, + "learning_rate": 8.560581973030518e-05, + "loss": 0.024862904846668244, + "step": 101450 + }, + { + "epoch": 14.401703335699077, + "grad_norm": 0.012334626168012619, + "learning_rate": 8.560440028388929e-05, + "loss": 0.019610564410686492, + "step": 101460 + }, + { + "epoch": 14.403122782114975, + "grad_norm": null, + "learning_rate": 8.560298083747339e-05, + "loss": 0.09853307008743287, + "step": 101470 + }, + { + "epoch": 14.404542228530874, + "grad_norm": 12.455414772033691, + "learning_rate": 8.560170333569908e-05, + "loss": 0.048549103736877444, + "step": 101480 + }, + { + "epoch": 14.40596167494677, + "grad_norm": 0.7663528919219971, + "learning_rate": 8.560028388928318e-05, + "loss": 0.06977788805961609, + "step": 101490 + }, + { + "epoch": 14.407381121362668, + "grad_norm": 0.030371706932783127, + "learning_rate": 8.559886444286729e-05, + "loss": 0.01331292986869812, + "step": 101500 + }, + { + "epoch": 14.407381121362668, + "eval_accuracy": 0.9783175430787817, + "eval_loss": 0.09227219969034195, 
+ "eval_runtime": 30.3574, + "eval_samples_per_second": 518.061, + "eval_steps_per_second": 16.207, + "step": 101500 + }, + { + "epoch": 14.408800567778567, + "grad_norm": 0.9815260767936707, + "learning_rate": 8.559744499645139e-05, + "loss": 0.04523753821849823, + "step": 101510 + }, + { + "epoch": 14.410220014194465, + "grad_norm": 4.4220428466796875, + "learning_rate": 8.559602555003549e-05, + "loss": 0.025368493795394898, + "step": 101520 + }, + { + "epoch": 14.411639460610361, + "grad_norm": 0.2869272530078888, + "learning_rate": 8.559460610361959e-05, + "loss": 0.006363069266080856, + "step": 101530 + }, + { + "epoch": 14.41305890702626, + "grad_norm": 0.7667129039764404, + "learning_rate": 8.55931866572037e-05, + "loss": 0.011126606166362763, + "step": 101540 + }, + { + "epoch": 14.414478353442158, + "grad_norm": 0.27252647280693054, + "learning_rate": 8.55917672107878e-05, + "loss": 0.04273441433906555, + "step": 101550 + }, + { + "epoch": 14.415897799858055, + "grad_norm": 11.699451446533203, + "learning_rate": 8.55903477643719e-05, + "loss": 0.016938979923725127, + "step": 101560 + }, + { + "epoch": 14.417317246273953, + "grad_norm": 3.4000279903411865, + "learning_rate": 8.5588928317956e-05, + "loss": 0.0042859077453613285, + "step": 101570 + }, + { + "epoch": 14.418736692689851, + "grad_norm": 0.18611261248588562, + "learning_rate": 8.55875088715401e-05, + "loss": 0.02584313750267029, + "step": 101580 + }, + { + "epoch": 14.42015613910575, + "grad_norm": 0.011047269217669964, + "learning_rate": 8.558608942512421e-05, + "loss": 0.017075327038764954, + "step": 101590 + }, + { + "epoch": 14.421575585521646, + "grad_norm": 7.181526184082031, + "learning_rate": 8.558466997870831e-05, + "loss": 0.010425643622875213, + "step": 101600 + }, + { + "epoch": 14.422995031937544, + "grad_norm": 0.1424635797739029, + "learning_rate": 8.558325053229242e-05, + "loss": 0.01081070452928543, + "step": 101610 + }, + { + "epoch": 14.424414478353443, + "grad_norm": 
0.01988801546394825, + "learning_rate": 8.55818310858765e-05, + "loss": 0.004163437709212303, + "step": 101620 + }, + { + "epoch": 14.42583392476934, + "grad_norm": 0.0159855168312788, + "learning_rate": 8.558041163946061e-05, + "loss": 0.05156056880950928, + "step": 101630 + }, + { + "epoch": 14.427253371185238, + "grad_norm": 2.1559231281280518, + "learning_rate": 8.557899219304471e-05, + "loss": 0.0501996636390686, + "step": 101640 + }, + { + "epoch": 14.428672817601136, + "grad_norm": 7.339823246002197, + "learning_rate": 8.557757274662882e-05, + "loss": 0.02520217001438141, + "step": 101650 + }, + { + "epoch": 14.430092264017034, + "grad_norm": 4.2832746505737305, + "learning_rate": 8.557615330021292e-05, + "loss": 0.011065931618213653, + "step": 101660 + }, + { + "epoch": 14.43151171043293, + "grad_norm": 8.455238342285156, + "learning_rate": 8.557473385379702e-05, + "loss": 0.04223898947238922, + "step": 101670 + }, + { + "epoch": 14.432931156848829, + "grad_norm": 0.3069610893726349, + "learning_rate": 8.557331440738113e-05, + "loss": 0.01566736549139023, + "step": 101680 + }, + { + "epoch": 14.434350603264727, + "grad_norm": 6.469844341278076, + "learning_rate": 8.557189496096522e-05, + "loss": 0.026878052949905397, + "step": 101690 + }, + { + "epoch": 14.435770049680624, + "grad_norm": 6.7413554191589355, + "learning_rate": 8.557047551454934e-05, + "loss": 0.050300925970077515, + "step": 101700 + }, + { + "epoch": 14.437189496096522, + "grad_norm": 1.7050913572311401, + "learning_rate": 8.556905606813343e-05, + "loss": 0.01144518330693245, + "step": 101710 + }, + { + "epoch": 14.43860894251242, + "grad_norm": 8.869119644165039, + "learning_rate": 8.556763662171754e-05, + "loss": 0.021273121237754822, + "step": 101720 + }, + { + "epoch": 14.440028388928319, + "grad_norm": 0.823733389377594, + "learning_rate": 8.556621717530163e-05, + "loss": 0.015097922086715699, + "step": 101730 + }, + { + "epoch": 14.441447835344215, + "grad_norm": 2.2480180263519287, + 
"learning_rate": 8.556479772888574e-05, + "loss": 0.058521485328674315, + "step": 101740 + }, + { + "epoch": 14.442867281760114, + "grad_norm": 3.448361873626709, + "learning_rate": 8.556337828246984e-05, + "loss": 0.012908448278903962, + "step": 101750 + }, + { + "epoch": 14.444286728176012, + "grad_norm": 7.998870849609375, + "learning_rate": 8.556195883605395e-05, + "loss": 0.013298434019088746, + "step": 101760 + }, + { + "epoch": 14.445706174591908, + "grad_norm": 6.137280464172363, + "learning_rate": 8.556053938963804e-05, + "loss": 0.016340239346027373, + "step": 101770 + }, + { + "epoch": 14.447125621007807, + "grad_norm": 5.501545429229736, + "learning_rate": 8.555911994322214e-05, + "loss": 0.03287135660648346, + "step": 101780 + }, + { + "epoch": 14.448545067423705, + "grad_norm": 1.2825032472610474, + "learning_rate": 8.555770049680625e-05, + "loss": 0.043364471197128295, + "step": 101790 + }, + { + "epoch": 14.449964513839603, + "grad_norm": 10.023609161376953, + "learning_rate": 8.555628105039035e-05, + "loss": 0.028557685017585755, + "step": 101800 + }, + { + "epoch": 14.4513839602555, + "grad_norm": 1.2176203727722168, + "learning_rate": 8.555486160397446e-05, + "loss": 0.043269476294517516, + "step": 101810 + }, + { + "epoch": 14.452803406671398, + "grad_norm": 0.30955979228019714, + "learning_rate": 8.555344215755856e-05, + "loss": 0.01596580147743225, + "step": 101820 + }, + { + "epoch": 14.454222853087296, + "grad_norm": 0.026654871180653572, + "learning_rate": 8.555202271114266e-05, + "loss": 0.0021469760686159134, + "step": 101830 + }, + { + "epoch": 14.455642299503193, + "grad_norm": 3.4512853622436523, + "learning_rate": 8.555060326472675e-05, + "loss": 0.04226863384246826, + "step": 101840 + }, + { + "epoch": 14.457061745919091, + "grad_norm": 0.1448906511068344, + "learning_rate": 8.554918381831086e-05, + "loss": 0.05672473907470703, + "step": 101850 + }, + { + "epoch": 14.45848119233499, + "grad_norm": 1.955114722251892, + 
"learning_rate": 8.554776437189496e-05, + "loss": 0.012461932003498077, + "step": 101860 + }, + { + "epoch": 14.459900638750888, + "grad_norm": 0.15195594727993011, + "learning_rate": 8.554634492547907e-05, + "loss": 0.014708581566810607, + "step": 101870 + }, + { + "epoch": 14.461320085166784, + "grad_norm": 0.371501624584198, + "learning_rate": 8.554492547906317e-05, + "loss": 0.04243779182434082, + "step": 101880 + }, + { + "epoch": 14.462739531582683, + "grad_norm": 14.460862159729004, + "learning_rate": 8.554350603264727e-05, + "loss": 0.03571424782276154, + "step": 101890 + }, + { + "epoch": 14.464158977998581, + "grad_norm": 9.164223670959473, + "learning_rate": 8.554208658623138e-05, + "loss": 0.030497419834136962, + "step": 101900 + }, + { + "epoch": 14.465578424414478, + "grad_norm": 4.030511856079102, + "learning_rate": 8.554066713981548e-05, + "loss": 0.037016880512237546, + "step": 101910 + }, + { + "epoch": 14.466997870830376, + "grad_norm": 4.120282173156738, + "learning_rate": 8.553924769339959e-05, + "loss": 0.04612137973308563, + "step": 101920 + }, + { + "epoch": 14.468417317246274, + "grad_norm": 0.1960458755493164, + "learning_rate": 8.553782824698367e-05, + "loss": 0.011224465072154998, + "step": 101930 + }, + { + "epoch": 14.469836763662173, + "grad_norm": 1.459161639213562, + "learning_rate": 8.553640880056778e-05, + "loss": 0.02568800449371338, + "step": 101940 + }, + { + "epoch": 14.471256210078069, + "grad_norm": 0.141305074095726, + "learning_rate": 8.553498935415188e-05, + "loss": 0.018661434948444366, + "step": 101950 + }, + { + "epoch": 14.472675656493967, + "grad_norm": 1.7660448551177979, + "learning_rate": 8.553356990773599e-05, + "loss": 0.011808550357818604, + "step": 101960 + }, + { + "epoch": 14.474095102909866, + "grad_norm": 0.19055935740470886, + "learning_rate": 8.553215046132009e-05, + "loss": 0.013704870641231538, + "step": 101970 + }, + { + "epoch": 14.475514549325762, + "grad_norm": 0.24393945932388306, + 
"learning_rate": 8.553073101490418e-05, + "loss": 0.03193598985671997, + "step": 101980 + }, + { + "epoch": 14.47693399574166, + "grad_norm": 3.6609933376312256, + "learning_rate": 8.55293115684883e-05, + "loss": 0.0095102459192276, + "step": 101990 + }, + { + "epoch": 14.478353442157559, + "grad_norm": 8.48030948638916, + "learning_rate": 8.552789212207239e-05, + "loss": 0.027347564697265625, + "step": 102000 + }, + { + "epoch": 14.478353442157559, + "eval_accuracy": 0.9863292427036306, + "eval_loss": 0.04714876785874367, + "eval_runtime": 31.0801, + "eval_samples_per_second": 506.015, + "eval_steps_per_second": 15.83, + "step": 102000 + }, + { + "epoch": 14.479772888573457, + "grad_norm": 0.44472378492355347, + "learning_rate": 8.55264726756565e-05, + "loss": 0.00895627737045288, + "step": 102010 + }, + { + "epoch": 14.481192334989354, + "grad_norm": 0.016872040927410126, + "learning_rate": 8.55250532292406e-05, + "loss": 0.026319512724876405, + "step": 102020 + }, + { + "epoch": 14.482611781405252, + "grad_norm": 0.049058735370635986, + "learning_rate": 8.55236337828247e-05, + "loss": 0.020491470396518708, + "step": 102030 + }, + { + "epoch": 14.48403122782115, + "grad_norm": 0.06828688085079193, + "learning_rate": 8.55222143364088e-05, + "loss": 0.029941469430923462, + "step": 102040 + }, + { + "epoch": 14.485450674237047, + "grad_norm": 0.29337137937545776, + "learning_rate": 8.55207948899929e-05, + "loss": 0.010589533299207688, + "step": 102050 + }, + { + "epoch": 14.486870120652945, + "grad_norm": 1.7915858030319214, + "learning_rate": 8.5519375443577e-05, + "loss": 0.0408454954624176, + "step": 102060 + }, + { + "epoch": 14.488289567068843, + "grad_norm": 0.12905509769916534, + "learning_rate": 8.551795599716111e-05, + "loss": 0.040449097752571106, + "step": 102070 + }, + { + "epoch": 14.489709013484742, + "grad_norm": 1.493906021118164, + "learning_rate": 8.551653655074521e-05, + "loss": 0.01022346168756485, + "step": 102080 + }, + { + "epoch": 
14.491128459900638, + "grad_norm": 1.35050368309021, + "learning_rate": 8.551511710432931e-05, + "loss": 0.00817287564277649, + "step": 102090 + }, + { + "epoch": 14.492547906316537, + "grad_norm": 0.04836224392056465, + "learning_rate": 8.551369765791342e-05, + "loss": 0.017035089433193207, + "step": 102100 + }, + { + "epoch": 14.493967352732435, + "grad_norm": 0.1411861628293991, + "learning_rate": 8.551227821149752e-05, + "loss": 0.04491946399211884, + "step": 102110 + }, + { + "epoch": 14.495386799148331, + "grad_norm": 0.44113588333129883, + "learning_rate": 8.551085876508163e-05, + "loss": 0.007547316700220108, + "step": 102120 + }, + { + "epoch": 14.49680624556423, + "grad_norm": 10.944716453552246, + "learning_rate": 8.550943931866573e-05, + "loss": 0.02461606413125992, + "step": 102130 + }, + { + "epoch": 14.498225691980128, + "grad_norm": 1.2483770847320557, + "learning_rate": 8.550801987224982e-05, + "loss": 0.012789353728294373, + "step": 102140 + }, + { + "epoch": 14.499645138396026, + "grad_norm": 6.052944660186768, + "learning_rate": 8.550660042583392e-05, + "loss": 0.010456231236457825, + "step": 102150 + }, + { + "epoch": 14.501064584811923, + "grad_norm": 1.0382543802261353, + "learning_rate": 8.550518097941803e-05, + "loss": 0.026821547746658327, + "step": 102160 + }, + { + "epoch": 14.502484031227821, + "grad_norm": 0.07013286650180817, + "learning_rate": 8.550376153300213e-05, + "loss": 0.027486056089401245, + "step": 102170 + }, + { + "epoch": 14.50390347764372, + "grad_norm": 0.03486839681863785, + "learning_rate": 8.550234208658624e-05, + "loss": 0.10720919370651245, + "step": 102180 + }, + { + "epoch": 14.505322924059616, + "grad_norm": 0.2477417290210724, + "learning_rate": 8.550092264017034e-05, + "loss": 0.014333716034889222, + "step": 102190 + }, + { + "epoch": 14.506742370475514, + "grad_norm": 0.23805853724479675, + "learning_rate": 8.549950319375443e-05, + "loss": 0.018902239203453065, + "step": 102200 + }, + { + "epoch": 
14.508161816891413, + "grad_norm": 0.04487096145749092, + "learning_rate": 8.549808374733855e-05, + "loss": 0.013421098887920379, + "step": 102210 + }, + { + "epoch": 14.509581263307311, + "grad_norm": 0.6457217931747437, + "learning_rate": 8.549666430092264e-05, + "loss": 0.06798742413520813, + "step": 102220 + }, + { + "epoch": 14.511000709723207, + "grad_norm": 0.8722721934318542, + "learning_rate": 8.549524485450675e-05, + "loss": 0.019697029888629914, + "step": 102230 + }, + { + "epoch": 14.512420156139106, + "grad_norm": 0.5466771721839905, + "learning_rate": 8.549382540809084e-05, + "loss": 0.016688692569732665, + "step": 102240 + }, + { + "epoch": 14.513839602555004, + "grad_norm": 0.023246901109814644, + "learning_rate": 8.549240596167495e-05, + "loss": 0.005826687440276146, + "step": 102250 + }, + { + "epoch": 14.5152590489709, + "grad_norm": 0.37465423345565796, + "learning_rate": 8.549098651525905e-05, + "loss": 0.017506250739097597, + "step": 102260 + }, + { + "epoch": 14.516678495386799, + "grad_norm": 0.033044762909412384, + "learning_rate": 8.548956706884316e-05, + "loss": 0.05119068622589111, + "step": 102270 + }, + { + "epoch": 14.518097941802697, + "grad_norm": 8.139904975891113, + "learning_rate": 8.548814762242727e-05, + "loss": 0.053800547122955324, + "step": 102280 + }, + { + "epoch": 14.519517388218595, + "grad_norm": 5.402328014373779, + "learning_rate": 8.548672817601135e-05, + "loss": 0.009679973125457764, + "step": 102290 + }, + { + "epoch": 14.520936834634492, + "grad_norm": 0.05474318191409111, + "learning_rate": 8.548530872959546e-05, + "loss": 0.005274232476949692, + "step": 102300 + }, + { + "epoch": 14.52235628105039, + "grad_norm": 0.6504866480827332, + "learning_rate": 8.548388928317956e-05, + "loss": 0.02782217264175415, + "step": 102310 + }, + { + "epoch": 14.523775727466289, + "grad_norm": 0.7330292463302612, + "learning_rate": 8.548246983676367e-05, + "loss": 0.08225621581077576, + "step": 102320 + }, + { + "epoch": 
14.525195173882185, + "grad_norm": 0.9900355339050293, + "learning_rate": 8.548105039034777e-05, + "loss": 0.006571047753095627, + "step": 102330 + }, + { + "epoch": 14.526614620298083, + "grad_norm": 0.028682060539722443, + "learning_rate": 8.547963094393187e-05, + "loss": 0.0058587446808815, + "step": 102340 + }, + { + "epoch": 14.528034066713982, + "grad_norm": 0.07497026026248932, + "learning_rate": 8.547821149751596e-05, + "loss": 0.004919090494513512, + "step": 102350 + }, + { + "epoch": 14.52945351312988, + "grad_norm": 1.8536876440048218, + "learning_rate": 8.547679205110007e-05, + "loss": 0.011548362672328949, + "step": 102360 + }, + { + "epoch": 14.530872959545777, + "grad_norm": 0.7886669039726257, + "learning_rate": 8.547537260468418e-05, + "loss": 0.021945886313915253, + "step": 102370 + }, + { + "epoch": 14.532292405961675, + "grad_norm": 0.4612744450569153, + "learning_rate": 8.547395315826828e-05, + "loss": 0.051427000761032106, + "step": 102380 + }, + { + "epoch": 14.533711852377573, + "grad_norm": 0.23230144381523132, + "learning_rate": 8.547253371185238e-05, + "loss": 0.03924002945423126, + "step": 102390 + }, + { + "epoch": 14.53513129879347, + "grad_norm": 3.556591033935547, + "learning_rate": 8.547111426543648e-05, + "loss": 0.04937024414539337, + "step": 102400 + }, + { + "epoch": 14.536550745209368, + "grad_norm": 0.04170704260468483, + "learning_rate": 8.546969481902059e-05, + "loss": 0.05127858519554138, + "step": 102410 + }, + { + "epoch": 14.537970191625266, + "grad_norm": 7.728824615478516, + "learning_rate": 8.546827537260469e-05, + "loss": 0.05602442026138306, + "step": 102420 + }, + { + "epoch": 14.539389638041165, + "grad_norm": 5.621062755584717, + "learning_rate": 8.54668559261888e-05, + "loss": 0.017265281081199645, + "step": 102430 + }, + { + "epoch": 14.540809084457061, + "grad_norm": 0.029028164222836494, + "learning_rate": 8.54654364797729e-05, + "loss": 0.007049372792243958, + "step": 102440 + }, + { + "epoch": 
14.54222853087296, + "grad_norm": 0.06667573004961014, + "learning_rate": 8.546401703335699e-05, + "loss": 0.005283458903431892, + "step": 102450 + }, + { + "epoch": 14.543647977288858, + "grad_norm": 0.7624027729034424, + "learning_rate": 8.54625975869411e-05, + "loss": 0.017357051372528076, + "step": 102460 + }, + { + "epoch": 14.545067423704754, + "grad_norm": 0.05842301622033119, + "learning_rate": 8.54611781405252e-05, + "loss": 0.010766969621181488, + "step": 102470 + }, + { + "epoch": 14.546486870120653, + "grad_norm": 0.3239707350730896, + "learning_rate": 8.545975869410931e-05, + "loss": 0.010560451447963715, + "step": 102480 + }, + { + "epoch": 14.547906316536551, + "grad_norm": 2.594454526901245, + "learning_rate": 8.545833924769341e-05, + "loss": 0.03495635986328125, + "step": 102490 + }, + { + "epoch": 14.54932576295245, + "grad_norm": 0.0778871476650238, + "learning_rate": 8.54569198012775e-05, + "loss": 0.016888731718063356, + "step": 102500 + }, + { + "epoch": 14.54932576295245, + "eval_accuracy": 0.9813696191263432, + "eval_loss": 0.07888679951429367, + "eval_runtime": 30.2995, + "eval_samples_per_second": 519.051, + "eval_steps_per_second": 16.238, + "step": 102500 + }, + { + "epoch": 14.550745209368346, + "grad_norm": 0.047318752855062485, + "learning_rate": 8.54555003548616e-05, + "loss": 0.028882688283920287, + "step": 102510 + }, + { + "epoch": 14.552164655784244, + "grad_norm": 1.6493586301803589, + "learning_rate": 8.545408090844571e-05, + "loss": 0.04082152545452118, + "step": 102520 + }, + { + "epoch": 14.553584102200142, + "grad_norm": 1.9483829736709595, + "learning_rate": 8.545266146202981e-05, + "loss": 0.02645583152770996, + "step": 102530 + }, + { + "epoch": 14.555003548616039, + "grad_norm": 18.570716857910156, + "learning_rate": 8.545124201561392e-05, + "loss": 0.0315351814031601, + "step": 102540 + }, + { + "epoch": 14.556422995031937, + "grad_norm": 0.7574842572212219, + "learning_rate": 8.544982256919802e-05, + "loss": 
0.012972161173820496, + "step": 102550 + }, + { + "epoch": 14.557842441447836, + "grad_norm": 0.3595932722091675, + "learning_rate": 8.544840312278212e-05, + "loss": 0.0756956934928894, + "step": 102560 + }, + { + "epoch": 14.559261887863734, + "grad_norm": 4.002671718597412, + "learning_rate": 8.544698367636623e-05, + "loss": 0.04419144093990326, + "step": 102570 + }, + { + "epoch": 14.56068133427963, + "grad_norm": 0.07301048189401627, + "learning_rate": 8.544556422995032e-05, + "loss": 0.02443731129169464, + "step": 102580 + }, + { + "epoch": 14.562100780695529, + "grad_norm": 0.05954832211136818, + "learning_rate": 8.544414478353444e-05, + "loss": 0.02469266653060913, + "step": 102590 + }, + { + "epoch": 14.563520227111427, + "grad_norm": 0.6604858636856079, + "learning_rate": 8.544272533711852e-05, + "loss": 0.0301352858543396, + "step": 102600 + }, + { + "epoch": 14.564939673527324, + "grad_norm": 0.09414374083280563, + "learning_rate": 8.544130589070263e-05, + "loss": 0.010161099582910537, + "step": 102610 + }, + { + "epoch": 14.566359119943222, + "grad_norm": 0.8994581699371338, + "learning_rate": 8.543988644428673e-05, + "loss": 0.0168968603014946, + "step": 102620 + }, + { + "epoch": 14.56777856635912, + "grad_norm": 0.01584504544734955, + "learning_rate": 8.543846699787084e-05, + "loss": 0.04087940454483032, + "step": 102630 + }, + { + "epoch": 14.569198012775018, + "grad_norm": 1.0661892890930176, + "learning_rate": 8.543704755145494e-05, + "loss": 0.013919854164123535, + "step": 102640 + }, + { + "epoch": 14.570617459190915, + "grad_norm": 10.946099281311035, + "learning_rate": 8.543562810503903e-05, + "loss": 0.023270314931869505, + "step": 102650 + }, + { + "epoch": 14.572036905606813, + "grad_norm": 1.4906280040740967, + "learning_rate": 8.543420865862314e-05, + "loss": 0.03611093461513519, + "step": 102660 + }, + { + "epoch": 14.573456352022712, + "grad_norm": 0.058776769787073135, + "learning_rate": 8.543278921220724e-05, + "loss": 
0.011062215268611907, + "step": 102670 + }, + { + "epoch": 14.574875798438608, + "grad_norm": 1.3882954120635986, + "learning_rate": 8.543136976579135e-05, + "loss": 0.02433699816465378, + "step": 102680 + }, + { + "epoch": 14.576295244854506, + "grad_norm": 0.5981866121292114, + "learning_rate": 8.542995031937545e-05, + "loss": 0.017913120985031127, + "step": 102690 + }, + { + "epoch": 14.577714691270405, + "grad_norm": 0.05415025353431702, + "learning_rate": 8.542853087295955e-05, + "loss": 0.0577204167842865, + "step": 102700 + }, + { + "epoch": 14.579134137686303, + "grad_norm": 0.62937992811203, + "learning_rate": 8.542711142654364e-05, + "loss": 0.00876297652721405, + "step": 102710 + }, + { + "epoch": 14.5805535841022, + "grad_norm": 0.33898359537124634, + "learning_rate": 8.542569198012776e-05, + "loss": 0.05263091921806336, + "step": 102720 + }, + { + "epoch": 14.581973030518098, + "grad_norm": 0.11864381283521652, + "learning_rate": 8.542427253371185e-05, + "loss": 0.042475050687789916, + "step": 102730 + }, + { + "epoch": 14.583392476933996, + "grad_norm": 11.901185035705566, + "learning_rate": 8.542285308729596e-05, + "loss": 0.0330029308795929, + "step": 102740 + }, + { + "epoch": 14.584811923349893, + "grad_norm": 8.142967224121094, + "learning_rate": 8.542143364088006e-05, + "loss": 0.026455461978912354, + "step": 102750 + }, + { + "epoch": 14.586231369765791, + "grad_norm": 8.642476081848145, + "learning_rate": 8.542001419446416e-05, + "loss": 0.02869180738925934, + "step": 102760 + }, + { + "epoch": 14.58765081618169, + "grad_norm": 7.578298091888428, + "learning_rate": 8.541859474804827e-05, + "loss": 0.012754182517528533, + "step": 102770 + }, + { + "epoch": 14.589070262597588, + "grad_norm": 0.013344738632440567, + "learning_rate": 8.541717530163237e-05, + "loss": 0.047271886467933656, + "step": 102780 + }, + { + "epoch": 14.590489709013484, + "grad_norm": 0.1891198754310608, + "learning_rate": 8.541575585521648e-05, + "loss": 
0.004413479566574096, + "step": 102790 + }, + { + "epoch": 14.591909155429382, + "grad_norm": 0.11714489012956619, + "learning_rate": 8.541433640880058e-05, + "loss": 0.012677499651908874, + "step": 102800 + }, + { + "epoch": 14.59332860184528, + "grad_norm": 0.3130483031272888, + "learning_rate": 8.541291696238467e-05, + "loss": 0.005448834598064422, + "step": 102810 + }, + { + "epoch": 14.594748048261177, + "grad_norm": 3.508953094482422, + "learning_rate": 8.541149751596877e-05, + "loss": 0.03599079251289368, + "step": 102820 + }, + { + "epoch": 14.596167494677076, + "grad_norm": 0.023073973134160042, + "learning_rate": 8.541007806955288e-05, + "loss": 0.024753133952617645, + "step": 102830 + }, + { + "epoch": 14.597586941092974, + "grad_norm": 0.10521383583545685, + "learning_rate": 8.540865862313698e-05, + "loss": 0.017889854311943055, + "step": 102840 + }, + { + "epoch": 14.599006387508872, + "grad_norm": 14.080901145935059, + "learning_rate": 8.540723917672109e-05, + "loss": 0.020883312821388243, + "step": 102850 + }, + { + "epoch": 14.600425833924769, + "grad_norm": 13.305614471435547, + "learning_rate": 8.540581973030519e-05, + "loss": 0.03402230143547058, + "step": 102860 + }, + { + "epoch": 14.601845280340667, + "grad_norm": 12.037869453430176, + "learning_rate": 8.540440028388928e-05, + "loss": 0.05208061933517456, + "step": 102870 + }, + { + "epoch": 14.603264726756565, + "grad_norm": 0.06191776320338249, + "learning_rate": 8.54029808374734e-05, + "loss": 0.03488814234733582, + "step": 102880 + }, + { + "epoch": 14.604684173172462, + "grad_norm": 3.047244071960449, + "learning_rate": 8.540156139105749e-05, + "loss": 0.031791788339614865, + "step": 102890 + }, + { + "epoch": 14.60610361958836, + "grad_norm": 0.5617519021034241, + "learning_rate": 8.54001419446416e-05, + "loss": 0.021275760233402254, + "step": 102900 + }, + { + "epoch": 14.607523066004259, + "grad_norm": 7.143520355224609, + "learning_rate": 8.539872249822569e-05, + "loss": 
0.02951667606830597, + "step": 102910 + }, + { + "epoch": 14.608942512420157, + "grad_norm": 0.12056829035282135, + "learning_rate": 8.53973030518098e-05, + "loss": 0.006852982938289643, + "step": 102920 + }, + { + "epoch": 14.610361958836053, + "grad_norm": 2.9511711597442627, + "learning_rate": 8.53958836053939e-05, + "loss": 0.03665172159671783, + "step": 102930 + }, + { + "epoch": 14.611781405251952, + "grad_norm": 0.8545325398445129, + "learning_rate": 8.5394464158978e-05, + "loss": 0.028074532747268677, + "step": 102940 + }, + { + "epoch": 14.61320085166785, + "grad_norm": 0.16128212213516235, + "learning_rate": 8.53930447125621e-05, + "loss": 0.05112728476524353, + "step": 102950 + }, + { + "epoch": 14.614620298083747, + "grad_norm": 0.2537463307380676, + "learning_rate": 8.53916252661462e-05, + "loss": 0.030259785056114197, + "step": 102960 + }, + { + "epoch": 14.616039744499645, + "grad_norm": 0.07488785684108734, + "learning_rate": 8.539020581973031e-05, + "loss": 0.015859323740005492, + "step": 102970 + }, + { + "epoch": 14.617459190915543, + "grad_norm": 6.450938701629639, + "learning_rate": 8.538878637331441e-05, + "loss": 0.03825899958610535, + "step": 102980 + }, + { + "epoch": 14.618878637331441, + "grad_norm": 0.20200957357883453, + "learning_rate": 8.538736692689852e-05, + "loss": 0.010987403988838195, + "step": 102990 + }, + { + "epoch": 14.620298083747338, + "grad_norm": 2.432180166244507, + "learning_rate": 8.538594748048262e-05, + "loss": 0.020841056108474733, + "step": 103000 + }, + { + "epoch": 14.620298083747338, + "eval_accuracy": 0.9786990525847269, + "eval_loss": 0.07823944091796875, + "eval_runtime": 30.9251, + "eval_samples_per_second": 508.551, + "eval_steps_per_second": 15.909, + "step": 103000 + }, + { + "epoch": 14.621717530163236, + "grad_norm": 19.911706924438477, + "learning_rate": 8.538452803406672e-05, + "loss": 0.016798266768455507, + "step": 103010 + }, + { + "epoch": 14.623136976579135, + "grad_norm": 4.428212642669678, + 
"learning_rate": 8.538310858765081e-05, + "loss": 0.009555123746395111, + "step": 103020 + }, + { + "epoch": 14.624556422995031, + "grad_norm": 0.3057141900062561, + "learning_rate": 8.538168914123492e-05, + "loss": 0.03787533640861511, + "step": 103030 + }, + { + "epoch": 14.62597586941093, + "grad_norm": 0.4630641043186188, + "learning_rate": 8.538026969481902e-05, + "loss": 0.006291747093200684, + "step": 103040 + }, + { + "epoch": 14.627395315826828, + "grad_norm": 2.2272002696990967, + "learning_rate": 8.537885024840313e-05, + "loss": 0.011624373495578766, + "step": 103050 + }, + { + "epoch": 14.628814762242726, + "grad_norm": 10.10580825805664, + "learning_rate": 8.537743080198723e-05, + "loss": 0.049487939476966857, + "step": 103060 + }, + { + "epoch": 14.630234208658623, + "grad_norm": 3.810765027999878, + "learning_rate": 8.537601135557133e-05, + "loss": 0.053276628255844116, + "step": 103070 + }, + { + "epoch": 14.63165365507452, + "grad_norm": 0.27761077880859375, + "learning_rate": 8.537459190915544e-05, + "loss": 0.05323188304901123, + "step": 103080 + }, + { + "epoch": 14.63307310149042, + "grad_norm": 0.04070013016462326, + "learning_rate": 8.537317246273953e-05, + "loss": 0.0037203233689069746, + "step": 103090 + }, + { + "epoch": 14.634492547906316, + "grad_norm": 0.011536057107150555, + "learning_rate": 8.537175301632365e-05, + "loss": 0.019993194937705995, + "step": 103100 + }, + { + "epoch": 14.635911994322214, + "grad_norm": 0.10641834884881973, + "learning_rate": 8.537033356990774e-05, + "loss": 0.00649840384721756, + "step": 103110 + }, + { + "epoch": 14.637331440738112, + "grad_norm": 0.5350741744041443, + "learning_rate": 8.536891412349184e-05, + "loss": 0.009857784211635589, + "step": 103120 + }, + { + "epoch": 14.63875088715401, + "grad_norm": 0.06728602200746536, + "learning_rate": 8.536749467707594e-05, + "loss": 0.04699492752552033, + "step": 103130 + }, + { + "epoch": 14.640170333569907, + "grad_norm": 4.469810485839844, + 
"learning_rate": 8.536607523066005e-05, + "loss": 0.007526058703660965, + "step": 103140 + }, + { + "epoch": 14.641589779985805, + "grad_norm": 0.19321392476558685, + "learning_rate": 8.536465578424415e-05, + "loss": 0.012016779184341431, + "step": 103150 + }, + { + "epoch": 14.643009226401704, + "grad_norm": 0.7691546678543091, + "learning_rate": 8.536323633782826e-05, + "loss": 0.021825101971626282, + "step": 103160 + }, + { + "epoch": 14.6444286728176, + "grad_norm": 1.6768932342529297, + "learning_rate": 8.536181689141235e-05, + "loss": 0.014258480072021485, + "step": 103170 + }, + { + "epoch": 14.645848119233499, + "grad_norm": 0.3741374909877777, + "learning_rate": 8.536039744499645e-05, + "loss": 0.022825604677200316, + "step": 103180 + }, + { + "epoch": 14.647267565649397, + "grad_norm": 0.8053756356239319, + "learning_rate": 8.535897799858056e-05, + "loss": 0.013836804032325744, + "step": 103190 + }, + { + "epoch": 14.648687012065295, + "grad_norm": 0.02035684697329998, + "learning_rate": 8.535755855216466e-05, + "loss": 0.017511588335037232, + "step": 103200 + }, + { + "epoch": 14.650106458481192, + "grad_norm": 3.142700433731079, + "learning_rate": 8.535613910574877e-05, + "loss": 0.010556647181510925, + "step": 103210 + }, + { + "epoch": 14.65152590489709, + "grad_norm": 0.06115235015749931, + "learning_rate": 8.535471965933285e-05, + "loss": 0.013547767698764802, + "step": 103220 + }, + { + "epoch": 14.652945351312988, + "grad_norm": 0.029936334118247032, + "learning_rate": 8.535330021291697e-05, + "loss": 0.012430180609226228, + "step": 103230 + }, + { + "epoch": 14.654364797728885, + "grad_norm": 8.30500316619873, + "learning_rate": 8.535188076650106e-05, + "loss": 0.06894768476486206, + "step": 103240 + }, + { + "epoch": 14.655784244144783, + "grad_norm": 0.31151121854782104, + "learning_rate": 8.535046132008517e-05, + "loss": 0.02243182957172394, + "step": 103250 + }, + { + "epoch": 14.657203690560682, + "grad_norm": 3.7427921295166016, + 
"learning_rate": 8.534904187366927e-05, + "loss": 0.05491254329681396, + "step": 103260 + }, + { + "epoch": 14.65862313697658, + "grad_norm": 6.065624237060547, + "learning_rate": 8.534762242725337e-05, + "loss": 0.02661764919757843, + "step": 103270 + }, + { + "epoch": 14.660042583392476, + "grad_norm": 0.13813666999340057, + "learning_rate": 8.534620298083748e-05, + "loss": 0.007654508948326111, + "step": 103280 + }, + { + "epoch": 14.661462029808375, + "grad_norm": 0.08765114843845367, + "learning_rate": 8.534478353442158e-05, + "loss": 0.017231097817420958, + "step": 103290 + }, + { + "epoch": 14.662881476224273, + "grad_norm": 0.5863285064697266, + "learning_rate": 8.534336408800569e-05, + "loss": 0.030512651801109313, + "step": 103300 + }, + { + "epoch": 14.66430092264017, + "grad_norm": 0.07344143837690353, + "learning_rate": 8.534194464158979e-05, + "loss": 0.009811799228191375, + "step": 103310 + }, + { + "epoch": 14.665720369056068, + "grad_norm": 6.590825080871582, + "learning_rate": 8.534052519517388e-05, + "loss": 0.008233676850795745, + "step": 103320 + }, + { + "epoch": 14.667139815471966, + "grad_norm": 6.864149570465088, + "learning_rate": 8.533910574875798e-05, + "loss": 0.010529959201812744, + "step": 103330 + }, + { + "epoch": 14.668559261887864, + "grad_norm": 0.1694544553756714, + "learning_rate": 8.533768630234209e-05, + "loss": 0.022904330492019655, + "step": 103340 + }, + { + "epoch": 14.669978708303761, + "grad_norm": 0.5476866960525513, + "learning_rate": 8.533626685592619e-05, + "loss": 0.030253446102142333, + "step": 103350 + }, + { + "epoch": 14.67139815471966, + "grad_norm": 0.27971404790878296, + "learning_rate": 8.53348474095103e-05, + "loss": 0.030365151166915894, + "step": 103360 + }, + { + "epoch": 14.672817601135558, + "grad_norm": 0.12163878977298737, + "learning_rate": 8.53334279630944e-05, + "loss": 0.018135757744312288, + "step": 103370 + }, + { + "epoch": 14.674237047551454, + "grad_norm": 1.1759569644927979, + 
"learning_rate": 8.53320085166785e-05, + "loss": 0.016406714916229248, + "step": 103380 + }, + { + "epoch": 14.675656493967352, + "grad_norm": 9.473148345947266, + "learning_rate": 8.53305890702626e-05, + "loss": 0.08271357417106628, + "step": 103390 + }, + { + "epoch": 14.67707594038325, + "grad_norm": 12.95916748046875, + "learning_rate": 8.53291696238467e-05, + "loss": 0.05103471279144287, + "step": 103400 + }, + { + "epoch": 14.678495386799149, + "grad_norm": 0.7540145516395569, + "learning_rate": 8.532775017743081e-05, + "loss": 0.06290702819824219, + "step": 103410 + }, + { + "epoch": 14.679914833215046, + "grad_norm": 0.44665971398353577, + "learning_rate": 8.53263307310149e-05, + "loss": 0.015360213816165924, + "step": 103420 + }, + { + "epoch": 14.681334279630944, + "grad_norm": 0.027483150362968445, + "learning_rate": 8.532491128459901e-05, + "loss": 0.013640022277832032, + "step": 103430 + }, + { + "epoch": 14.682753726046842, + "grad_norm": 0.5103288292884827, + "learning_rate": 8.53234918381831e-05, + "loss": 0.003732307255268097, + "step": 103440 + }, + { + "epoch": 14.684173172462739, + "grad_norm": 0.006598861422389746, + "learning_rate": 8.532207239176722e-05, + "loss": 0.001814265176653862, + "step": 103450 + }, + { + "epoch": 14.685592618878637, + "grad_norm": 0.10724376887083054, + "learning_rate": 8.532065294535131e-05, + "loss": 0.004641066119074821, + "step": 103460 + }, + { + "epoch": 14.687012065294535, + "grad_norm": 0.1681101769208908, + "learning_rate": 8.531923349893542e-05, + "loss": 0.021464285254478455, + "step": 103470 + }, + { + "epoch": 14.688431511710434, + "grad_norm": 0.023226885125041008, + "learning_rate": 8.531781405251952e-05, + "loss": 0.012756478786468507, + "step": 103480 + }, + { + "epoch": 14.68985095812633, + "grad_norm": 1.445003867149353, + "learning_rate": 8.531639460610362e-05, + "loss": 0.017902496457099914, + "step": 103490 + }, + { + "epoch": 14.691270404542228, + "grad_norm": 0.3953350782394409, + 
"learning_rate": 8.531497515968773e-05, + "loss": 0.012959374487400055, + "step": 103500 + }, + { + "epoch": 14.691270404542228, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.041505590081214905, + "eval_runtime": 31.1689, + "eval_samples_per_second": 504.573, + "eval_steps_per_second": 15.785, + "step": 103500 + }, + { + "epoch": 14.692689850958127, + "grad_norm": 0.17475774884223938, + "learning_rate": 8.531355571327183e-05, + "loss": 0.023621699213981627, + "step": 103510 + }, + { + "epoch": 14.694109297374023, + "grad_norm": 3.899446487426758, + "learning_rate": 8.531213626685594e-05, + "loss": 0.03020642101764679, + "step": 103520 + }, + { + "epoch": 14.695528743789922, + "grad_norm": 3.107422113418579, + "learning_rate": 8.531071682044002e-05, + "loss": 0.03576629757881165, + "step": 103530 + }, + { + "epoch": 14.69694819020582, + "grad_norm": 0.12557634711265564, + "learning_rate": 8.530929737402413e-05, + "loss": 0.026172888278961182, + "step": 103540 + }, + { + "epoch": 14.698367636621718, + "grad_norm": 0.07338567078113556, + "learning_rate": 8.530787792760823e-05, + "loss": 0.0036056622862815856, + "step": 103550 + }, + { + "epoch": 14.699787083037615, + "grad_norm": 1.2753487825393677, + "learning_rate": 8.530660042583393e-05, + "loss": 0.030322042107582093, + "step": 103560 + }, + { + "epoch": 14.701206529453513, + "grad_norm": 0.45679599046707153, + "learning_rate": 8.530518097941803e-05, + "loss": 0.013920623064041137, + "step": 103570 + }, + { + "epoch": 14.702625975869411, + "grad_norm": 0.35328221321105957, + "learning_rate": 8.530376153300214e-05, + "loss": 0.03288788497447968, + "step": 103580 + }, + { + "epoch": 14.704045422285308, + "grad_norm": 0.04782756417989731, + "learning_rate": 8.530234208658624e-05, + "loss": 0.0034589260816574098, + "step": 103590 + }, + { + "epoch": 14.705464868701206, + "grad_norm": 1.0690654516220093, + "learning_rate": 8.530092264017033e-05, + "loss": 0.005941495299339294, + "step": 103600 + }, + { + 
"epoch": 14.706884315117104, + "grad_norm": 6.147830486297607, + "learning_rate": 8.529950319375444e-05, + "loss": 0.01815780848264694, + "step": 103610 + }, + { + "epoch": 14.708303761533003, + "grad_norm": 0.023603355512022972, + "learning_rate": 8.529808374733854e-05, + "loss": 0.010245384275913238, + "step": 103620 + }, + { + "epoch": 14.7097232079489, + "grad_norm": 2.9636645317077637, + "learning_rate": 8.529666430092265e-05, + "loss": 0.016753530502319335, + "step": 103630 + }, + { + "epoch": 14.711142654364798, + "grad_norm": 0.7192009091377258, + "learning_rate": 8.529524485450675e-05, + "loss": 0.030516594648361206, + "step": 103640 + }, + { + "epoch": 14.712562100780696, + "grad_norm": 0.4792954623699188, + "learning_rate": 8.529382540809085e-05, + "loss": 0.022221264243125916, + "step": 103650 + }, + { + "epoch": 14.713981547196592, + "grad_norm": 5.789441108703613, + "learning_rate": 8.529240596167494e-05, + "loss": 0.012740533053874969, + "step": 103660 + }, + { + "epoch": 14.71540099361249, + "grad_norm": 4.176222801208496, + "learning_rate": 8.529098651525905e-05, + "loss": 0.009820845723152161, + "step": 103670 + }, + { + "epoch": 14.716820440028389, + "grad_norm": 0.06941059231758118, + "learning_rate": 8.528956706884315e-05, + "loss": 0.013245144486427307, + "step": 103680 + }, + { + "epoch": 14.718239886444287, + "grad_norm": 7.595728397369385, + "learning_rate": 8.528814762242726e-05, + "loss": 0.04703215062618256, + "step": 103690 + }, + { + "epoch": 14.719659332860184, + "grad_norm": 0.09465227276086807, + "learning_rate": 8.528672817601136e-05, + "loss": 0.029085099697113037, + "step": 103700 + }, + { + "epoch": 14.721078779276082, + "grad_norm": 0.2000981569290161, + "learning_rate": 8.528530872959546e-05, + "loss": 0.008482570946216583, + "step": 103710 + }, + { + "epoch": 14.72249822569198, + "grad_norm": 9.368997573852539, + "learning_rate": 8.528388928317957e-05, + "loss": 0.01685175597667694, + "step": 103720 + }, + { + "epoch": 
14.723917672107877, + "grad_norm": 2.1341898441314697, + "learning_rate": 8.528246983676367e-05, + "loss": 0.010621855407953263, + "step": 103730 + }, + { + "epoch": 14.725337118523775, + "grad_norm": 4.980072021484375, + "learning_rate": 8.528105039034778e-05, + "loss": 0.04767651557922363, + "step": 103740 + }, + { + "epoch": 14.726756564939674, + "grad_norm": 8.494141578674316, + "learning_rate": 8.527963094393186e-05, + "loss": 0.007886487245559692, + "step": 103750 + }, + { + "epoch": 14.728176011355572, + "grad_norm": 0.6012231707572937, + "learning_rate": 8.527821149751597e-05, + "loss": 0.028650522232055664, + "step": 103760 + }, + { + "epoch": 14.729595457771469, + "grad_norm": 0.052750129252672195, + "learning_rate": 8.527679205110007e-05, + "loss": 0.011216586828231812, + "step": 103770 + }, + { + "epoch": 14.731014904187367, + "grad_norm": 0.11503159999847412, + "learning_rate": 8.527537260468418e-05, + "loss": 0.03377627432346344, + "step": 103780 + }, + { + "epoch": 14.732434350603265, + "grad_norm": 4.543156147003174, + "learning_rate": 8.527395315826828e-05, + "loss": 0.011570049822330475, + "step": 103790 + }, + { + "epoch": 14.733853797019162, + "grad_norm": 4.640334606170654, + "learning_rate": 8.527253371185239e-05, + "loss": 0.01897972822189331, + "step": 103800 + }, + { + "epoch": 14.73527324343506, + "grad_norm": 0.9010857939720154, + "learning_rate": 8.527111426543649e-05, + "loss": 0.009830554574728012, + "step": 103810 + }, + { + "epoch": 14.736692689850958, + "grad_norm": 16.031429290771484, + "learning_rate": 8.526969481902058e-05, + "loss": 0.023026317358016968, + "step": 103820 + }, + { + "epoch": 14.738112136266857, + "grad_norm": 0.9265277981758118, + "learning_rate": 8.52682753726047e-05, + "loss": 0.007235559821128845, + "step": 103830 + }, + { + "epoch": 14.739531582682753, + "grad_norm": 0.044633179903030396, + "learning_rate": 8.526685592618879e-05, + "loss": 0.07692786455154418, + "step": 103840 + }, + { + "epoch": 
14.740951029098651, + "grad_norm": 0.18914464116096497, + "learning_rate": 8.52654364797729e-05, + "loss": 0.042169079184532166, + "step": 103850 + }, + { + "epoch": 14.74237047551455, + "grad_norm": 8.333174705505371, + "learning_rate": 8.526401703335699e-05, + "loss": 0.032961970567703246, + "step": 103860 + }, + { + "epoch": 14.743789921930446, + "grad_norm": 1.251344084739685, + "learning_rate": 8.52625975869411e-05, + "loss": 0.016368305683135985, + "step": 103870 + }, + { + "epoch": 14.745209368346345, + "grad_norm": 3.432236433029175, + "learning_rate": 8.52611781405252e-05, + "loss": 0.049349617958068845, + "step": 103880 + }, + { + "epoch": 14.746628814762243, + "grad_norm": 0.08397255092859268, + "learning_rate": 8.52597586941093e-05, + "loss": 0.010075490176677703, + "step": 103890 + }, + { + "epoch": 14.748048261178141, + "grad_norm": 1.3642832040786743, + "learning_rate": 8.52583392476934e-05, + "loss": 0.011244092881679536, + "step": 103900 + }, + { + "epoch": 14.749467707594038, + "grad_norm": 2.000920295715332, + "learning_rate": 8.52569198012775e-05, + "loss": 0.012861920893192292, + "step": 103910 + }, + { + "epoch": 14.750887154009936, + "grad_norm": 0.006451313849538565, + "learning_rate": 8.525550035486161e-05, + "loss": 0.021595826745033263, + "step": 103920 + }, + { + "epoch": 14.752306600425834, + "grad_norm": 0.347922682762146, + "learning_rate": 8.525408090844571e-05, + "loss": 0.020030558109283447, + "step": 103930 + }, + { + "epoch": 14.75372604684173, + "grad_norm": 0.16533195972442627, + "learning_rate": 8.525266146202982e-05, + "loss": 0.012387216091156006, + "step": 103940 + }, + { + "epoch": 14.75514549325763, + "grad_norm": 0.17642009258270264, + "learning_rate": 8.525124201561392e-05, + "loss": 0.004976727813482284, + "step": 103950 + }, + { + "epoch": 14.756564939673527, + "grad_norm": 0.16800779104232788, + "learning_rate": 8.524982256919801e-05, + "loss": 0.005400242283940315, + "step": 103960 + }, + { + "epoch": 
14.757984386089426, + "grad_norm": 0.4111100435256958, + "learning_rate": 8.524840312278211e-05, + "loss": 0.044538623094558714, + "step": 103970 + }, + { + "epoch": 14.759403832505322, + "grad_norm": 7.620819091796875, + "learning_rate": 8.524698367636622e-05, + "loss": 0.03084087371826172, + "step": 103980 + }, + { + "epoch": 14.76082327892122, + "grad_norm": 0.06103351712226868, + "learning_rate": 8.524556422995032e-05, + "loss": 0.034201741218566895, + "step": 103990 + }, + { + "epoch": 14.762242725337119, + "grad_norm": 0.07585401087999344, + "learning_rate": 8.524414478353443e-05, + "loss": 0.01607244312763214, + "step": 104000 + }, + { + "epoch": 14.762242725337119, + "eval_accuracy": 0.9835950912443568, + "eval_loss": 0.05828787013888359, + "eval_runtime": 30.4152, + "eval_samples_per_second": 517.077, + "eval_steps_per_second": 16.176, + "step": 104000 + }, + { + "epoch": 14.763662171753015, + "grad_norm": 1.165725827217102, + "learning_rate": 8.524272533711853e-05, + "loss": 0.02569354772567749, + "step": 104010 + }, + { + "epoch": 14.765081618168914, + "grad_norm": 8.496212005615234, + "learning_rate": 8.524130589070263e-05, + "loss": 0.022003328800201415, + "step": 104020 + }, + { + "epoch": 14.766501064584812, + "grad_norm": 2.057748317718506, + "learning_rate": 8.523988644428674e-05, + "loss": 0.01967623233795166, + "step": 104030 + }, + { + "epoch": 14.76792051100071, + "grad_norm": 8.422942161560059, + "learning_rate": 8.523846699787083e-05, + "loss": 0.06901317238807678, + "step": 104040 + }, + { + "epoch": 14.769339957416607, + "grad_norm": 0.8119853734970093, + "learning_rate": 8.523704755145494e-05, + "loss": 0.024189670383930207, + "step": 104050 + }, + { + "epoch": 14.770759403832505, + "grad_norm": 0.35583141446113586, + "learning_rate": 8.523562810503903e-05, + "loss": 0.01172533631324768, + "step": 104060 + }, + { + "epoch": 14.772178850248403, + "grad_norm": 0.700731635093689, + "learning_rate": 8.523420865862314e-05, + "loss": 
0.01899768114089966, + "step": 104070 + }, + { + "epoch": 14.7735982966643, + "grad_norm": 0.21481211483478546, + "learning_rate": 8.523278921220724e-05, + "loss": 0.07336496114730835, + "step": 104080 + }, + { + "epoch": 14.775017743080198, + "grad_norm": 0.10782082378864288, + "learning_rate": 8.523136976579135e-05, + "loss": 0.07388808131217957, + "step": 104090 + }, + { + "epoch": 14.776437189496097, + "grad_norm": 12.216814994812012, + "learning_rate": 8.522995031937545e-05, + "loss": 0.0686156153678894, + "step": 104100 + }, + { + "epoch": 14.777856635911995, + "grad_norm": 0.35206928849220276, + "learning_rate": 8.522853087295954e-05, + "loss": 0.021819183230400087, + "step": 104110 + }, + { + "epoch": 14.779276082327891, + "grad_norm": 2.7776386737823486, + "learning_rate": 8.522711142654365e-05, + "loss": 0.06748580932617188, + "step": 104120 + }, + { + "epoch": 14.78069552874379, + "grad_norm": 2.451956033706665, + "learning_rate": 8.522569198012775e-05, + "loss": 0.012218570709228516, + "step": 104130 + }, + { + "epoch": 14.782114975159688, + "grad_norm": 1.476254940032959, + "learning_rate": 8.522427253371186e-05, + "loss": 0.01046212911605835, + "step": 104140 + }, + { + "epoch": 14.783534421575585, + "grad_norm": 0.10849824547767639, + "learning_rate": 8.522285308729596e-05, + "loss": 0.017551289498806, + "step": 104150 + }, + { + "epoch": 14.784953867991483, + "grad_norm": 9.550311088562012, + "learning_rate": 8.522143364088007e-05, + "loss": 0.02532818615436554, + "step": 104160 + }, + { + "epoch": 14.786373314407381, + "grad_norm": 0.09233774244785309, + "learning_rate": 8.522001419446415e-05, + "loss": 0.02874256670475006, + "step": 104170 + }, + { + "epoch": 14.78779276082328, + "grad_norm": 0.6827500462532043, + "learning_rate": 8.521859474804826e-05, + "loss": 0.012810790538787841, + "step": 104180 + }, + { + "epoch": 14.789212207239176, + "grad_norm": 7.668457984924316, + "learning_rate": 8.521717530163236e-05, + "loss": 0.0762434184551239, + 
"step": 104190 + }, + { + "epoch": 14.790631653655074, + "grad_norm": 0.3374784588813782, + "learning_rate": 8.521575585521647e-05, + "loss": 0.006024296581745148, + "step": 104200 + }, + { + "epoch": 14.792051100070973, + "grad_norm": 5.660341262817383, + "learning_rate": 8.521433640880057e-05, + "loss": 0.04634143710136414, + "step": 104210 + }, + { + "epoch": 14.79347054648687, + "grad_norm": 3.8380041122436523, + "learning_rate": 8.521291696238467e-05, + "loss": 0.05055670738220215, + "step": 104220 + }, + { + "epoch": 14.794889992902768, + "grad_norm": 6.826344966888428, + "learning_rate": 8.521149751596878e-05, + "loss": 0.020168723165988924, + "step": 104230 + }, + { + "epoch": 14.796309439318666, + "grad_norm": 0.6084926128387451, + "learning_rate": 8.521007806955288e-05, + "loss": 0.005605394020676613, + "step": 104240 + }, + { + "epoch": 14.797728885734564, + "grad_norm": 0.4939149022102356, + "learning_rate": 8.520865862313699e-05, + "loss": 0.04106263220310211, + "step": 104250 + }, + { + "epoch": 14.79914833215046, + "grad_norm": 0.3379480242729187, + "learning_rate": 8.520723917672108e-05, + "loss": 0.038764914870262145, + "step": 104260 + }, + { + "epoch": 14.800567778566359, + "grad_norm": 0.012404486536979675, + "learning_rate": 8.520581973030518e-05, + "loss": 0.01627178639173508, + "step": 104270 + }, + { + "epoch": 14.801987224982257, + "grad_norm": 0.01644447073340416, + "learning_rate": 8.520440028388928e-05, + "loss": 0.06491001844406127, + "step": 104280 + }, + { + "epoch": 14.803406671398154, + "grad_norm": 0.019314207136631012, + "learning_rate": 8.520298083747339e-05, + "loss": 0.025806555151939393, + "step": 104290 + }, + { + "epoch": 14.804826117814052, + "grad_norm": 5.47023868560791, + "learning_rate": 8.520156139105749e-05, + "loss": 0.044958552718162535, + "step": 104300 + }, + { + "epoch": 14.80624556422995, + "grad_norm": 3.8105883598327637, + "learning_rate": 8.52001419446416e-05, + "loss": 0.026090803742408752, + "step": 104310 
+ }, + { + "epoch": 14.807665010645849, + "grad_norm": 9.174649238586426, + "learning_rate": 8.51987224982257e-05, + "loss": 0.03889913856983185, + "step": 104320 + }, + { + "epoch": 14.809084457061745, + "grad_norm": 7.475338459014893, + "learning_rate": 8.51973030518098e-05, + "loss": 0.035494810342788695, + "step": 104330 + }, + { + "epoch": 14.810503903477644, + "grad_norm": 4.019817352294922, + "learning_rate": 8.51958836053939e-05, + "loss": 0.009954603016376495, + "step": 104340 + }, + { + "epoch": 14.811923349893542, + "grad_norm": 0.2154158651828766, + "learning_rate": 8.5194464158978e-05, + "loss": 0.028025662899017333, + "step": 104350 + }, + { + "epoch": 14.813342796309438, + "grad_norm": 0.098808154463768, + "learning_rate": 8.519304471256211e-05, + "loss": 0.013450905680656433, + "step": 104360 + }, + { + "epoch": 14.814762242725337, + "grad_norm": 1.249442458152771, + "learning_rate": 8.51916252661462e-05, + "loss": 0.014495487511157989, + "step": 104370 + }, + { + "epoch": 14.816181689141235, + "grad_norm": 7.714137554168701, + "learning_rate": 8.519020581973031e-05, + "loss": 0.03295493721961975, + "step": 104380 + }, + { + "epoch": 14.817601135557133, + "grad_norm": 1.7971025705337524, + "learning_rate": 8.51887863733144e-05, + "loss": 0.03349592685699463, + "step": 104390 + }, + { + "epoch": 14.81902058197303, + "grad_norm": 3.5626816749572754, + "learning_rate": 8.518736692689852e-05, + "loss": 0.010706099867820739, + "step": 104400 + }, + { + "epoch": 14.820440028388928, + "grad_norm": 0.09448494762182236, + "learning_rate": 8.518594748048261e-05, + "loss": 0.021884602308273316, + "step": 104410 + }, + { + "epoch": 14.821859474804826, + "grad_norm": 0.14466921985149384, + "learning_rate": 8.518452803406671e-05, + "loss": 0.00417226180434227, + "step": 104420 + }, + { + "epoch": 14.823278921220723, + "grad_norm": 0.17145518958568573, + "learning_rate": 8.518310858765082e-05, + "loss": 0.006554578989744186, + "step": 104430 + }, + { + "epoch": 
14.824698367636621, + "grad_norm": 0.002625130582600832, + "learning_rate": 8.518168914123492e-05, + "loss": 0.009584589302539826, + "step": 104440 + }, + { + "epoch": 14.82611781405252, + "grad_norm": 0.03746289014816284, + "learning_rate": 8.518026969481903e-05, + "loss": 0.037434864044189456, + "step": 104450 + }, + { + "epoch": 14.827537260468418, + "grad_norm": 11.766426086425781, + "learning_rate": 8.517885024840313e-05, + "loss": 0.031741362810134885, + "step": 104460 + }, + { + "epoch": 14.828956706884314, + "grad_norm": 0.08067937195301056, + "learning_rate": 8.517743080198722e-05, + "loss": 0.015521128475666047, + "step": 104470 + }, + { + "epoch": 14.830376153300213, + "grad_norm": 0.03898005187511444, + "learning_rate": 8.517601135557132e-05, + "loss": 0.0194305419921875, + "step": 104480 + }, + { + "epoch": 14.831795599716111, + "grad_norm": 0.11849980056285858, + "learning_rate": 8.517459190915543e-05, + "loss": 0.005333187431097031, + "step": 104490 + }, + { + "epoch": 14.833215046132008, + "grad_norm": 0.06989427655935287, + "learning_rate": 8.517317246273953e-05, + "loss": 0.026983022689819336, + "step": 104500 + }, + { + "epoch": 14.833215046132008, + "eval_accuracy": 0.9844216951739048, + "eval_loss": 0.056505098938941956, + "eval_runtime": 31.0822, + "eval_samples_per_second": 505.98, + "eval_steps_per_second": 15.829, + "step": 104500 + }, + { + "epoch": 14.834634492547906, + "grad_norm": 2.388498544692993, + "learning_rate": 8.517175301632364e-05, + "loss": 0.03193688988685608, + "step": 104510 + }, + { + "epoch": 14.836053938963804, + "grad_norm": 0.0659419372677803, + "learning_rate": 8.517033356990775e-05, + "loss": 0.0162614181637764, + "step": 104520 + }, + { + "epoch": 14.837473385379703, + "grad_norm": 4.265310287475586, + "learning_rate": 8.516891412349184e-05, + "loss": 0.027747917175292968, + "step": 104530 + }, + { + "epoch": 14.838892831795599, + "grad_norm": 5.0632195472717285, + "learning_rate": 8.516749467707595e-05, + "loss": 
0.0163781076669693, + "step": 104540 + }, + { + "epoch": 14.840312278211497, + "grad_norm": 0.06357064843177795, + "learning_rate": 8.516607523066004e-05, + "loss": 0.0017443560063838959, + "step": 104550 + }, + { + "epoch": 14.841731724627396, + "grad_norm": 0.15850457549095154, + "learning_rate": 8.516465578424415e-05, + "loss": 0.025368887186050414, + "step": 104560 + }, + { + "epoch": 14.843151171043292, + "grad_norm": 0.29624274373054504, + "learning_rate": 8.516323633782825e-05, + "loss": 0.014364491403102874, + "step": 104570 + }, + { + "epoch": 14.84457061745919, + "grad_norm": 6.2883524894714355, + "learning_rate": 8.516181689141235e-05, + "loss": 0.020932359993457793, + "step": 104580 + }, + { + "epoch": 14.845990063875089, + "grad_norm": 1.254381775856018, + "learning_rate": 8.516039744499645e-05, + "loss": 0.011461752653121948, + "step": 104590 + }, + { + "epoch": 14.847409510290987, + "grad_norm": 6.394966125488281, + "learning_rate": 8.515897799858056e-05, + "loss": 0.03518040776252747, + "step": 104600 + }, + { + "epoch": 14.848828956706884, + "grad_norm": 17.94300651550293, + "learning_rate": 8.515755855216467e-05, + "loss": 0.051870590448379515, + "step": 104610 + }, + { + "epoch": 14.850248403122782, + "grad_norm": 1.1413216590881348, + "learning_rate": 8.515613910574877e-05, + "loss": 0.08706502914428711, + "step": 104620 + }, + { + "epoch": 14.85166784953868, + "grad_norm": 0.050446055829524994, + "learning_rate": 8.515471965933286e-05, + "loss": 0.08013435006141663, + "step": 104630 + }, + { + "epoch": 14.853087295954577, + "grad_norm": 1.871097445487976, + "learning_rate": 8.515330021291696e-05, + "loss": 0.02684931755065918, + "step": 104640 + }, + { + "epoch": 14.854506742370475, + "grad_norm": 0.2473142445087433, + "learning_rate": 8.515188076650107e-05, + "loss": 0.012901613116264343, + "step": 104650 + }, + { + "epoch": 14.855926188786373, + "grad_norm": 10.848540306091309, + "learning_rate": 8.515046132008517e-05, + "loss": 
0.017069482803344728, + "step": 104660 + }, + { + "epoch": 14.857345635202272, + "grad_norm": 1.1023898124694824, + "learning_rate": 8.514904187366928e-05, + "loss": 0.02099357098340988, + "step": 104670 + }, + { + "epoch": 14.858765081618168, + "grad_norm": 7.283102512359619, + "learning_rate": 8.514762242725336e-05, + "loss": 0.026400291919708253, + "step": 104680 + }, + { + "epoch": 14.860184528034067, + "grad_norm": 0.23161716759204865, + "learning_rate": 8.514620298083747e-05, + "loss": 0.007943221926689148, + "step": 104690 + }, + { + "epoch": 14.861603974449965, + "grad_norm": 0.08439341187477112, + "learning_rate": 8.514478353442159e-05, + "loss": 0.016894285380840302, + "step": 104700 + }, + { + "epoch": 14.863023420865863, + "grad_norm": 0.10445648431777954, + "learning_rate": 8.514336408800568e-05, + "loss": 0.02399568408727646, + "step": 104710 + }, + { + "epoch": 14.86444286728176, + "grad_norm": 2.852975845336914, + "learning_rate": 8.51419446415898e-05, + "loss": 0.028228983283042908, + "step": 104720 + }, + { + "epoch": 14.865862313697658, + "grad_norm": 0.03454967215657234, + "learning_rate": 8.514052519517388e-05, + "loss": 0.009380853176116944, + "step": 104730 + }, + { + "epoch": 14.867281760113556, + "grad_norm": 2.4463937282562256, + "learning_rate": 8.513910574875799e-05, + "loss": 0.02598130702972412, + "step": 104740 + }, + { + "epoch": 14.868701206529453, + "grad_norm": 4.604282379150391, + "learning_rate": 8.513768630234209e-05, + "loss": 0.005938100814819336, + "step": 104750 + }, + { + "epoch": 14.870120652945351, + "grad_norm": 0.04022074490785599, + "learning_rate": 8.51362668559262e-05, + "loss": 0.031351137161254886, + "step": 104760 + }, + { + "epoch": 14.87154009936125, + "grad_norm": 15.1404390335083, + "learning_rate": 8.51348474095103e-05, + "loss": 0.04066168069839478, + "step": 104770 + }, + { + "epoch": 14.872959545777148, + "grad_norm": 15.48460578918457, + "learning_rate": 8.513342796309439e-05, + "loss": 
0.017590782046318053, + "step": 104780 + }, + { + "epoch": 14.874378992193044, + "grad_norm": 0.7865493893623352, + "learning_rate": 8.51320085166785e-05, + "loss": 0.017658084630966187, + "step": 104790 + }, + { + "epoch": 14.875798438608943, + "grad_norm": 0.7198042273521423, + "learning_rate": 8.51305890702626e-05, + "loss": 0.017467735707759856, + "step": 104800 + }, + { + "epoch": 14.87721788502484, + "grad_norm": 5.571759223937988, + "learning_rate": 8.512916962384671e-05, + "loss": 0.018814486265182496, + "step": 104810 + }, + { + "epoch": 14.878637331440737, + "grad_norm": 9.609874725341797, + "learning_rate": 8.512775017743081e-05, + "loss": 0.010202006250619889, + "step": 104820 + }, + { + "epoch": 14.880056777856636, + "grad_norm": 12.367914199829102, + "learning_rate": 8.51263307310149e-05, + "loss": 0.011622205376625061, + "step": 104830 + }, + { + "epoch": 14.881476224272534, + "grad_norm": 2.200284004211426, + "learning_rate": 8.5124911284599e-05, + "loss": 0.012384110689163208, + "step": 104840 + }, + { + "epoch": 14.882895670688432, + "grad_norm": 0.00814757589250803, + "learning_rate": 8.512349183818311e-05, + "loss": 0.03865731358528137, + "step": 104850 + }, + { + "epoch": 14.884315117104329, + "grad_norm": 3.8396785259246826, + "learning_rate": 8.512207239176721e-05, + "loss": 0.018626007437705993, + "step": 104860 + }, + { + "epoch": 14.885734563520227, + "grad_norm": 0.9540119171142578, + "learning_rate": 8.512065294535132e-05, + "loss": 0.010299560427665711, + "step": 104870 + }, + { + "epoch": 14.887154009936125, + "grad_norm": 0.29044824838638306, + "learning_rate": 8.511923349893542e-05, + "loss": 0.0174009770154953, + "step": 104880 + }, + { + "epoch": 14.888573456352022, + "grad_norm": 0.9899982213973999, + "learning_rate": 8.511781405251952e-05, + "loss": 0.003324838727712631, + "step": 104890 + }, + { + "epoch": 14.88999290276792, + "grad_norm": 0.12451029568910599, + "learning_rate": 8.511639460610363e-05, + "loss": 
0.01388794630765915, + "step": 104900 + }, + { + "epoch": 14.891412349183819, + "grad_norm": 0.10587727278470993, + "learning_rate": 8.511497515968773e-05, + "loss": 0.02231362909078598, + "step": 104910 + }, + { + "epoch": 14.892831795599717, + "grad_norm": 0.5449631810188293, + "learning_rate": 8.511355571327184e-05, + "loss": 0.008616983890533447, + "step": 104920 + }, + { + "epoch": 14.894251242015613, + "grad_norm": 4.594874858856201, + "learning_rate": 8.511213626685593e-05, + "loss": 0.039649614691734315, + "step": 104930 + }, + { + "epoch": 14.895670688431512, + "grad_norm": 0.43790388107299805, + "learning_rate": 8.511071682044003e-05, + "loss": 0.033021801710128786, + "step": 104940 + }, + { + "epoch": 14.89709013484741, + "grad_norm": 9.739229202270508, + "learning_rate": 8.510929737402413e-05, + "loss": 0.028501474857330324, + "step": 104950 + }, + { + "epoch": 14.898509581263307, + "grad_norm": 0.18172407150268555, + "learning_rate": 8.510787792760824e-05, + "loss": 0.05176345109939575, + "step": 104960 + }, + { + "epoch": 14.899929027679205, + "grad_norm": 2.6929240226745605, + "learning_rate": 8.510645848119234e-05, + "loss": 0.005494722351431847, + "step": 104970 + }, + { + "epoch": 14.901348474095103, + "grad_norm": 0.0787600576877594, + "learning_rate": 8.510503903477645e-05, + "loss": 0.007409898936748505, + "step": 104980 + }, + { + "epoch": 14.902767920511002, + "grad_norm": 0.07710441201925278, + "learning_rate": 8.510361958836055e-05, + "loss": 0.0217500239610672, + "step": 104990 + }, + { + "epoch": 14.904187366926898, + "grad_norm": 1.6725579500198364, + "learning_rate": 8.510220014194464e-05, + "loss": 0.007238331437110901, + "step": 105000 + }, + { + "epoch": 14.904187366926898, + "eval_accuracy": 0.986011318115343, + "eval_loss": 0.04534267634153366, + "eval_runtime": 33.5028, + "eval_samples_per_second": 469.423, + "eval_steps_per_second": 14.685, + "step": 105000 + }, + { + "epoch": 14.905606813342796, + "grad_norm": 
0.47815820574760437, + "learning_rate": 8.510078069552875e-05, + "loss": 0.01293196976184845, + "step": 105010 + }, + { + "epoch": 14.907026259758695, + "grad_norm": 0.06179656460881233, + "learning_rate": 8.509936124911285e-05, + "loss": 0.01665736585855484, + "step": 105020 + }, + { + "epoch": 14.908445706174591, + "grad_norm": 0.7428871393203735, + "learning_rate": 8.509794180269696e-05, + "loss": 0.014929351210594178, + "step": 105030 + }, + { + "epoch": 14.90986515259049, + "grad_norm": 0.4426209330558777, + "learning_rate": 8.509652235628105e-05, + "loss": 0.06493237614631653, + "step": 105040 + }, + { + "epoch": 14.911284599006388, + "grad_norm": 0.8456557393074036, + "learning_rate": 8.509510290986516e-05, + "loss": 0.0359197735786438, + "step": 105050 + }, + { + "epoch": 14.912704045422286, + "grad_norm": 2.24787974357605, + "learning_rate": 8.509368346344925e-05, + "loss": 0.008549968898296356, + "step": 105060 + }, + { + "epoch": 14.914123491838183, + "grad_norm": 0.15779711306095123, + "learning_rate": 8.509226401703336e-05, + "loss": 0.017723798751831055, + "step": 105070 + }, + { + "epoch": 14.915542938254081, + "grad_norm": 9.117578506469727, + "learning_rate": 8.509084457061746e-05, + "loss": 0.029659080505371093, + "step": 105080 + }, + { + "epoch": 14.91696238466998, + "grad_norm": 13.629226684570312, + "learning_rate": 8.508942512420156e-05, + "loss": 0.027601355314254762, + "step": 105090 + }, + { + "epoch": 14.918381831085876, + "grad_norm": 0.10228786617517471, + "learning_rate": 8.508800567778567e-05, + "loss": 0.05371737480163574, + "step": 105100 + }, + { + "epoch": 14.919801277501774, + "grad_norm": 0.003000596771016717, + "learning_rate": 8.508658623136977e-05, + "loss": 0.003889523446559906, + "step": 105110 + }, + { + "epoch": 14.921220723917672, + "grad_norm": 0.06478263437747955, + "learning_rate": 8.508530872959545e-05, + "loss": 0.08041570186614991, + "step": 105120 + }, + { + "epoch": 14.92264017033357, + "grad_norm": 
0.06725726276636124, + "learning_rate": 8.508388928317956e-05, + "loss": 0.03126749694347382, + "step": 105130 + }, + { + "epoch": 14.924059616749467, + "grad_norm": 0.15821486711502075, + "learning_rate": 8.508246983676366e-05, + "loss": 0.006352770328521729, + "step": 105140 + }, + { + "epoch": 14.925479063165366, + "grad_norm": 4.99887752532959, + "learning_rate": 8.508105039034777e-05, + "loss": 0.00685601457953453, + "step": 105150 + }, + { + "epoch": 14.926898509581264, + "grad_norm": 0.07320870459079742, + "learning_rate": 8.507963094393187e-05, + "loss": 0.04207580983638763, + "step": 105160 + }, + { + "epoch": 14.92831795599716, + "grad_norm": 3.1809349060058594, + "learning_rate": 8.507821149751597e-05, + "loss": 0.021016967296600342, + "step": 105170 + }, + { + "epoch": 14.929737402413059, + "grad_norm": 0.059679560363292694, + "learning_rate": 8.507679205110008e-05, + "loss": 0.037418097257614136, + "step": 105180 + }, + { + "epoch": 14.931156848828957, + "grad_norm": 0.04984293133020401, + "learning_rate": 8.507537260468418e-05, + "loss": 0.010998521745204926, + "step": 105190 + }, + { + "epoch": 14.932576295244855, + "grad_norm": 0.038183312863111496, + "learning_rate": 8.507395315826829e-05, + "loss": 0.00536215603351593, + "step": 105200 + }, + { + "epoch": 14.933995741660752, + "grad_norm": 0.20261302590370178, + "learning_rate": 8.507253371185238e-05, + "loss": 0.013916152715682983, + "step": 105210 + }, + { + "epoch": 14.93541518807665, + "grad_norm": 21.86165428161621, + "learning_rate": 8.507111426543648e-05, + "loss": 0.04513751268386841, + "step": 105220 + }, + { + "epoch": 14.936834634492548, + "grad_norm": 0.756188690662384, + "learning_rate": 8.506969481902058e-05, + "loss": 0.019983695447444917, + "step": 105230 + }, + { + "epoch": 14.938254080908445, + "grad_norm": 0.05859207734465599, + "learning_rate": 8.506827537260469e-05, + "loss": 0.019867606461048126, + "step": 105240 + }, + { + "epoch": 14.939673527324343, + "grad_norm": 
0.04819387570023537, + "learning_rate": 8.506685592618879e-05, + "loss": 0.01914552301168442, + "step": 105250 + }, + { + "epoch": 14.941092973740242, + "grad_norm": 0.14617569744586945, + "learning_rate": 8.50654364797729e-05, + "loss": 0.026956775784492494, + "step": 105260 + }, + { + "epoch": 14.94251242015614, + "grad_norm": 0.10384272783994675, + "learning_rate": 8.5064017033357e-05, + "loss": 0.01044590026140213, + "step": 105270 + }, + { + "epoch": 14.943931866572036, + "grad_norm": 0.032726746052503586, + "learning_rate": 8.506259758694109e-05, + "loss": 0.009355773031711579, + "step": 105280 + }, + { + "epoch": 14.945351312987935, + "grad_norm": 0.22472988069057465, + "learning_rate": 8.50611781405252e-05, + "loss": 0.016062158346176147, + "step": 105290 + }, + { + "epoch": 14.946770759403833, + "grad_norm": 0.8198087811470032, + "learning_rate": 8.50597586941093e-05, + "loss": 0.0025589760392904282, + "step": 105300 + }, + { + "epoch": 14.94819020581973, + "grad_norm": 3.6993165016174316, + "learning_rate": 8.505833924769341e-05, + "loss": 0.005632463097572327, + "step": 105310 + }, + { + "epoch": 14.949609652235628, + "grad_norm": 1.08851957321167, + "learning_rate": 8.50569198012775e-05, + "loss": 0.06681792140007019, + "step": 105320 + }, + { + "epoch": 14.951029098651526, + "grad_norm": 3.3110740184783936, + "learning_rate": 8.50555003548616e-05, + "loss": 0.007581328600645065, + "step": 105330 + }, + { + "epoch": 14.952448545067424, + "grad_norm": 0.3741854429244995, + "learning_rate": 8.50540809084457e-05, + "loss": 0.022600403428077696, + "step": 105340 + }, + { + "epoch": 14.953867991483321, + "grad_norm": 0.7727049589157104, + "learning_rate": 8.505266146202981e-05, + "loss": 0.010692685097455978, + "step": 105350 + }, + { + "epoch": 14.95528743789922, + "grad_norm": 0.1877894401550293, + "learning_rate": 8.505124201561393e-05, + "loss": 0.017357349395751953, + "step": 105360 + }, + { + "epoch": 14.956706884315118, + "grad_norm": 
9.413021087646484, + "learning_rate": 8.504982256919801e-05, + "loss": 0.05534917116165161, + "step": 105370 + }, + { + "epoch": 14.958126330731014, + "grad_norm": 5.803071975708008, + "learning_rate": 8.504840312278212e-05, + "loss": 0.019386471807956697, + "step": 105380 + }, + { + "epoch": 14.959545777146912, + "grad_norm": 0.6982179880142212, + "learning_rate": 8.504698367636622e-05, + "loss": 0.011141490936279298, + "step": 105390 + }, + { + "epoch": 14.96096522356281, + "grad_norm": 0.8867304921150208, + "learning_rate": 8.504556422995033e-05, + "loss": 0.019689857959747314, + "step": 105400 + }, + { + "epoch": 14.962384669978709, + "grad_norm": 13.065898895263672, + "learning_rate": 8.504414478353443e-05, + "loss": 0.05626608729362488, + "step": 105410 + }, + { + "epoch": 14.963804116394606, + "grad_norm": 4.895026206970215, + "learning_rate": 8.504272533711852e-05, + "loss": 0.026846492290496828, + "step": 105420 + }, + { + "epoch": 14.965223562810504, + "grad_norm": 5.024219989776611, + "learning_rate": 8.504130589070262e-05, + "loss": 0.01723182052373886, + "step": 105430 + }, + { + "epoch": 14.966643009226402, + "grad_norm": 6.151825904846191, + "learning_rate": 8.503988644428673e-05, + "loss": 0.028924581408500672, + "step": 105440 + }, + { + "epoch": 14.968062455642299, + "grad_norm": 0.17709432542324066, + "learning_rate": 8.503846699787084e-05, + "loss": 0.03688434362411499, + "step": 105450 + }, + { + "epoch": 14.969481902058197, + "grad_norm": 5.439521789550781, + "learning_rate": 8.503704755145494e-05, + "loss": 0.023295867443084716, + "step": 105460 + }, + { + "epoch": 14.970901348474095, + "grad_norm": 0.4720461964607239, + "learning_rate": 8.503562810503904e-05, + "loss": 0.018590964376926422, + "step": 105470 + }, + { + "epoch": 14.972320794889994, + "grad_norm": 8.585654258728027, + "learning_rate": 8.503420865862313e-05, + "loss": 0.05288488268852234, + "step": 105480 + }, + { + "epoch": 14.97374024130589, + "grad_norm": 4.889590263366699, + 
"learning_rate": 8.503278921220725e-05, + "loss": 0.01930091977119446, + "step": 105490 + }, + { + "epoch": 14.975159687721789, + "grad_norm": 0.030981246381998062, + "learning_rate": 8.503136976579134e-05, + "loss": 0.02204904705286026, + "step": 105500 + }, + { + "epoch": 14.975159687721789, + "eval_accuracy": 0.983849430914987, + "eval_loss": 0.0565863735973835, + "eval_runtime": 30.8719, + "eval_samples_per_second": 509.428, + "eval_steps_per_second": 15.937, + "step": 105500 + }, + { + "epoch": 14.976579134137687, + "grad_norm": 1.1705716848373413, + "learning_rate": 8.502995031937545e-05, + "loss": 0.008624742925167083, + "step": 105510 + }, + { + "epoch": 14.977998580553583, + "grad_norm": 0.7375067472457886, + "learning_rate": 8.502853087295955e-05, + "loss": 0.02464774250984192, + "step": 105520 + }, + { + "epoch": 14.979418026969482, + "grad_norm": 0.8426643013954163, + "learning_rate": 8.502711142654365e-05, + "loss": 0.060261762142181395, + "step": 105530 + }, + { + "epoch": 14.98083747338538, + "grad_norm": 4.958010673522949, + "learning_rate": 8.502569198012776e-05, + "loss": 0.008713224530220031, + "step": 105540 + }, + { + "epoch": 14.982256919801278, + "grad_norm": 0.7848474979400635, + "learning_rate": 8.502427253371186e-05, + "loss": 0.025871086120605468, + "step": 105550 + }, + { + "epoch": 14.983676366217175, + "grad_norm": 6.699347972869873, + "learning_rate": 8.502285308729597e-05, + "loss": 0.0266726553440094, + "step": 105560 + }, + { + "epoch": 14.985095812633073, + "grad_norm": 10.014344215393066, + "learning_rate": 8.502143364088007e-05, + "loss": 0.02668716311454773, + "step": 105570 + }, + { + "epoch": 14.986515259048971, + "grad_norm": 2.3931314945220947, + "learning_rate": 8.502001419446416e-05, + "loss": 0.02140100598335266, + "step": 105580 + }, + { + "epoch": 14.987934705464868, + "grad_norm": 0.6117798089981079, + "learning_rate": 8.501859474804826e-05, + "loss": 0.007049883902072907, + "step": 105590 + }, + { + "epoch": 
14.989354151880766, + "grad_norm": 12.927014350891113, + "learning_rate": 8.501717530163237e-05, + "loss": 0.048466211557388304, + "step": 105600 + }, + { + "epoch": 14.990773598296665, + "grad_norm": 7.8452630043029785, + "learning_rate": 8.501575585521647e-05, + "loss": 0.014284107089042663, + "step": 105610 + }, + { + "epoch": 14.992193044712563, + "grad_norm": 0.233725905418396, + "learning_rate": 8.501433640880058e-05, + "loss": 0.04717913269996643, + "step": 105620 + }, + { + "epoch": 14.99361249112846, + "grad_norm": 0.1834932565689087, + "learning_rate": 8.501291696238468e-05, + "loss": 0.03222338557243347, + "step": 105630 + }, + { + "epoch": 14.995031937544358, + "grad_norm": 6.620406150817871, + "learning_rate": 8.501149751596877e-05, + "loss": 0.04589863419532776, + "step": 105640 + }, + { + "epoch": 14.996451383960256, + "grad_norm": 0.19498220086097717, + "learning_rate": 8.501007806955288e-05, + "loss": 0.0016890153288841247, + "step": 105650 + }, + { + "epoch": 14.997870830376153, + "grad_norm": 0.051954738795757294, + "learning_rate": 8.500865862313698e-05, + "loss": 0.01796208769083023, + "step": 105660 + }, + { + "epoch": 14.99929027679205, + "grad_norm": 0.3377091586589813, + "learning_rate": 8.50072391767211e-05, + "loss": 0.027283614873886107, + "step": 105670 + }, + { + "epoch": 15.00070972320795, + "grad_norm": 9.270378112792969, + "learning_rate": 8.500581973030518e-05, + "loss": 0.030120304226875304, + "step": 105680 + }, + { + "epoch": 15.002129169623847, + "grad_norm": 5.4814229011535645, + "learning_rate": 8.500440028388929e-05, + "loss": 0.025646063685417175, + "step": 105690 + }, + { + "epoch": 15.003548616039744, + "grad_norm": 0.20738860964775085, + "learning_rate": 8.500298083747339e-05, + "loss": 0.03865569829940796, + "step": 105700 + }, + { + "epoch": 15.004968062455642, + "grad_norm": 0.10520824044942856, + "learning_rate": 8.50015613910575e-05, + "loss": 0.010026749223470688, + "step": 105710 + }, + { + "epoch": 
15.00638750887154, + "grad_norm": 10.182268142700195, + "learning_rate": 8.50001419446416e-05, + "loss": 0.023018835484981535, + "step": 105720 + }, + { + "epoch": 15.007806955287437, + "grad_norm": 0.039473164826631546, + "learning_rate": 8.499872249822569e-05, + "loss": 0.023001056909561158, + "step": 105730 + }, + { + "epoch": 15.009226401703335, + "grad_norm": 0.15117046236991882, + "learning_rate": 8.49973030518098e-05, + "loss": 0.0272275447845459, + "step": 105740 + }, + { + "epoch": 15.010645848119234, + "grad_norm": 2.397531747817993, + "learning_rate": 8.49958836053939e-05, + "loss": 0.03603595197200775, + "step": 105750 + }, + { + "epoch": 15.012065294535132, + "grad_norm": 0.23408842086791992, + "learning_rate": 8.499446415897801e-05, + "loss": 0.004264497011899948, + "step": 105760 + }, + { + "epoch": 15.013484740951029, + "grad_norm": 13.141559600830078, + "learning_rate": 8.499304471256211e-05, + "loss": 0.04609453082084656, + "step": 105770 + }, + { + "epoch": 15.014904187366927, + "grad_norm": 7.471634864807129, + "learning_rate": 8.49916252661462e-05, + "loss": 0.03790695369243622, + "step": 105780 + }, + { + "epoch": 15.016323633782825, + "grad_norm": 8.660481452941895, + "learning_rate": 8.49902058197303e-05, + "loss": 0.014696374535560608, + "step": 105790 + }, + { + "epoch": 15.017743080198722, + "grad_norm": 3.2237870693206787, + "learning_rate": 8.498878637331441e-05, + "loss": 0.03969519138336182, + "step": 105800 + }, + { + "epoch": 15.01916252661462, + "grad_norm": 7.434295177459717, + "learning_rate": 8.498736692689851e-05, + "loss": 0.06669918894767761, + "step": 105810 + }, + { + "epoch": 15.020581973030518, + "grad_norm": 0.9601563215255737, + "learning_rate": 8.498594748048262e-05, + "loss": 0.018045979738235473, + "step": 105820 + }, + { + "epoch": 15.022001419446417, + "grad_norm": 0.553522527217865, + "learning_rate": 8.498452803406672e-05, + "loss": 0.004522566124796868, + "step": 105830 + }, + { + "epoch": 15.023420865862313, + 
"grad_norm": 0.238037571310997, + "learning_rate": 8.498310858765082e-05, + "loss": 0.006286618113517761, + "step": 105840 + }, + { + "epoch": 15.024840312278211, + "grad_norm": 0.02164604514837265, + "learning_rate": 8.498168914123493e-05, + "loss": 0.06347473859786987, + "step": 105850 + }, + { + "epoch": 15.02625975869411, + "grad_norm": 6.417102336883545, + "learning_rate": 8.498026969481902e-05, + "loss": 0.03233658075332642, + "step": 105860 + }, + { + "epoch": 15.027679205110006, + "grad_norm": 0.07468276470899582, + "learning_rate": 8.497885024840314e-05, + "loss": 0.01961416006088257, + "step": 105870 + }, + { + "epoch": 15.029098651525905, + "grad_norm": 0.08328916877508163, + "learning_rate": 8.497743080198723e-05, + "loss": 0.02036486268043518, + "step": 105880 + }, + { + "epoch": 15.030518097941803, + "grad_norm": 0.13861192762851715, + "learning_rate": 8.497601135557133e-05, + "loss": 0.016381070017814636, + "step": 105890 + }, + { + "epoch": 15.031937544357701, + "grad_norm": 0.012162907980382442, + "learning_rate": 8.497459190915543e-05, + "loss": 0.025388899445533752, + "step": 105900 + }, + { + "epoch": 15.033356990773598, + "grad_norm": 0.12118402123451233, + "learning_rate": 8.497317246273954e-05, + "loss": 0.014161121845245362, + "step": 105910 + }, + { + "epoch": 15.034776437189496, + "grad_norm": 0.8070512413978577, + "learning_rate": 8.497175301632364e-05, + "loss": 0.005332186818122864, + "step": 105920 + }, + { + "epoch": 15.036195883605394, + "grad_norm": 1.0391985177993774, + "learning_rate": 8.497033356990775e-05, + "loss": 0.015077903866767883, + "step": 105930 + }, + { + "epoch": 15.037615330021291, + "grad_norm": 0.5412289500236511, + "learning_rate": 8.496891412349184e-05, + "loss": 0.003560176119208336, + "step": 105940 + }, + { + "epoch": 15.03903477643719, + "grad_norm": 0.17096680402755737, + "learning_rate": 8.496749467707594e-05, + "loss": 0.004259506985545158, + "step": 105950 + }, + { + "epoch": 15.040454222853088, + 
"grad_norm": 0.10446897894144058, + "learning_rate": 8.496607523066005e-05, + "loss": 0.013970400393009185, + "step": 105960 + }, + { + "epoch": 15.041873669268986, + "grad_norm": 6.217586994171143, + "learning_rate": 8.496465578424415e-05, + "loss": 0.012521201372146606, + "step": 105970 + }, + { + "epoch": 15.043293115684882, + "grad_norm": 5.13002347946167, + "learning_rate": 8.496323633782826e-05, + "loss": 0.009574723988771438, + "step": 105980 + }, + { + "epoch": 15.04471256210078, + "grad_norm": 0.07002128660678864, + "learning_rate": 8.496181689141234e-05, + "loss": 0.0025296185165643694, + "step": 105990 + }, + { + "epoch": 15.046132008516679, + "grad_norm": 0.8163052201271057, + "learning_rate": 8.496039744499646e-05, + "loss": 0.008115919679403305, + "step": 106000 + }, + { + "epoch": 15.046132008516679, + "eval_accuracy": 0.9864564125389458, + "eval_loss": 0.047789935022592545, + "eval_runtime": 32.3739, + "eval_samples_per_second": 485.792, + "eval_steps_per_second": 15.197, + "step": 106000 + }, + { + "epoch": 15.047551454932576, + "grad_norm": 1.0925426483154297, + "learning_rate": 8.495897799858055e-05, + "loss": 0.05336245894432068, + "step": 106010 + }, + { + "epoch": 15.048970901348474, + "grad_norm": 0.4548809230327606, + "learning_rate": 8.495755855216466e-05, + "loss": 0.033079445362091064, + "step": 106020 + }, + { + "epoch": 15.050390347764372, + "grad_norm": 0.33710336685180664, + "learning_rate": 8.495613910574876e-05, + "loss": 0.021994808316230775, + "step": 106030 + }, + { + "epoch": 15.05180979418027, + "grad_norm": 1.0806621313095093, + "learning_rate": 8.495471965933286e-05, + "loss": 0.0022130701690912247, + "step": 106040 + }, + { + "epoch": 15.053229240596167, + "grad_norm": 0.9031606316566467, + "learning_rate": 8.495330021291697e-05, + "loss": 0.003949935734272003, + "step": 106050 + }, + { + "epoch": 15.054648687012065, + "grad_norm": 0.3298092782497406, + "learning_rate": 8.495188076650107e-05, + "loss": 0.04351118803024292, + 
"step": 106060 + }, + { + "epoch": 15.056068133427964, + "grad_norm": 1.8871431350708008, + "learning_rate": 8.495046132008518e-05, + "loss": 0.022950419783592226, + "step": 106070 + }, + { + "epoch": 15.05748757984386, + "grad_norm": 3.592064142227173, + "learning_rate": 8.494904187366928e-05, + "loss": 0.03793198466300964, + "step": 106080 + }, + { + "epoch": 15.058907026259758, + "grad_norm": 0.11663145571947098, + "learning_rate": 8.494762242725337e-05, + "loss": 0.015685516595840453, + "step": 106090 + }, + { + "epoch": 15.060326472675657, + "grad_norm": 0.003959988709539175, + "learning_rate": 8.494620298083747e-05, + "loss": 0.0015076756477355956, + "step": 106100 + }, + { + "epoch": 15.061745919091555, + "grad_norm": 2.6824986934661865, + "learning_rate": 8.494478353442158e-05, + "loss": 0.033915793895721434, + "step": 106110 + }, + { + "epoch": 15.063165365507452, + "grad_norm": 4.639670372009277, + "learning_rate": 8.494336408800568e-05, + "loss": 0.013209784030914306, + "step": 106120 + }, + { + "epoch": 15.06458481192335, + "grad_norm": 0.9888678789138794, + "learning_rate": 8.494194464158979e-05, + "loss": 0.04018623828887939, + "step": 106130 + }, + { + "epoch": 15.066004258339248, + "grad_norm": 3.075080156326294, + "learning_rate": 8.494052519517389e-05, + "loss": 0.03815135657787323, + "step": 106140 + }, + { + "epoch": 15.067423704755145, + "grad_norm": 1.7605953216552734, + "learning_rate": 8.493910574875798e-05, + "loss": 0.007596167922019959, + "step": 106150 + }, + { + "epoch": 15.068843151171043, + "grad_norm": 0.012811378575861454, + "learning_rate": 8.49376863023421e-05, + "loss": 0.006362202018499375, + "step": 106160 + }, + { + "epoch": 15.070262597586941, + "grad_norm": 1.7996217012405396, + "learning_rate": 8.493626685592619e-05, + "loss": 0.009276589751243592, + "step": 106170 + }, + { + "epoch": 15.07168204400284, + "grad_norm": 6.258857727050781, + "learning_rate": 8.49348474095103e-05, + "loss": 0.025115305185317995, + "step": 
106180 + }, + { + "epoch": 15.073101490418736, + "grad_norm": 4.9005632400512695, + "learning_rate": 8.493342796309439e-05, + "loss": 0.009275999665260316, + "step": 106190 + }, + { + "epoch": 15.074520936834634, + "grad_norm": 0.04640462249517441, + "learning_rate": 8.49320085166785e-05, + "loss": 0.028778478503227234, + "step": 106200 + }, + { + "epoch": 15.075940383250533, + "grad_norm": 0.02235148847103119, + "learning_rate": 8.49305890702626e-05, + "loss": 0.06231725811958313, + "step": 106210 + }, + { + "epoch": 15.07735982966643, + "grad_norm": 0.03783341869711876, + "learning_rate": 8.49291696238467e-05, + "loss": 0.010832040011882782, + "step": 106220 + }, + { + "epoch": 15.078779276082328, + "grad_norm": 9.759112358093262, + "learning_rate": 8.49277501774308e-05, + "loss": 0.02757435142993927, + "step": 106230 + }, + { + "epoch": 15.080198722498226, + "grad_norm": 13.926852226257324, + "learning_rate": 8.492633073101491e-05, + "loss": 0.019434280693531036, + "step": 106240 + }, + { + "epoch": 15.081618168914124, + "grad_norm": 0.11499132961034775, + "learning_rate": 8.492491128459901e-05, + "loss": 0.03727405369281769, + "step": 106250 + }, + { + "epoch": 15.08303761533002, + "grad_norm": 8.562307357788086, + "learning_rate": 8.492349183818311e-05, + "loss": 0.03538262248039246, + "step": 106260 + }, + { + "epoch": 15.084457061745919, + "grad_norm": 19.21854019165039, + "learning_rate": 8.492207239176722e-05, + "loss": 0.019014018774032592, + "step": 106270 + }, + { + "epoch": 15.085876508161817, + "grad_norm": 0.5749403238296509, + "learning_rate": 8.492065294535132e-05, + "loss": 0.03317195773124695, + "step": 106280 + }, + { + "epoch": 15.087295954577714, + "grad_norm": 9.464776039123535, + "learning_rate": 8.491923349893543e-05, + "loss": 0.02200724482536316, + "step": 106290 + }, + { + "epoch": 15.088715400993612, + "grad_norm": 0.37284818291664124, + "learning_rate": 8.491781405251951e-05, + "loss": 0.03638032078742981, + "step": 106300 + }, + { + 
"epoch": 15.09013484740951, + "grad_norm": 6.311309814453125, + "learning_rate": 8.491639460610362e-05, + "loss": 0.03605286777019501, + "step": 106310 + }, + { + "epoch": 15.091554293825409, + "grad_norm": 0.11901773512363434, + "learning_rate": 8.491497515968772e-05, + "loss": 0.03141979277133942, + "step": 106320 + }, + { + "epoch": 15.092973740241305, + "grad_norm": 0.030311891809105873, + "learning_rate": 8.491355571327183e-05, + "loss": 0.016466012597084044, + "step": 106330 + }, + { + "epoch": 15.094393186657204, + "grad_norm": 2.06520414352417, + "learning_rate": 8.491213626685593e-05, + "loss": 0.011501440405845642, + "step": 106340 + }, + { + "epoch": 15.095812633073102, + "grad_norm": 0.344914972782135, + "learning_rate": 8.491071682044003e-05, + "loss": 0.01476086676120758, + "step": 106350 + }, + { + "epoch": 15.097232079488998, + "grad_norm": 7.9556498527526855, + "learning_rate": 8.490929737402414e-05, + "loss": 0.006111126765608788, + "step": 106360 + }, + { + "epoch": 15.098651525904897, + "grad_norm": 0.5263380408287048, + "learning_rate": 8.490787792760823e-05, + "loss": 0.006287018954753876, + "step": 106370 + }, + { + "epoch": 15.100070972320795, + "grad_norm": 0.007816202007234097, + "learning_rate": 8.490645848119235e-05, + "loss": 0.035767942667007446, + "step": 106380 + }, + { + "epoch": 15.101490418736693, + "grad_norm": 0.1581820547580719, + "learning_rate": 8.490503903477644e-05, + "loss": 0.006314507126808167, + "step": 106390 + }, + { + "epoch": 15.10290986515259, + "grad_norm": 0.29545098543167114, + "learning_rate": 8.490361958836054e-05, + "loss": 0.02759753167629242, + "step": 106400 + }, + { + "epoch": 15.104329311568488, + "grad_norm": 0.3469163775444031, + "learning_rate": 8.490220014194464e-05, + "loss": 0.0040346551686525345, + "step": 106410 + }, + { + "epoch": 15.105748757984387, + "grad_norm": 0.24172019958496094, + "learning_rate": 8.490078069552875e-05, + "loss": 0.02106695920228958, + "step": 106420 + }, + { + "epoch": 
15.107168204400283, + "grad_norm": 0.20899048447608948, + "learning_rate": 8.489936124911285e-05, + "loss": 0.013521634042263031, + "step": 106430 + }, + { + "epoch": 15.108587650816181, + "grad_norm": 2.07224440574646, + "learning_rate": 8.489794180269696e-05, + "loss": 0.02519119381904602, + "step": 106440 + }, + { + "epoch": 15.11000709723208, + "grad_norm": 1.6022323369979858, + "learning_rate": 8.489652235628105e-05, + "loss": 0.011256136745214463, + "step": 106450 + }, + { + "epoch": 15.111426543647978, + "grad_norm": 0.06830890476703644, + "learning_rate": 8.489510290986515e-05, + "loss": 0.042464840412139895, + "step": 106460 + }, + { + "epoch": 15.112845990063875, + "grad_norm": 0.025041621178388596, + "learning_rate": 8.489368346344926e-05, + "loss": 0.0023004353046417237, + "step": 106470 + }, + { + "epoch": 15.114265436479773, + "grad_norm": 3.737102508544922, + "learning_rate": 8.489226401703336e-05, + "loss": 0.03057832717895508, + "step": 106480 + }, + { + "epoch": 15.115684882895671, + "grad_norm": 0.3811050057411194, + "learning_rate": 8.489084457061747e-05, + "loss": 0.010930734872817992, + "step": 106490 + }, + { + "epoch": 15.117104329311568, + "grad_norm": 1.066455602645874, + "learning_rate": 8.488942512420155e-05, + "loss": 0.0034047245979309084, + "step": 106500 + }, + { + "epoch": 15.117104329311568, + "eval_accuracy": 0.9809245247027405, + "eval_loss": 0.06730737537145615, + "eval_runtime": 33.0311, + "eval_samples_per_second": 476.126, + "eval_steps_per_second": 14.895, + "step": 106500 + }, + { + "epoch": 15.118523775727466, + "grad_norm": 14.054121971130371, + "learning_rate": 8.488800567778567e-05, + "loss": 0.021509627997875213, + "step": 106510 + }, + { + "epoch": 15.119943222143364, + "grad_norm": 2.574340343475342, + "learning_rate": 8.488658623136976e-05, + "loss": 0.015637876093387605, + "step": 106520 + }, + { + "epoch": 15.121362668559263, + "grad_norm": 0.16894987225532532, + "learning_rate": 8.488516678495387e-05, + "loss": 
0.012121076881885528, + "step": 106530 + }, + { + "epoch": 15.12278211497516, + "grad_norm": 0.2743363678455353, + "learning_rate": 8.488374733853797e-05, + "loss": 0.009187763184309005, + "step": 106540 + }, + { + "epoch": 15.124201561391057, + "grad_norm": 1.0792840719223022, + "learning_rate": 8.488232789212207e-05, + "loss": 0.020037105679512023, + "step": 106550 + }, + { + "epoch": 15.125621007806956, + "grad_norm": 0.768378496170044, + "learning_rate": 8.488090844570618e-05, + "loss": 0.061950075626373294, + "step": 106560 + }, + { + "epoch": 15.127040454222852, + "grad_norm": 3.3310279846191406, + "learning_rate": 8.487948899929028e-05, + "loss": 0.011520791053771972, + "step": 106570 + }, + { + "epoch": 15.12845990063875, + "grad_norm": 1.0102425813674927, + "learning_rate": 8.487806955287439e-05, + "loss": 0.04175326824188232, + "step": 106580 + }, + { + "epoch": 15.129879347054649, + "grad_norm": 0.11006898432970047, + "learning_rate": 8.487665010645849e-05, + "loss": 0.014114585518836976, + "step": 106590 + }, + { + "epoch": 15.131298793470547, + "grad_norm": 0.9182747602462769, + "learning_rate": 8.48752306600426e-05, + "loss": 0.037896940112113954, + "step": 106600 + }, + { + "epoch": 15.132718239886444, + "grad_norm": 0.8234381079673767, + "learning_rate": 8.487381121362668e-05, + "loss": 0.03469969928264618, + "step": 106610 + }, + { + "epoch": 15.134137686302342, + "grad_norm": 5.307381629943848, + "learning_rate": 8.487239176721079e-05, + "loss": 0.008131003379821778, + "step": 106620 + }, + { + "epoch": 15.13555713271824, + "grad_norm": 0.09964298456907272, + "learning_rate": 8.487097232079489e-05, + "loss": 0.03536056876182556, + "step": 106630 + }, + { + "epoch": 15.136976579134137, + "grad_norm": 0.10982150584459305, + "learning_rate": 8.4869552874379e-05, + "loss": 0.008714452385902405, + "step": 106640 + }, + { + "epoch": 15.138396025550035, + "grad_norm": 0.2031431943178177, + "learning_rate": 8.48681334279631e-05, + "loss": 
0.05170516967773438, + "step": 106650 + }, + { + "epoch": 15.139815471965933, + "grad_norm": 0.2435196489095688, + "learning_rate": 8.48667139815472e-05, + "loss": 0.016551582515239714, + "step": 106660 + }, + { + "epoch": 15.141234918381832, + "grad_norm": 0.15026158094406128, + "learning_rate": 8.48652945351313e-05, + "loss": 0.021115848422050477, + "step": 106670 + }, + { + "epoch": 15.142654364797728, + "grad_norm": 0.7808573842048645, + "learning_rate": 8.48638750887154e-05, + "loss": 0.006684541702270508, + "step": 106680 + }, + { + "epoch": 15.144073811213627, + "grad_norm": 1.9700183868408203, + "learning_rate": 8.486245564229951e-05, + "loss": 0.014693331718444825, + "step": 106690 + }, + { + "epoch": 15.145493257629525, + "grad_norm": 4.157971382141113, + "learning_rate": 8.486103619588361e-05, + "loss": 0.023039808869361876, + "step": 106700 + }, + { + "epoch": 15.146912704045421, + "grad_norm": 0.5956658720970154, + "learning_rate": 8.485961674946771e-05, + "loss": 0.012369179725646972, + "step": 106710 + }, + { + "epoch": 15.14833215046132, + "grad_norm": 0.15433919429779053, + "learning_rate": 8.48581973030518e-05, + "loss": 0.027466484904289247, + "step": 106720 + }, + { + "epoch": 15.149751596877218, + "grad_norm": 2.2408993244171143, + "learning_rate": 8.485677785663592e-05, + "loss": 0.03421107828617096, + "step": 106730 + }, + { + "epoch": 15.151171043293116, + "grad_norm": 0.34294793009757996, + "learning_rate": 8.485535841022001e-05, + "loss": 0.011824176460504533, + "step": 106740 + }, + { + "epoch": 15.152590489709013, + "grad_norm": 7.680258750915527, + "learning_rate": 8.485393896380412e-05, + "loss": 0.02799707055091858, + "step": 106750 + }, + { + "epoch": 15.154009936124911, + "grad_norm": 10.662795066833496, + "learning_rate": 8.485251951738822e-05, + "loss": 0.01989738643169403, + "step": 106760 + }, + { + "epoch": 15.15542938254081, + "grad_norm": 0.026028187945485115, + "learning_rate": 8.485110007097232e-05, + "loss": 
0.0778480350971222, + "step": 106770 + }, + { + "epoch": 15.156848828956706, + "grad_norm": 0.5896021723747253, + "learning_rate": 8.484968062455643e-05, + "loss": 0.0032101761549711227, + "step": 106780 + }, + { + "epoch": 15.158268275372604, + "grad_norm": 0.2833305895328522, + "learning_rate": 8.484826117814053e-05, + "loss": 0.06184155941009521, + "step": 106790 + }, + { + "epoch": 15.159687721788503, + "grad_norm": 16.18917465209961, + "learning_rate": 8.484684173172464e-05, + "loss": 0.01600014418363571, + "step": 106800 + }, + { + "epoch": 15.161107168204401, + "grad_norm": 0.010805347934365273, + "learning_rate": 8.484542228530872e-05, + "loss": 0.009944060444831848, + "step": 106810 + }, + { + "epoch": 15.162526614620297, + "grad_norm": 5.7854108810424805, + "learning_rate": 8.484400283889283e-05, + "loss": 0.01406932771205902, + "step": 106820 + }, + { + "epoch": 15.163946061036196, + "grad_norm": 5.306714057922363, + "learning_rate": 8.484258339247693e-05, + "loss": 0.009397006034851075, + "step": 106830 + }, + { + "epoch": 15.165365507452094, + "grad_norm": 4.2969651222229, + "learning_rate": 8.484116394606104e-05, + "loss": 0.018796910345554353, + "step": 106840 + }, + { + "epoch": 15.16678495386799, + "grad_norm": 5.016585350036621, + "learning_rate": 8.483974449964515e-05, + "loss": 0.009421862661838531, + "step": 106850 + }, + { + "epoch": 15.168204400283889, + "grad_norm": 0.25573283433914185, + "learning_rate": 8.483832505322924e-05, + "loss": 0.0481646716594696, + "step": 106860 + }, + { + "epoch": 15.169623846699787, + "grad_norm": 0.49223005771636963, + "learning_rate": 8.483690560681335e-05, + "loss": 0.05817559361457825, + "step": 106870 + }, + { + "epoch": 15.171043293115686, + "grad_norm": 0.015384448692202568, + "learning_rate": 8.483548616039744e-05, + "loss": 0.02148095369338989, + "step": 106880 + }, + { + "epoch": 15.172462739531582, + "grad_norm": 0.05889136344194412, + "learning_rate": 8.483406671398156e-05, + "loss": 
0.003951547667384148, + "step": 106890 + }, + { + "epoch": 15.17388218594748, + "grad_norm": 0.9542629718780518, + "learning_rate": 8.483264726756565e-05, + "loss": 0.03866635262966156, + "step": 106900 + }, + { + "epoch": 15.175301632363379, + "grad_norm": 0.1657920777797699, + "learning_rate": 8.483122782114975e-05, + "loss": 0.033595597743988036, + "step": 106910 + }, + { + "epoch": 15.176721078779275, + "grad_norm": 14.490575790405273, + "learning_rate": 8.482980837473385e-05, + "loss": 0.020913679897785187, + "step": 106920 + }, + { + "epoch": 15.178140525195174, + "grad_norm": 0.5272268652915955, + "learning_rate": 8.482838892831796e-05, + "loss": 0.008652272075414658, + "step": 106930 + }, + { + "epoch": 15.179559971611072, + "grad_norm": 0.7444571256637573, + "learning_rate": 8.482696948190207e-05, + "loss": 0.007965491712093353, + "step": 106940 + }, + { + "epoch": 15.18097941802697, + "grad_norm": 0.338965505361557, + "learning_rate": 8.482555003548617e-05, + "loss": 0.02447792887687683, + "step": 106950 + }, + { + "epoch": 15.182398864442867, + "grad_norm": 9.44382095336914, + "learning_rate": 8.482413058907028e-05, + "loss": 0.015413524210453033, + "step": 106960 + }, + { + "epoch": 15.183818310858765, + "grad_norm": 0.007815083488821983, + "learning_rate": 8.482271114265436e-05, + "loss": 0.12182474136352539, + "step": 106970 + }, + { + "epoch": 15.185237757274663, + "grad_norm": 4.471283435821533, + "learning_rate": 8.482129169623847e-05, + "loss": 0.029788649082183837, + "step": 106980 + }, + { + "epoch": 15.18665720369056, + "grad_norm": 19.09647560119629, + "learning_rate": 8.481987224982257e-05, + "loss": 0.03674539923667908, + "step": 106990 + }, + { + "epoch": 15.188076650106458, + "grad_norm": 0.18893034756183624, + "learning_rate": 8.481845280340668e-05, + "loss": 0.014733706414699555, + "step": 107000 + }, + { + "epoch": 15.188076650106458, + "eval_accuracy": 0.9782539581611241, + "eval_loss": 0.08096309751272202, + "eval_runtime": 31.867, + 
"eval_samples_per_second": 493.52, + "eval_steps_per_second": 15.439, + "step": 107000 + }, + { + "epoch": 15.189496096522356, + "grad_norm": 1.9686450958251953, + "learning_rate": 8.481703335699078e-05, + "loss": 0.038543257117271426, + "step": 107010 + }, + { + "epoch": 15.190915542938255, + "grad_norm": 2.9802894592285156, + "learning_rate": 8.481561391057488e-05, + "loss": 0.014302444458007813, + "step": 107020 + }, + { + "epoch": 15.192334989354151, + "grad_norm": 0.6058565974235535, + "learning_rate": 8.481419446415899e-05, + "loss": 0.05712045431137085, + "step": 107030 + }, + { + "epoch": 15.19375443577005, + "grad_norm": 0.004902772139757872, + "learning_rate": 8.481277501774308e-05, + "loss": 0.012648455798625946, + "step": 107040 + }, + { + "epoch": 15.195173882185948, + "grad_norm": 7.411450386047363, + "learning_rate": 8.48113555713272e-05, + "loss": 0.0784771203994751, + "step": 107050 + }, + { + "epoch": 15.196593328601844, + "grad_norm": 0.8683082461357117, + "learning_rate": 8.480993612491129e-05, + "loss": 0.026170209050178528, + "step": 107060 + }, + { + "epoch": 15.198012775017743, + "grad_norm": 0.04517837241292, + "learning_rate": 8.480851667849539e-05, + "loss": 0.0017347116023302077, + "step": 107070 + }, + { + "epoch": 15.199432221433641, + "grad_norm": 13.40206241607666, + "learning_rate": 8.480709723207949e-05, + "loss": 0.045030486583709714, + "step": 107080 + }, + { + "epoch": 15.20085166784954, + "grad_norm": 1.9814329147338867, + "learning_rate": 8.48056777856636e-05, + "loss": 0.03218642473220825, + "step": 107090 + }, + { + "epoch": 15.202271114265436, + "grad_norm": 1.8605858087539673, + "learning_rate": 8.48042583392477e-05, + "loss": 0.02088761329650879, + "step": 107100 + }, + { + "epoch": 15.203690560681334, + "grad_norm": 0.3231185972690582, + "learning_rate": 8.48028388928318e-05, + "loss": 0.03878544867038727, + "step": 107110 + }, + { + "epoch": 15.205110007097232, + "grad_norm": 1.338391661643982, + "learning_rate": 
8.48014194464159e-05, + "loss": 0.019712454080581664, + "step": 107120 + }, + { + "epoch": 15.206529453513129, + "grad_norm": 0.7573134899139404, + "learning_rate": 8.48e-05, + "loss": 0.030023956298828126, + "step": 107130 + }, + { + "epoch": 15.207948899929027, + "grad_norm": 0.08245470374822617, + "learning_rate": 8.479858055358411e-05, + "loss": 0.008657753467559814, + "step": 107140 + }, + { + "epoch": 15.209368346344926, + "grad_norm": 4.181887149810791, + "learning_rate": 8.479716110716821e-05, + "loss": 0.017116375267505646, + "step": 107150 + }, + { + "epoch": 15.210787792760824, + "grad_norm": 0.013549786061048508, + "learning_rate": 8.479574166075232e-05, + "loss": 0.01756223291158676, + "step": 107160 + }, + { + "epoch": 15.21220723917672, + "grad_norm": 5.840518951416016, + "learning_rate": 8.47943222143364e-05, + "loss": 0.0685306191444397, + "step": 107170 + }, + { + "epoch": 15.213626685592619, + "grad_norm": 0.18977539241313934, + "learning_rate": 8.479290276792052e-05, + "loss": 0.033226925134658816, + "step": 107180 + }, + { + "epoch": 15.215046132008517, + "grad_norm": 0.1447950005531311, + "learning_rate": 8.479148332150461e-05, + "loss": 0.026240897178649903, + "step": 107190 + }, + { + "epoch": 15.216465578424414, + "grad_norm": 0.12102290987968445, + "learning_rate": 8.479006387508872e-05, + "loss": 0.0039006978273391723, + "step": 107200 + }, + { + "epoch": 15.217885024840312, + "grad_norm": 0.09665011614561081, + "learning_rate": 8.478864442867282e-05, + "loss": 0.009919333457946777, + "step": 107210 + }, + { + "epoch": 15.21930447125621, + "grad_norm": 0.2344123125076294, + "learning_rate": 8.478722498225692e-05, + "loss": 0.042432117462158206, + "step": 107220 + }, + { + "epoch": 15.220723917672109, + "grad_norm": 1.6214635372161865, + "learning_rate": 8.478580553584103e-05, + "loss": 0.010458789765834808, + "step": 107230 + }, + { + "epoch": 15.222143364088005, + "grad_norm": 0.058471862226724625, + "learning_rate": 
8.478438608942513e-05, + "loss": 0.0985255241394043, + "step": 107240 + }, + { + "epoch": 15.223562810503903, + "grad_norm": 5.8149495124816895, + "learning_rate": 8.478296664300924e-05, + "loss": 0.025858157873153688, + "step": 107250 + }, + { + "epoch": 15.224982256919802, + "grad_norm": 0.12123987078666687, + "learning_rate": 8.478154719659333e-05, + "loss": 0.021722464263439177, + "step": 107260 + }, + { + "epoch": 15.2264017033357, + "grad_norm": 0.7817923426628113, + "learning_rate": 8.478012775017743e-05, + "loss": 0.01945972293615341, + "step": 107270 + }, + { + "epoch": 15.227821149751597, + "grad_norm": 0.03892279788851738, + "learning_rate": 8.477870830376153e-05, + "loss": 0.009514982998371124, + "step": 107280 + }, + { + "epoch": 15.229240596167495, + "grad_norm": 1.8403258323669434, + "learning_rate": 8.477728885734564e-05, + "loss": 0.04027565121650696, + "step": 107290 + }, + { + "epoch": 15.230660042583393, + "grad_norm": 0.049720872193574905, + "learning_rate": 8.477586941092974e-05, + "loss": 0.012929567694664001, + "step": 107300 + }, + { + "epoch": 15.23207948899929, + "grad_norm": 1.8823243379592896, + "learning_rate": 8.477444996451385e-05, + "loss": 0.02587153613567352, + "step": 107310 + }, + { + "epoch": 15.233498935415188, + "grad_norm": 0.21937565505504608, + "learning_rate": 8.477303051809795e-05, + "loss": 0.026926514506340028, + "step": 107320 + }, + { + "epoch": 15.234918381831086, + "grad_norm": 0.034718845039606094, + "learning_rate": 8.477161107168204e-05, + "loss": 0.0010982461273670197, + "step": 107330 + }, + { + "epoch": 15.236337828246985, + "grad_norm": 0.1314888447523117, + "learning_rate": 8.477019162526615e-05, + "loss": 0.032864326238632204, + "step": 107340 + }, + { + "epoch": 15.237757274662881, + "grad_norm": 0.024118224158883095, + "learning_rate": 8.476877217885025e-05, + "loss": 0.004268708452582359, + "step": 107350 + }, + { + "epoch": 15.23917672107878, + "grad_norm": 0.0030872609931975603, + "learning_rate": 
8.476735273243436e-05, + "loss": 0.014760005474090575, + "step": 107360 + }, + { + "epoch": 15.240596167494678, + "grad_norm": 0.00926938932389021, + "learning_rate": 8.476593328601846e-05, + "loss": 0.044267669320106506, + "step": 107370 + }, + { + "epoch": 15.242015613910574, + "grad_norm": 0.8168309330940247, + "learning_rate": 8.476451383960256e-05, + "loss": 0.007230883836746216, + "step": 107380 + }, + { + "epoch": 15.243435060326473, + "grad_norm": 0.06650938093662262, + "learning_rate": 8.476309439318665e-05, + "loss": 0.0143302783370018, + "step": 107390 + }, + { + "epoch": 15.24485450674237, + "grad_norm": 1.2258223295211792, + "learning_rate": 8.476167494677077e-05, + "loss": 0.0024061430245637893, + "step": 107400 + }, + { + "epoch": 15.24627395315827, + "grad_norm": 0.09853166341781616, + "learning_rate": 8.476025550035486e-05, + "loss": 0.0160846084356308, + "step": 107410 + }, + { + "epoch": 15.247693399574166, + "grad_norm": 5.717872619628906, + "learning_rate": 8.475883605393897e-05, + "loss": 0.027300027012825013, + "step": 107420 + }, + { + "epoch": 15.249112845990064, + "grad_norm": 6.418786525726318, + "learning_rate": 8.475741660752307e-05, + "loss": 0.017071712017059325, + "step": 107430 + }, + { + "epoch": 15.250532292405962, + "grad_norm": 0.07067640870809555, + "learning_rate": 8.475599716110717e-05, + "loss": 0.024442729353904725, + "step": 107440 + }, + { + "epoch": 15.251951738821859, + "grad_norm": 1.3273271322250366, + "learning_rate": 8.475457771469128e-05, + "loss": 0.00802953988313675, + "step": 107450 + }, + { + "epoch": 15.253371185237757, + "grad_norm": 6.9608964920043945, + "learning_rate": 8.475315826827538e-05, + "loss": 0.018375779688358306, + "step": 107460 + }, + { + "epoch": 15.254790631653655, + "grad_norm": 0.00919936690479517, + "learning_rate": 8.475173882185949e-05, + "loss": 0.018325018882751464, + "step": 107470 + }, + { + "epoch": 15.256210078069554, + "grad_norm": 1.523152232170105, + "learning_rate": 
8.475031937544357e-05, + "loss": 0.005215193331241608, + "step": 107480 + }, + { + "epoch": 15.25762952448545, + "grad_norm": 0.3632388710975647, + "learning_rate": 8.474889992902768e-05, + "loss": 0.017545858025550844, + "step": 107490 + }, + { + "epoch": 15.259048970901349, + "grad_norm": 1.890334963798523, + "learning_rate": 8.474748048261178e-05, + "loss": 0.03647333979606628, + "step": 107500 + }, + { + "epoch": 15.259048970901349, + "eval_accuracy": 0.9864564125389458, + "eval_loss": 0.04831605404615402, + "eval_runtime": 32.764, + "eval_samples_per_second": 480.009, + "eval_steps_per_second": 15.017, + "step": 107500 + }, + { + "epoch": 15.260468417317247, + "grad_norm": 0.5332909822463989, + "learning_rate": 8.474606103619589e-05, + "loss": 0.01102583259344101, + "step": 107510 + }, + { + "epoch": 15.261887863733143, + "grad_norm": 2.9368104934692383, + "learning_rate": 8.474464158977999e-05, + "loss": 0.07062627673149109, + "step": 107520 + }, + { + "epoch": 15.263307310149042, + "grad_norm": 0.2998633086681366, + "learning_rate": 8.474322214336409e-05, + "loss": 0.011581972986459733, + "step": 107530 + }, + { + "epoch": 15.26472675656494, + "grad_norm": 0.20585720241069794, + "learning_rate": 8.47418026969482e-05, + "loss": 0.023847994208335877, + "step": 107540 + }, + { + "epoch": 15.266146202980838, + "grad_norm": 0.44639578461647034, + "learning_rate": 8.47403832505323e-05, + "loss": 0.02701796293258667, + "step": 107550 + }, + { + "epoch": 15.267565649396735, + "grad_norm": 13.586299896240234, + "learning_rate": 8.47389638041164e-05, + "loss": 0.030145710706710814, + "step": 107560 + }, + { + "epoch": 15.268985095812633, + "grad_norm": 0.13911595940589905, + "learning_rate": 8.47375443577005e-05, + "loss": 0.03382292091846466, + "step": 107570 + }, + { + "epoch": 15.270404542228531, + "grad_norm": 0.6651485562324524, + "learning_rate": 8.47361249112846e-05, + "loss": 0.018535080552101135, + "step": 107580 + }, + { + "epoch": 15.271823988644428, + 
"grad_norm": 10.191615104675293, + "learning_rate": 8.47347054648687e-05, + "loss": 0.038520559668540955, + "step": 107590 + }, + { + "epoch": 15.273243435060326, + "grad_norm": 11.469804763793945, + "learning_rate": 8.473328601845281e-05, + "loss": 0.02716066837310791, + "step": 107600 + }, + { + "epoch": 15.274662881476225, + "grad_norm": 3.524324417114258, + "learning_rate": 8.47318665720369e-05, + "loss": 0.009171813726425171, + "step": 107610 + }, + { + "epoch": 15.276082327892123, + "grad_norm": 0.7207483053207397, + "learning_rate": 8.473044712562102e-05, + "loss": 0.017212912440299988, + "step": 107620 + }, + { + "epoch": 15.27750177430802, + "grad_norm": 9.047149658203125, + "learning_rate": 8.472902767920511e-05, + "loss": 0.03292661309242249, + "step": 107630 + }, + { + "epoch": 15.278921220723918, + "grad_norm": 13.497153282165527, + "learning_rate": 8.472760823278921e-05, + "loss": 0.0390909492969513, + "step": 107640 + }, + { + "epoch": 15.280340667139816, + "grad_norm": 0.6657223701477051, + "learning_rate": 8.472618878637332e-05, + "loss": 0.012965390086174011, + "step": 107650 + }, + { + "epoch": 15.281760113555713, + "grad_norm": 4.909111022949219, + "learning_rate": 8.472476933995742e-05, + "loss": 0.02200748324394226, + "step": 107660 + }, + { + "epoch": 15.283179559971611, + "grad_norm": 6.05718994140625, + "learning_rate": 8.472334989354153e-05, + "loss": 0.009501121938228607, + "step": 107670 + }, + { + "epoch": 15.28459900638751, + "grad_norm": 0.006838800385594368, + "learning_rate": 8.472193044712563e-05, + "loss": 0.035414910316467284, + "step": 107680 + }, + { + "epoch": 15.286018452803408, + "grad_norm": 7.050548553466797, + "learning_rate": 8.472051100070973e-05, + "loss": 0.0719907522201538, + "step": 107690 + }, + { + "epoch": 15.287437899219304, + "grad_norm": 0.3600425124168396, + "learning_rate": 8.471909155429382e-05, + "loss": 0.006389583647251129, + "step": 107700 + }, + { + "epoch": 15.288857345635202, + "grad_norm": 
0.07480444014072418, + "learning_rate": 8.471767210787793e-05, + "loss": 0.023404929041862487, + "step": 107710 + }, + { + "epoch": 15.2902767920511, + "grad_norm": 0.056176621466875076, + "learning_rate": 8.471625266146203e-05, + "loss": 0.005539464205503464, + "step": 107720 + }, + { + "epoch": 15.291696238466997, + "grad_norm": 0.8116849064826965, + "learning_rate": 8.471483321504614e-05, + "loss": 0.01017369031906128, + "step": 107730 + }, + { + "epoch": 15.293115684882896, + "grad_norm": 1.566999912261963, + "learning_rate": 8.471341376863024e-05, + "loss": 0.008306996524333954, + "step": 107740 + }, + { + "epoch": 15.294535131298794, + "grad_norm": 0.6852664351463318, + "learning_rate": 8.471199432221434e-05, + "loss": 0.02082604467868805, + "step": 107750 + }, + { + "epoch": 15.295954577714692, + "grad_norm": 1.0683574676513672, + "learning_rate": 8.471057487579845e-05, + "loss": 0.0872978389263153, + "step": 107760 + }, + { + "epoch": 15.297374024130589, + "grad_norm": 0.01093759760260582, + "learning_rate": 8.470915542938254e-05, + "loss": 0.014742153882980346, + "step": 107770 + }, + { + "epoch": 15.298793470546487, + "grad_norm": 5.044257640838623, + "learning_rate": 8.470773598296666e-05, + "loss": 0.012482000142335891, + "step": 107780 + }, + { + "epoch": 15.300212916962385, + "grad_norm": 0.09109003096818924, + "learning_rate": 8.470631653655074e-05, + "loss": 0.006877711415290833, + "step": 107790 + }, + { + "epoch": 15.301632363378282, + "grad_norm": 13.097904205322266, + "learning_rate": 8.470489709013485e-05, + "loss": 0.018438270688056944, + "step": 107800 + }, + { + "epoch": 15.30305180979418, + "grad_norm": 0.045856866985559464, + "learning_rate": 8.470347764371895e-05, + "loss": 0.006077097356319427, + "step": 107810 + }, + { + "epoch": 15.304471256210078, + "grad_norm": 0.6873058080673218, + "learning_rate": 8.470205819730306e-05, + "loss": 0.024769291281700134, + "step": 107820 + }, + { + "epoch": 15.305890702625977, + "grad_norm": 
9.974763870239258, + "learning_rate": 8.470063875088716e-05, + "loss": 0.02486882209777832, + "step": 107830 + }, + { + "epoch": 15.307310149041873, + "grad_norm": 10.30651569366455, + "learning_rate": 8.469921930447125e-05, + "loss": 0.029645463824272154, + "step": 107840 + }, + { + "epoch": 15.308729595457772, + "grad_norm": 8.100309371948242, + "learning_rate": 8.469779985805536e-05, + "loss": 0.009516823291778564, + "step": 107850 + }, + { + "epoch": 15.31014904187367, + "grad_norm": 0.0811367779970169, + "learning_rate": 8.469638041163946e-05, + "loss": 0.058248645067214964, + "step": 107860 + }, + { + "epoch": 15.311568488289566, + "grad_norm": 4.043773651123047, + "learning_rate": 8.469496096522357e-05, + "loss": 0.027607759833335875, + "step": 107870 + }, + { + "epoch": 15.312987934705465, + "grad_norm": 0.33214518427848816, + "learning_rate": 8.469354151880767e-05, + "loss": 0.03843773007392883, + "step": 107880 + }, + { + "epoch": 15.314407381121363, + "grad_norm": 6.0449604988098145, + "learning_rate": 8.469212207239177e-05, + "loss": 0.013122335076332092, + "step": 107890 + }, + { + "epoch": 15.315826827537261, + "grad_norm": 1.2720344066619873, + "learning_rate": 8.469070262597587e-05, + "loss": 0.00861743837594986, + "step": 107900 + }, + { + "epoch": 15.317246273953158, + "grad_norm": 2.66919207572937, + "learning_rate": 8.468928317955998e-05, + "loss": 0.05666149854660034, + "step": 107910 + }, + { + "epoch": 15.318665720369056, + "grad_norm": 5.7319159507751465, + "learning_rate": 8.468786373314407e-05, + "loss": 0.023085048794746398, + "step": 107920 + }, + { + "epoch": 15.320085166784954, + "grad_norm": 0.14215344190597534, + "learning_rate": 8.468644428672818e-05, + "loss": 0.03705045878887177, + "step": 107930 + }, + { + "epoch": 15.321504613200851, + "grad_norm": 0.012645716778934002, + "learning_rate": 8.468502484031228e-05, + "loss": 0.14912995100021362, + "step": 107940 + }, + { + "epoch": 15.32292405961675, + "grad_norm": 
0.10450851917266846, + "learning_rate": 8.468360539389638e-05, + "loss": 0.008530506491661071, + "step": 107950 + }, + { + "epoch": 15.324343506032648, + "grad_norm": 9.909424781799316, + "learning_rate": 8.468218594748049e-05, + "loss": 0.03583186268806458, + "step": 107960 + }, + { + "epoch": 15.325762952448546, + "grad_norm": 4.21767520904541, + "learning_rate": 8.468076650106459e-05, + "loss": 0.00969957560300827, + "step": 107970 + }, + { + "epoch": 15.327182398864442, + "grad_norm": 0.27315735816955566, + "learning_rate": 8.46793470546487e-05, + "loss": 0.015069955587387085, + "step": 107980 + }, + { + "epoch": 15.32860184528034, + "grad_norm": 4.084084987640381, + "learning_rate": 8.46779276082328e-05, + "loss": 0.014058254659175873, + "step": 107990 + }, + { + "epoch": 15.330021291696239, + "grad_norm": 0.011087893508374691, + "learning_rate": 8.467650816181689e-05, + "loss": 0.0190117746591568, + "step": 108000 + }, + { + "epoch": 15.330021291696239, + "eval_accuracy": 0.9873466013861512, + "eval_loss": 0.03986372798681259, + "eval_runtime": 32.6811, + "eval_samples_per_second": 481.226, + "eval_steps_per_second": 15.055, + "step": 108000 + }, + { + "epoch": 15.331440738112136, + "grad_norm": 3.722987174987793, + "learning_rate": 8.467508871540099e-05, + "loss": 0.03890916705131531, + "step": 108010 + }, + { + "epoch": 15.332860184528034, + "grad_norm": 0.38124150037765503, + "learning_rate": 8.46736692689851e-05, + "loss": 0.01923559159040451, + "step": 108020 + }, + { + "epoch": 15.334279630943932, + "grad_norm": 8.261305809020996, + "learning_rate": 8.46722498225692e-05, + "loss": 0.025743892788887023, + "step": 108030 + }, + { + "epoch": 15.33569907735983, + "grad_norm": 0.09685249626636505, + "learning_rate": 8.467083037615331e-05, + "loss": 0.04506351947784424, + "step": 108040 + }, + { + "epoch": 15.337118523775727, + "grad_norm": 0.41357406973838806, + "learning_rate": 8.466941092973741e-05, + "loss": 0.007587555050849915, + "step": 108050 + }, + { 
+ "epoch": 15.338537970191625, + "grad_norm": 0.4583379030227661, + "learning_rate": 8.46679914833215e-05, + "loss": 0.010946182906627655, + "step": 108060 + }, + { + "epoch": 15.339957416607524, + "grad_norm": 3.243618965148926, + "learning_rate": 8.466657203690562e-05, + "loss": 0.017219507694244386, + "step": 108070 + }, + { + "epoch": 15.34137686302342, + "grad_norm": 1.3947383165359497, + "learning_rate": 8.466515259048971e-05, + "loss": 0.007583361119031906, + "step": 108080 + }, + { + "epoch": 15.342796309439318, + "grad_norm": 0.377756267786026, + "learning_rate": 8.466373314407382e-05, + "loss": 0.014791847765445709, + "step": 108090 + }, + { + "epoch": 15.344215755855217, + "grad_norm": 0.04573707655072212, + "learning_rate": 8.466231369765791e-05, + "loss": 0.01001567840576172, + "step": 108100 + }, + { + "epoch": 15.345635202271115, + "grad_norm": 0.047843072563409805, + "learning_rate": 8.466089425124202e-05, + "loss": 0.0023218248039484023, + "step": 108110 + }, + { + "epoch": 15.347054648687012, + "grad_norm": 0.05948958918452263, + "learning_rate": 8.465947480482612e-05, + "loss": 0.02049640268087387, + "step": 108120 + }, + { + "epoch": 15.34847409510291, + "grad_norm": 0.03125576674938202, + "learning_rate": 8.465805535841023e-05, + "loss": 0.006368052214384079, + "step": 108130 + }, + { + "epoch": 15.349893541518808, + "grad_norm": 4.6414313316345215, + "learning_rate": 8.465663591199434e-05, + "loss": 0.04324187040328979, + "step": 108140 + }, + { + "epoch": 15.351312987934705, + "grad_norm": 11.299116134643555, + "learning_rate": 8.465521646557842e-05, + "loss": 0.02383486181497574, + "step": 108150 + }, + { + "epoch": 15.352732434350603, + "grad_norm": 3.3975841999053955, + "learning_rate": 8.465379701916253e-05, + "loss": 0.024656203389167786, + "step": 108160 + }, + { + "epoch": 15.354151880766501, + "grad_norm": 0.009448827244341373, + "learning_rate": 8.465237757274663e-05, + "loss": 0.00397581048309803, + "step": 108170 + }, + { + 
"epoch": 15.3555713271824, + "grad_norm": 2.3756935596466064, + "learning_rate": 8.465095812633074e-05, + "loss": 0.014156597852706908, + "step": 108180 + }, + { + "epoch": 15.356990773598296, + "grad_norm": 5.241497993469238, + "learning_rate": 8.464953867991484e-05, + "loss": 0.039077538251876834, + "step": 108190 + }, + { + "epoch": 15.358410220014195, + "grad_norm": 0.07957743853330612, + "learning_rate": 8.464811923349894e-05, + "loss": 0.08788187503814697, + "step": 108200 + }, + { + "epoch": 15.359829666430093, + "grad_norm": 0.07085578143596649, + "learning_rate": 8.464669978708303e-05, + "loss": 0.03421534895896912, + "step": 108210 + }, + { + "epoch": 15.36124911284599, + "grad_norm": 0.024479210376739502, + "learning_rate": 8.464528034066714e-05, + "loss": 0.006818431615829468, + "step": 108220 + }, + { + "epoch": 15.362668559261888, + "grad_norm": 0.16495859622955322, + "learning_rate": 8.464386089425124e-05, + "loss": 0.04878163933753967, + "step": 108230 + }, + { + "epoch": 15.364088005677786, + "grad_norm": 0.0742361769080162, + "learning_rate": 8.464244144783535e-05, + "loss": 0.02441754937171936, + "step": 108240 + }, + { + "epoch": 15.365507452093684, + "grad_norm": 1.2929010391235352, + "learning_rate": 8.464102200141945e-05, + "loss": 0.010754087567329406, + "step": 108250 + }, + { + "epoch": 15.36692689850958, + "grad_norm": 4.1936492919921875, + "learning_rate": 8.463960255500355e-05, + "loss": 0.0323737770318985, + "step": 108260 + }, + { + "epoch": 15.36834634492548, + "grad_norm": 0.1088828593492508, + "learning_rate": 8.463818310858766e-05, + "loss": 0.0027613572776317595, + "step": 108270 + }, + { + "epoch": 15.369765791341377, + "grad_norm": 0.1193610355257988, + "learning_rate": 8.463676366217176e-05, + "loss": 0.04451070725917816, + "step": 108280 + }, + { + "epoch": 15.371185237757274, + "grad_norm": 0.012951977550983429, + "learning_rate": 8.463534421575587e-05, + "loss": 0.04991414546966553, + "step": 108290 + }, + { + "epoch": 
15.372604684173172, + "grad_norm": 0.5242592096328735, + "learning_rate": 8.463392476933995e-05, + "loss": 0.017808882892131804, + "step": 108300 + }, + { + "epoch": 15.37402413058907, + "grad_norm": 0.4178023338317871, + "learning_rate": 8.463250532292406e-05, + "loss": 0.054301905632019046, + "step": 108310 + }, + { + "epoch": 15.375443577004969, + "grad_norm": 0.2042308747768402, + "learning_rate": 8.463108587650816e-05, + "loss": 0.030471009016036988, + "step": 108320 + }, + { + "epoch": 15.376863023420865, + "grad_norm": 3.57515025138855, + "learning_rate": 8.462966643009227e-05, + "loss": 0.06504546403884888, + "step": 108330 + }, + { + "epoch": 15.378282469836764, + "grad_norm": 1.0132148265838623, + "learning_rate": 8.462824698367638e-05, + "loss": 0.008075962215662003, + "step": 108340 + }, + { + "epoch": 15.379701916252662, + "grad_norm": 0.360158771276474, + "learning_rate": 8.462682753726048e-05, + "loss": 0.06669944524765015, + "step": 108350 + }, + { + "epoch": 15.381121362668559, + "grad_norm": 0.13241221010684967, + "learning_rate": 8.462540809084457e-05, + "loss": 0.021548727154731752, + "step": 108360 + }, + { + "epoch": 15.382540809084457, + "grad_norm": 0.6008273363113403, + "learning_rate": 8.462398864442867e-05, + "loss": 0.014168235659599304, + "step": 108370 + }, + { + "epoch": 15.383960255500355, + "grad_norm": 1.9323714971542358, + "learning_rate": 8.462256919801278e-05, + "loss": 0.043941497802734375, + "step": 108380 + }, + { + "epoch": 15.385379701916253, + "grad_norm": 0.8188585042953491, + "learning_rate": 8.462114975159688e-05, + "loss": 0.04607037603855133, + "step": 108390 + }, + { + "epoch": 15.38679914833215, + "grad_norm": 5.428987503051758, + "learning_rate": 8.461973030518099e-05, + "loss": 0.04405255615711212, + "step": 108400 + }, + { + "epoch": 15.388218594748048, + "grad_norm": 0.852557361125946, + "learning_rate": 8.461831085876508e-05, + "loss": 0.01311432421207428, + "step": 108410 + }, + { + "epoch": 
15.389638041163947, + "grad_norm": 0.11089548468589783, + "learning_rate": 8.461689141234919e-05, + "loss": 0.01127699315547943, + "step": 108420 + }, + { + "epoch": 15.391057487579843, + "grad_norm": 5.423348903656006, + "learning_rate": 8.46154719659333e-05, + "loss": 0.010904674232006074, + "step": 108430 + }, + { + "epoch": 15.392476933995741, + "grad_norm": 0.7922012209892273, + "learning_rate": 8.46140525195174e-05, + "loss": 0.005975948274135589, + "step": 108440 + }, + { + "epoch": 15.39389638041164, + "grad_norm": 0.041600536555051804, + "learning_rate": 8.46126330731015e-05, + "loss": 0.015418723225593567, + "step": 108450 + }, + { + "epoch": 15.395315826827538, + "grad_norm": 3.9857535362243652, + "learning_rate": 8.461121362668559e-05, + "loss": 0.035864454507827756, + "step": 108460 + }, + { + "epoch": 15.396735273243435, + "grad_norm": 1.131729245185852, + "learning_rate": 8.46097941802697e-05, + "loss": 0.017155754566192626, + "step": 108470 + }, + { + "epoch": 15.398154719659333, + "grad_norm": 1.1946663856506348, + "learning_rate": 8.46083747338538e-05, + "loss": 0.0766279399394989, + "step": 108480 + }, + { + "epoch": 15.399574166075231, + "grad_norm": 3.973695993423462, + "learning_rate": 8.460695528743791e-05, + "loss": 0.012332093715667725, + "step": 108490 + }, + { + "epoch": 15.400993612491128, + "grad_norm": 7.021789073944092, + "learning_rate": 8.4605535841022e-05, + "loss": 0.020251087844371796, + "step": 108500 + }, + { + "epoch": 15.400993612491128, + "eval_accuracy": 0.9862020728683156, + "eval_loss": 0.04970347508788109, + "eval_runtime": 31.7417, + "eval_samples_per_second": 495.469, + "eval_steps_per_second": 15.5, + "step": 108500 + }, + { + "epoch": 15.402413058907026, + "grad_norm": 0.031689513474702835, + "learning_rate": 8.46041163946061e-05, + "loss": 0.02430199831724167, + "step": 108510 + }, + { + "epoch": 15.403832505322924, + "grad_norm": 0.1859818547964096, + "learning_rate": 8.460269694819021e-05, + "loss": 
0.021585284173488616, + "step": 108520 + }, + { + "epoch": 15.405251951738823, + "grad_norm": 0.9980630278587341, + "learning_rate": 8.460127750177431e-05, + "loss": 0.0029014710336923598, + "step": 108530 + }, + { + "epoch": 15.40667139815472, + "grad_norm": 0.061858199536800385, + "learning_rate": 8.459985805535842e-05, + "loss": 0.012076663225889206, + "step": 108540 + }, + { + "epoch": 15.408090844570618, + "grad_norm": 7.55286979675293, + "learning_rate": 8.459843860894252e-05, + "loss": 0.016109907627105714, + "step": 108550 + }, + { + "epoch": 15.409510290986516, + "grad_norm": 0.013273878023028374, + "learning_rate": 8.459701916252662e-05, + "loss": 0.025737404823303223, + "step": 108560 + }, + { + "epoch": 15.410929737402412, + "grad_norm": 0.9089930057525635, + "learning_rate": 8.459559971611071e-05, + "loss": 0.019002526998519897, + "step": 108570 + }, + { + "epoch": 15.41234918381831, + "grad_norm": 0.5245806574821472, + "learning_rate": 8.459418026969483e-05, + "loss": 0.015612088143825531, + "step": 108580 + }, + { + "epoch": 15.413768630234209, + "grad_norm": 3.1340649127960205, + "learning_rate": 8.459276082327892e-05, + "loss": 0.015143591165542602, + "step": 108590 + }, + { + "epoch": 15.415188076650107, + "grad_norm": 0.5625861287117004, + "learning_rate": 8.459134137686303e-05, + "loss": 0.014427962899208068, + "step": 108600 + }, + { + "epoch": 15.416607523066004, + "grad_norm": 0.2291407436132431, + "learning_rate": 8.458992193044713e-05, + "loss": 0.03853601813316345, + "step": 108610 + }, + { + "epoch": 15.418026969481902, + "grad_norm": 0.009141476824879646, + "learning_rate": 8.458850248403123e-05, + "loss": 0.005736962333321572, + "step": 108620 + }, + { + "epoch": 15.4194464158978, + "grad_norm": 0.04585874825716019, + "learning_rate": 8.458708303761534e-05, + "loss": 0.030605682730674745, + "step": 108630 + }, + { + "epoch": 15.420865862313697, + "grad_norm": 0.16862089931964874, + "learning_rate": 8.458566359119944e-05, + "loss": 
0.03745960295200348, + "step": 108640 + }, + { + "epoch": 15.422285308729595, + "grad_norm": 0.052765995264053345, + "learning_rate": 8.458424414478355e-05, + "loss": 0.01019173339009285, + "step": 108650 + }, + { + "epoch": 15.423704755145494, + "grad_norm": 0.11879035085439682, + "learning_rate": 8.458282469836763e-05, + "loss": 0.002845485135912895, + "step": 108660 + }, + { + "epoch": 15.425124201561392, + "grad_norm": 8.916354179382324, + "learning_rate": 8.458140525195174e-05, + "loss": 0.023224112391471863, + "step": 108670 + }, + { + "epoch": 15.426543647977288, + "grad_norm": 1.4591403007507324, + "learning_rate": 8.457998580553584e-05, + "loss": 0.024321596324443816, + "step": 108680 + }, + { + "epoch": 15.427963094393187, + "grad_norm": 1.928444743156433, + "learning_rate": 8.457856635911995e-05, + "loss": 0.01079963445663452, + "step": 108690 + }, + { + "epoch": 15.429382540809085, + "grad_norm": 2.116582155227661, + "learning_rate": 8.457714691270405e-05, + "loss": 0.0035361595451831817, + "step": 108700 + }, + { + "epoch": 15.430801987224982, + "grad_norm": 0.03351214528083801, + "learning_rate": 8.457572746628816e-05, + "loss": 0.03481769263744354, + "step": 108710 + }, + { + "epoch": 15.43222143364088, + "grad_norm": 0.4059963822364807, + "learning_rate": 8.457430801987226e-05, + "loss": 0.0201575830578804, + "step": 108720 + }, + { + "epoch": 15.433640880056778, + "grad_norm": 0.11144604533910751, + "learning_rate": 8.457288857345635e-05, + "loss": 0.02286584973335266, + "step": 108730 + }, + { + "epoch": 15.435060326472676, + "grad_norm": 0.06553922593593597, + "learning_rate": 8.457146912704046e-05, + "loss": 0.0369859516620636, + "step": 108740 + }, + { + "epoch": 15.436479772888573, + "grad_norm": 14.971281051635742, + "learning_rate": 8.457004968062456e-05, + "loss": 0.06031980514526367, + "step": 108750 + }, + { + "epoch": 15.437899219304471, + "grad_norm": 9.861590385437012, + "learning_rate": 8.456863023420867e-05, + "loss": 
0.03545091152191162, + "step": 108760 + }, + { + "epoch": 15.43931866572037, + "grad_norm": 0.8584105372428894, + "learning_rate": 8.456721078779276e-05, + "loss": 0.02768990397453308, + "step": 108770 + }, + { + "epoch": 15.440738112136266, + "grad_norm": 0.7349197864532471, + "learning_rate": 8.456579134137687e-05, + "loss": 0.04703467786312103, + "step": 108780 + }, + { + "epoch": 15.442157558552164, + "grad_norm": 1.0171449184417725, + "learning_rate": 8.456437189496097e-05, + "loss": 0.008258529007434845, + "step": 108790 + }, + { + "epoch": 15.443577004968063, + "grad_norm": 0.018382063135504723, + "learning_rate": 8.456295244854508e-05, + "loss": 0.01452644169330597, + "step": 108800 + }, + { + "epoch": 15.444996451383961, + "grad_norm": 12.21877384185791, + "learning_rate": 8.456153300212917e-05, + "loss": 0.0642141580581665, + "step": 108810 + }, + { + "epoch": 15.446415897799858, + "grad_norm": 13.660399436950684, + "learning_rate": 8.456011355571327e-05, + "loss": 0.0224165216088295, + "step": 108820 + }, + { + "epoch": 15.447835344215756, + "grad_norm": 1.5048636198043823, + "learning_rate": 8.455869410929738e-05, + "loss": 0.049782159924507144, + "step": 108830 + }, + { + "epoch": 15.449254790631654, + "grad_norm": 0.023611735552549362, + "learning_rate": 8.455727466288148e-05, + "loss": 0.04725245237350464, + "step": 108840 + }, + { + "epoch": 15.45067423704755, + "grad_norm": 3.5750725269317627, + "learning_rate": 8.455585521646559e-05, + "loss": 0.00850258320569992, + "step": 108850 + }, + { + "epoch": 15.452093683463449, + "grad_norm": 0.11319718509912491, + "learning_rate": 8.455443577004969e-05, + "loss": 0.009740934520959855, + "step": 108860 + }, + { + "epoch": 15.453513129879347, + "grad_norm": 0.16189701855182648, + "learning_rate": 8.455301632363378e-05, + "loss": 0.01002160757780075, + "step": 108870 + }, + { + "epoch": 15.454932576295246, + "grad_norm": 0.027291517704725266, + "learning_rate": 8.455159687721788e-05, + "loss": 
0.020919431746006013, + "step": 108880 + }, + { + "epoch": 15.456352022711142, + "grad_norm": 0.10434415936470032, + "learning_rate": 8.455017743080199e-05, + "loss": 0.03278235793113708, + "step": 108890 + }, + { + "epoch": 15.45777146912704, + "grad_norm": 0.10862031579017639, + "learning_rate": 8.454875798438609e-05, + "loss": 0.013124911487102509, + "step": 108900 + }, + { + "epoch": 15.459190915542939, + "grad_norm": 0.21281233429908752, + "learning_rate": 8.45473385379702e-05, + "loss": 0.04097933173179626, + "step": 108910 + }, + { + "epoch": 15.460610361958835, + "grad_norm": 0.7787414193153381, + "learning_rate": 8.45459190915543e-05, + "loss": 0.02274901270866394, + "step": 108920 + }, + { + "epoch": 15.462029808374734, + "grad_norm": 2.227569818496704, + "learning_rate": 8.45444996451384e-05, + "loss": 0.012134695053100586, + "step": 108930 + }, + { + "epoch": 15.463449254790632, + "grad_norm": 0.4353095591068268, + "learning_rate": 8.454308019872251e-05, + "loss": 0.017594021558761597, + "step": 108940 + }, + { + "epoch": 15.46486870120653, + "grad_norm": 0.0655631497502327, + "learning_rate": 8.45416607523066e-05, + "loss": 0.015526409447193145, + "step": 108950 + }, + { + "epoch": 15.466288147622427, + "grad_norm": 0.017096268013119698, + "learning_rate": 8.454024130589072e-05, + "loss": 0.0432012140750885, + "step": 108960 + }, + { + "epoch": 15.467707594038325, + "grad_norm": 4.582834720611572, + "learning_rate": 8.45388218594748e-05, + "loss": 0.02698971927165985, + "step": 108970 + }, + { + "epoch": 15.469127040454223, + "grad_norm": 0.517230749130249, + "learning_rate": 8.453740241305891e-05, + "loss": 0.052729862928390506, + "step": 108980 + }, + { + "epoch": 15.47054648687012, + "grad_norm": 0.03594077005982399, + "learning_rate": 8.453598296664301e-05, + "loss": 0.014178904891014098, + "step": 108990 + }, + { + "epoch": 15.471965933286018, + "grad_norm": 3.0221614837646484, + "learning_rate": 8.453456352022712e-05, + "loss": 
0.02753186821937561, + "step": 109000 + }, + { + "epoch": 15.471965933286018, + "eval_accuracy": 0.9848032046798499, + "eval_loss": 0.05491510033607483, + "eval_runtime": 32.6115, + "eval_samples_per_second": 482.254, + "eval_steps_per_second": 15.087, + "step": 109000 + }, + { + "epoch": 15.473385379701917, + "grad_norm": 0.011246667243540287, + "learning_rate": 8.453314407381122e-05, + "loss": 0.011115138232707978, + "step": 109010 + }, + { + "epoch": 15.474804826117815, + "grad_norm": 0.08684574067592621, + "learning_rate": 8.453172462739531e-05, + "loss": 0.014247065782546997, + "step": 109020 + }, + { + "epoch": 15.476224272533711, + "grad_norm": 6.032614231109619, + "learning_rate": 8.453030518097942e-05, + "loss": 0.02520335018634796, + "step": 109030 + }, + { + "epoch": 15.47764371894961, + "grad_norm": 0.05028625950217247, + "learning_rate": 8.452888573456352e-05, + "loss": 0.018765005469322204, + "step": 109040 + }, + { + "epoch": 15.479063165365508, + "grad_norm": 0.1101774200797081, + "learning_rate": 8.452746628814763e-05, + "loss": 0.06100413203239441, + "step": 109050 + }, + { + "epoch": 15.480482611781405, + "grad_norm": 0.01917419768869877, + "learning_rate": 8.452604684173173e-05, + "loss": 0.023857808113098143, + "step": 109060 + }, + { + "epoch": 15.481902058197303, + "grad_norm": 0.9928895235061646, + "learning_rate": 8.452462739531584e-05, + "loss": 0.03471379578113556, + "step": 109070 + }, + { + "epoch": 15.483321504613201, + "grad_norm": 0.9378594160079956, + "learning_rate": 8.452320794889992e-05, + "loss": 0.04051099121570587, + "step": 109080 + }, + { + "epoch": 15.4847409510291, + "grad_norm": 0.3512844443321228, + "learning_rate": 8.452178850248404e-05, + "loss": 0.026929941773414613, + "step": 109090 + }, + { + "epoch": 15.486160397444996, + "grad_norm": 0.055643901228904724, + "learning_rate": 8.452036905606813e-05, + "loss": 0.04442196190357208, + "step": 109100 + }, + { + "epoch": 15.487579843860894, + "grad_norm": 
0.3258360028266907, + "learning_rate": 8.451894960965224e-05, + "loss": 0.006144022569060326, + "step": 109110 + }, + { + "epoch": 15.488999290276793, + "grad_norm": 0.4137914180755615, + "learning_rate": 8.451753016323634e-05, + "loss": 0.010329674184322356, + "step": 109120 + }, + { + "epoch": 15.490418736692689, + "grad_norm": 4.536850929260254, + "learning_rate": 8.451625266146204e-05, + "loss": 0.08604642748832703, + "step": 109130 + }, + { + "epoch": 15.491838183108587, + "grad_norm": 3.1179816722869873, + "learning_rate": 8.451483321504614e-05, + "loss": 0.010609415918588638, + "step": 109140 + }, + { + "epoch": 15.493257629524486, + "grad_norm": 3.4988842010498047, + "learning_rate": 8.451341376863023e-05, + "loss": 0.023094484210014345, + "step": 109150 + }, + { + "epoch": 15.494677075940384, + "grad_norm": 0.053110271692276, + "learning_rate": 8.451199432221433e-05, + "loss": 0.02963247299194336, + "step": 109160 + }, + { + "epoch": 15.49609652235628, + "grad_norm": 8.496163368225098, + "learning_rate": 8.451057487579844e-05, + "loss": 0.008166144788265228, + "step": 109170 + }, + { + "epoch": 15.497515968772179, + "grad_norm": 0.19316455721855164, + "learning_rate": 8.450915542938255e-05, + "loss": 0.008004320412874221, + "step": 109180 + }, + { + "epoch": 15.498935415188077, + "grad_norm": 0.49412721395492554, + "learning_rate": 8.450773598296665e-05, + "loss": 0.014553853869438171, + "step": 109190 + }, + { + "epoch": 15.500354861603974, + "grad_norm": 8.566283226013184, + "learning_rate": 8.450631653655075e-05, + "loss": 0.027910608053207397, + "step": 109200 + }, + { + "epoch": 15.501774308019872, + "grad_norm": 7.212894916534424, + "learning_rate": 8.450489709013485e-05, + "loss": 0.040905225276947024, + "step": 109210 + }, + { + "epoch": 15.50319375443577, + "grad_norm": 8.938632011413574, + "learning_rate": 8.450347764371896e-05, + "loss": 0.009672276675701141, + "step": 109220 + }, + { + "epoch": 15.504613200851669, + "grad_norm": 
2.8423256874084473, + "learning_rate": 8.450205819730305e-05, + "loss": 0.02199479639530182, + "step": 109230 + }, + { + "epoch": 15.506032647267565, + "grad_norm": 0.3441467881202698, + "learning_rate": 8.450063875088717e-05, + "loss": 0.006689305603504181, + "step": 109240 + }, + { + "epoch": 15.507452093683463, + "grad_norm": 0.24933524429798126, + "learning_rate": 8.449921930447125e-05, + "loss": 0.04364819526672363, + "step": 109250 + }, + { + "epoch": 15.508871540099362, + "grad_norm": 6.487727165222168, + "learning_rate": 8.449779985805536e-05, + "loss": 0.0283764511346817, + "step": 109260 + }, + { + "epoch": 15.510290986515258, + "grad_norm": 0.009871330112218857, + "learning_rate": 8.449638041163947e-05, + "loss": 0.04828583300113678, + "step": 109270 + }, + { + "epoch": 15.511710432931157, + "grad_norm": 9.64976978302002, + "learning_rate": 8.449496096522357e-05, + "loss": 0.04197643399238586, + "step": 109280 + }, + { + "epoch": 15.513129879347055, + "grad_norm": 1.2106422185897827, + "learning_rate": 8.449354151880768e-05, + "loss": 0.013595214486122132, + "step": 109290 + }, + { + "epoch": 15.514549325762953, + "grad_norm": 4.21859073638916, + "learning_rate": 8.449212207239176e-05, + "loss": 0.07362929582595826, + "step": 109300 + }, + { + "epoch": 15.51596877217885, + "grad_norm": 0.23973767459392548, + "learning_rate": 8.449070262597587e-05, + "loss": 0.014032267034053802, + "step": 109310 + }, + { + "epoch": 15.517388218594748, + "grad_norm": 0.07020927220582962, + "learning_rate": 8.448928317955997e-05, + "loss": 0.03569666743278503, + "step": 109320 + }, + { + "epoch": 15.518807665010646, + "grad_norm": 0.3164190649986267, + "learning_rate": 8.448786373314408e-05, + "loss": 0.014000938832759857, + "step": 109330 + }, + { + "epoch": 15.520227111426543, + "grad_norm": 7.998960018157959, + "learning_rate": 8.448644428672818e-05, + "loss": 0.015060046315193176, + "step": 109340 + }, + { + "epoch": 15.521646557842441, + "grad_norm": 
2.536862373352051, + "learning_rate": 8.448502484031228e-05, + "loss": 0.004766803607344628, + "step": 109350 + }, + { + "epoch": 15.52306600425834, + "grad_norm": 3.9543192386627197, + "learning_rate": 8.448360539389639e-05, + "loss": 0.00784388929605484, + "step": 109360 + }, + { + "epoch": 15.524485450674238, + "grad_norm": 1.2425618171691895, + "learning_rate": 8.448218594748049e-05, + "loss": 0.040623527765274045, + "step": 109370 + }, + { + "epoch": 15.525904897090134, + "grad_norm": 0.3892207741737366, + "learning_rate": 8.44807665010646e-05, + "loss": 0.021851207315921783, + "step": 109380 + }, + { + "epoch": 15.527324343506033, + "grad_norm": 12.130237579345703, + "learning_rate": 8.44793470546487e-05, + "loss": 0.05706889033317566, + "step": 109390 + }, + { + "epoch": 15.528743789921931, + "grad_norm": 4.82382345199585, + "learning_rate": 8.44779276082328e-05, + "loss": 0.04465123414993286, + "step": 109400 + }, + { + "epoch": 15.530163236337827, + "grad_norm": 8.075292587280273, + "learning_rate": 8.447650816181689e-05, + "loss": 0.012319304049015045, + "step": 109410 + }, + { + "epoch": 15.531582682753726, + "grad_norm": 1.7110710144042969, + "learning_rate": 8.4475088715401e-05, + "loss": 0.03167652189731598, + "step": 109420 + }, + { + "epoch": 15.533002129169624, + "grad_norm": 0.03574613481760025, + "learning_rate": 8.44736692689851e-05, + "loss": 0.004731189832091332, + "step": 109430 + }, + { + "epoch": 15.534421575585522, + "grad_norm": 4.071065902709961, + "learning_rate": 8.447224982256921e-05, + "loss": 0.03880989253520965, + "step": 109440 + }, + { + "epoch": 15.535841022001419, + "grad_norm": 0.3194302022457123, + "learning_rate": 8.44708303761533e-05, + "loss": 0.009817516803741455, + "step": 109450 + }, + { + "epoch": 15.537260468417317, + "grad_norm": 0.6993071436882019, + "learning_rate": 8.44694109297374e-05, + "loss": 0.023785218596458435, + "step": 109460 + }, + { + "epoch": 15.538679914833216, + "grad_norm": 0.22547049820423126, + 
"learning_rate": 8.446799148332151e-05, + "loss": 0.004621018841862679, + "step": 109470 + }, + { + "epoch": 15.540099361249112, + "grad_norm": 5.650206089019775, + "learning_rate": 8.446657203690561e-05, + "loss": 0.021305915713310242, + "step": 109480 + }, + { + "epoch": 15.54151880766501, + "grad_norm": 0.619448184967041, + "learning_rate": 8.446515259048972e-05, + "loss": 0.011055320501327515, + "step": 109490 + }, + { + "epoch": 15.542938254080909, + "grad_norm": 0.01749510131776333, + "learning_rate": 8.446373314407382e-05, + "loss": 0.028083550930023193, + "step": 109500 + }, + { + "epoch": 15.542938254080909, + "eval_accuracy": 0.9842945253385896, + "eval_loss": 0.05707499384880066, + "eval_runtime": 33.3307, + "eval_samples_per_second": 471.847, + "eval_steps_per_second": 14.761, + "step": 109500 + }, + { + "epoch": 15.544357700496807, + "grad_norm": 0.015825387090444565, + "learning_rate": 8.446231369765792e-05, + "loss": 0.013564802706241608, + "step": 109510 + }, + { + "epoch": 15.545777146912704, + "grad_norm": 0.11282722651958466, + "learning_rate": 8.446089425124201e-05, + "loss": 0.010410679876804352, + "step": 109520 + }, + { + "epoch": 15.547196593328602, + "grad_norm": 0.02192036435008049, + "learning_rate": 8.445947480482612e-05, + "loss": 0.03194921612739563, + "step": 109530 + }, + { + "epoch": 15.5486160397445, + "grad_norm": 0.06405551731586456, + "learning_rate": 8.445805535841022e-05, + "loss": 0.024281325936317443, + "step": 109540 + }, + { + "epoch": 15.550035486160397, + "grad_norm": 0.48121848702430725, + "learning_rate": 8.445663591199433e-05, + "loss": 0.01762586086988449, + "step": 109550 + }, + { + "epoch": 15.551454932576295, + "grad_norm": 0.06481608003377914, + "learning_rate": 8.445521646557843e-05, + "loss": 0.057471299171447755, + "step": 109560 + }, + { + "epoch": 15.552874378992193, + "grad_norm": 0.8036059141159058, + "learning_rate": 8.445379701916253e-05, + "loss": 0.007377585023641586, + "step": 109570 + }, + { + 
"epoch": 15.554293825408092, + "grad_norm": 6.7593207359313965, + "learning_rate": 8.445237757274664e-05, + "loss": 0.044133511185646054, + "step": 109580 + }, + { + "epoch": 15.555713271823988, + "grad_norm": 6.9386749267578125, + "learning_rate": 8.445095812633074e-05, + "loss": 0.023405832052230836, + "step": 109590 + }, + { + "epoch": 15.557132718239886, + "grad_norm": 0.21647658944129944, + "learning_rate": 8.444953867991485e-05, + "loss": 0.021339884400367735, + "step": 109600 + }, + { + "epoch": 15.558552164655785, + "grad_norm": 1.1516494750976562, + "learning_rate": 8.444811923349893e-05, + "loss": 0.011148992180824279, + "step": 109610 + }, + { + "epoch": 15.559971611071681, + "grad_norm": 1.6991609334945679, + "learning_rate": 8.444669978708304e-05, + "loss": 0.004113087803125382, + "step": 109620 + }, + { + "epoch": 15.56139105748758, + "grad_norm": 0.2822941839694977, + "learning_rate": 8.444528034066714e-05, + "loss": 0.0026031706482172014, + "step": 109630 + }, + { + "epoch": 15.562810503903478, + "grad_norm": 0.015090609900653362, + "learning_rate": 8.444386089425125e-05, + "loss": 0.039816674590110776, + "step": 109640 + }, + { + "epoch": 15.564229950319376, + "grad_norm": 0.018448730930685997, + "learning_rate": 8.444244144783535e-05, + "loss": 0.04701717495918274, + "step": 109650 + }, + { + "epoch": 15.565649396735273, + "grad_norm": 0.252054899930954, + "learning_rate": 8.444102200141944e-05, + "loss": 0.03242330551147461, + "step": 109660 + }, + { + "epoch": 15.567068843151171, + "grad_norm": 0.38011518120765686, + "learning_rate": 8.443960255500356e-05, + "loss": 0.017498777806758882, + "step": 109670 + }, + { + "epoch": 15.56848828956707, + "grad_norm": 7.523592948913574, + "learning_rate": 8.443818310858765e-05, + "loss": 0.009059159457683564, + "step": 109680 + }, + { + "epoch": 15.569907735982966, + "grad_norm": 0.7460406422615051, + "learning_rate": 8.443676366217176e-05, + "loss": 0.010137155652046204, + "step": 109690 + }, + { + 
"epoch": 15.571327182398864, + "grad_norm": 8.281078338623047, + "learning_rate": 8.443534421575586e-05, + "loss": 0.05115741491317749, + "step": 109700 + }, + { + "epoch": 15.572746628814762, + "grad_norm": 5.468574047088623, + "learning_rate": 8.443392476933996e-05, + "loss": 0.06222133040428161, + "step": 109710 + }, + { + "epoch": 15.57416607523066, + "grad_norm": 4.441808223724365, + "learning_rate": 8.443250532292406e-05, + "loss": 0.02200213372707367, + "step": 109720 + }, + { + "epoch": 15.575585521646557, + "grad_norm": 1.1405576467514038, + "learning_rate": 8.443108587650817e-05, + "loss": 0.018508346378803255, + "step": 109730 + }, + { + "epoch": 15.577004968062456, + "grad_norm": 9.524807929992676, + "learning_rate": 8.442966643009226e-05, + "loss": 0.08532507419586181, + "step": 109740 + }, + { + "epoch": 15.578424414478354, + "grad_norm": 3.8635847568511963, + "learning_rate": 8.442824698367638e-05, + "loss": 0.018411895632743834, + "step": 109750 + }, + { + "epoch": 15.57984386089425, + "grad_norm": 0.13418318331241608, + "learning_rate": 8.442682753726047e-05, + "loss": 0.007721404731273651, + "step": 109760 + }, + { + "epoch": 15.581263307310149, + "grad_norm": 0.04814405366778374, + "learning_rate": 8.442540809084457e-05, + "loss": 0.03254770040512085, + "step": 109770 + }, + { + "epoch": 15.582682753726047, + "grad_norm": 0.1322525292634964, + "learning_rate": 8.442398864442868e-05, + "loss": 0.04441567361354828, + "step": 109780 + }, + { + "epoch": 15.584102200141945, + "grad_norm": 0.26600533723831177, + "learning_rate": 8.442256919801278e-05, + "loss": 0.015367360413074493, + "step": 109790 + }, + { + "epoch": 15.585521646557842, + "grad_norm": 0.045342884957790375, + "learning_rate": 8.442114975159689e-05, + "loss": 0.043045997619628906, + "step": 109800 + }, + { + "epoch": 15.58694109297374, + "grad_norm": 0.2467890977859497, + "learning_rate": 8.441973030518099e-05, + "loss": 0.003971902281045913, + "step": 109810 + }, + { + "epoch": 
15.588360539389639, + "grad_norm": 0.08323675394058228, + "learning_rate": 8.441831085876508e-05, + "loss": 0.007375334948301315, + "step": 109820 + }, + { + "epoch": 15.589779985805535, + "grad_norm": 1.4727420806884766, + "learning_rate": 8.441689141234918e-05, + "loss": 0.04179688096046448, + "step": 109830 + }, + { + "epoch": 15.591199432221433, + "grad_norm": 13.468354225158691, + "learning_rate": 8.441547196593329e-05, + "loss": 0.09071345329284668, + "step": 109840 + }, + { + "epoch": 15.592618878637332, + "grad_norm": 15.406740188598633, + "learning_rate": 8.441405251951739e-05, + "loss": 0.04422733187675476, + "step": 109850 + }, + { + "epoch": 15.59403832505323, + "grad_norm": 0.10879992693662643, + "learning_rate": 8.44126330731015e-05, + "loss": 0.06023339033126831, + "step": 109860 + }, + { + "epoch": 15.595457771469126, + "grad_norm": 8.273409843444824, + "learning_rate": 8.44112136266856e-05, + "loss": 0.04545511603355408, + "step": 109870 + }, + { + "epoch": 15.596877217885025, + "grad_norm": 0.15698932111263275, + "learning_rate": 8.44097941802697e-05, + "loss": 0.021031519770622252, + "step": 109880 + }, + { + "epoch": 15.598296664300923, + "grad_norm": 1.2806004285812378, + "learning_rate": 8.44083747338538e-05, + "loss": 0.03998381495475769, + "step": 109890 + }, + { + "epoch": 15.59971611071682, + "grad_norm": 2.8216757774353027, + "learning_rate": 8.44069552874379e-05, + "loss": 0.04080126881599426, + "step": 109900 + }, + { + "epoch": 15.601135557132718, + "grad_norm": 0.4584507942199707, + "learning_rate": 8.440553584102201e-05, + "loss": 0.042827948927879333, + "step": 109910 + }, + { + "epoch": 15.602555003548616, + "grad_norm": 10.623929977416992, + "learning_rate": 8.44041163946061e-05, + "loss": 0.029111909866333007, + "step": 109920 + }, + { + "epoch": 15.603974449964515, + "grad_norm": 0.0433068685233593, + "learning_rate": 8.440269694819021e-05, + "loss": 0.027372494339942932, + "step": 109930 + }, + { + "epoch": 15.605393896380411, 
+ "grad_norm": 1.3678462505340576, + "learning_rate": 8.44012775017743e-05, + "loss": 0.03315771222114563, + "step": 109940 + }, + { + "epoch": 15.60681334279631, + "grad_norm": 0.028468577191233635, + "learning_rate": 8.439985805535842e-05, + "loss": 0.03360509574413299, + "step": 109950 + }, + { + "epoch": 15.608232789212208, + "grad_norm": 11.046137809753418, + "learning_rate": 8.439843860894251e-05, + "loss": 0.061426812410354616, + "step": 109960 + }, + { + "epoch": 15.609652235628104, + "grad_norm": 0.8080819249153137, + "learning_rate": 8.439701916252661e-05, + "loss": 0.025454476475715637, + "step": 109970 + }, + { + "epoch": 15.611071682044003, + "grad_norm": 3.136892557144165, + "learning_rate": 8.439559971611072e-05, + "loss": 0.06736152172088623, + "step": 109980 + }, + { + "epoch": 15.6124911284599, + "grad_norm": 0.2572226822376251, + "learning_rate": 8.439418026969482e-05, + "loss": 0.028138145804405212, + "step": 109990 + }, + { + "epoch": 15.6139105748758, + "grad_norm": 3.7471795082092285, + "learning_rate": 8.439276082327893e-05, + "loss": 0.020710088312625885, + "step": 110000 + }, + { + "epoch": 15.6139105748758, + "eval_accuracy": 0.981941883385261, + "eval_loss": 0.059561554342508316, + "eval_runtime": 32.3129, + "eval_samples_per_second": 486.71, + "eval_steps_per_second": 15.226, + "step": 110000 + }, + { + "epoch": 15.615330021291696, + "grad_norm": 1.7016760110855103, + "learning_rate": 8.439134137686303e-05, + "loss": 0.027257269620895384, + "step": 110010 + }, + { + "epoch": 15.616749467707594, + "grad_norm": 7.8672380447387695, + "learning_rate": 8.438992193044713e-05, + "loss": 0.03369447588920593, + "step": 110020 + }, + { + "epoch": 15.618168914123492, + "grad_norm": 16.262685775756836, + "learning_rate": 8.438850248403122e-05, + "loss": 0.06224566698074341, + "step": 110030 + }, + { + "epoch": 15.619588360539389, + "grad_norm": 0.527633786201477, + "learning_rate": 8.438708303761533e-05, + "loss": 0.041495251655578616, + "step": 
110040 + }, + { + "epoch": 15.621007806955287, + "grad_norm": 1.2803436517715454, + "learning_rate": 8.438566359119943e-05, + "loss": 0.004190302640199661, + "step": 110050 + }, + { + "epoch": 15.622427253371185, + "grad_norm": 0.2544032335281372, + "learning_rate": 8.438424414478354e-05, + "loss": 0.024184998869895936, + "step": 110060 + }, + { + "epoch": 15.623846699787084, + "grad_norm": 0.30951157212257385, + "learning_rate": 8.438282469836764e-05, + "loss": 0.0079819455742836, + "step": 110070 + }, + { + "epoch": 15.62526614620298, + "grad_norm": 0.08560092002153397, + "learning_rate": 8.438140525195174e-05, + "loss": 0.01343330293893814, + "step": 110080 + }, + { + "epoch": 15.626685592618879, + "grad_norm": 0.5142914056777954, + "learning_rate": 8.437998580553585e-05, + "loss": 0.009387575834989548, + "step": 110090 + }, + { + "epoch": 15.628105039034777, + "grad_norm": 0.1999034583568573, + "learning_rate": 8.437856635911995e-05, + "loss": 0.015430738031864167, + "step": 110100 + }, + { + "epoch": 15.629524485450673, + "grad_norm": 8.752143859863281, + "learning_rate": 8.437714691270406e-05, + "loss": 0.01040157824754715, + "step": 110110 + }, + { + "epoch": 15.630943931866572, + "grad_norm": 7.477208614349365, + "learning_rate": 8.437572746628815e-05, + "loss": 0.015335509181022644, + "step": 110120 + }, + { + "epoch": 15.63236337828247, + "grad_norm": 0.04349973425269127, + "learning_rate": 8.437430801987225e-05, + "loss": 0.014721313118934631, + "step": 110130 + }, + { + "epoch": 15.633782824698368, + "grad_norm": 1.6337802410125732, + "learning_rate": 8.437288857345635e-05, + "loss": 0.015643975138664244, + "step": 110140 + }, + { + "epoch": 15.635202271114265, + "grad_norm": 0.07937774807214737, + "learning_rate": 8.437146912704046e-05, + "loss": 0.020302596688270568, + "step": 110150 + }, + { + "epoch": 15.636621717530163, + "grad_norm": 6.785717487335205, + "learning_rate": 8.437004968062456e-05, + "loss": 0.02386097311973572, + "step": 110160 + }, + 
{ + "epoch": 15.638041163946061, + "grad_norm": 3.58707594871521, + "learning_rate": 8.436863023420867e-05, + "loss": 0.004123737290501595, + "step": 110170 + }, + { + "epoch": 15.639460610361958, + "grad_norm": 0.0928313136100769, + "learning_rate": 8.436721078779277e-05, + "loss": 0.03107512891292572, + "step": 110180 + }, + { + "epoch": 15.640880056777856, + "grad_norm": 6.510340213775635, + "learning_rate": 8.436579134137686e-05, + "loss": 0.01583448201417923, + "step": 110190 + }, + { + "epoch": 15.642299503193755, + "grad_norm": 0.04997854679822922, + "learning_rate": 8.436437189496097e-05, + "loss": 0.0402520090341568, + "step": 110200 + }, + { + "epoch": 15.643718949609653, + "grad_norm": 2.4349265098571777, + "learning_rate": 8.436295244854507e-05, + "loss": 0.012507960200309753, + "step": 110210 + }, + { + "epoch": 15.64513839602555, + "grad_norm": 0.037816863507032394, + "learning_rate": 8.436153300212918e-05, + "loss": 0.007110661268234253, + "step": 110220 + }, + { + "epoch": 15.646557842441448, + "grad_norm": 0.1295090615749359, + "learning_rate": 8.436011355571327e-05, + "loss": 0.037706056237220766, + "step": 110230 + }, + { + "epoch": 15.647977288857346, + "grad_norm": 0.10089115798473358, + "learning_rate": 8.435869410929738e-05, + "loss": 0.02398446500301361, + "step": 110240 + }, + { + "epoch": 15.649396735273243, + "grad_norm": 3.9181690216064453, + "learning_rate": 8.435727466288147e-05, + "loss": 0.03424106240272522, + "step": 110250 + }, + { + "epoch": 15.650816181689141, + "grad_norm": 0.01480263751000166, + "learning_rate": 8.435585521646559e-05, + "loss": 0.014413505792617798, + "step": 110260 + }, + { + "epoch": 15.65223562810504, + "grad_norm": 0.007548884954303503, + "learning_rate": 8.435443577004968e-05, + "loss": 0.041290727257728574, + "step": 110270 + }, + { + "epoch": 15.653655074520938, + "grad_norm": 0.3037410378456116, + "learning_rate": 8.435301632363378e-05, + "loss": 0.004957319423556328, + "step": 110280 + }, + { + 
"epoch": 15.655074520936834, + "grad_norm": 0.1113152801990509, + "learning_rate": 8.435159687721789e-05, + "loss": 0.019102156162261963, + "step": 110290 + }, + { + "epoch": 15.656493967352732, + "grad_norm": 0.6818870902061462, + "learning_rate": 8.435017743080199e-05, + "loss": 0.0033379919826984406, + "step": 110300 + }, + { + "epoch": 15.65791341376863, + "grad_norm": 0.11788970977067947, + "learning_rate": 8.43487579843861e-05, + "loss": 0.022930392622947694, + "step": 110310 + }, + { + "epoch": 15.659332860184527, + "grad_norm": 1.483289122581482, + "learning_rate": 8.43473385379702e-05, + "loss": 0.012271914631128311, + "step": 110320 + }, + { + "epoch": 15.660752306600425, + "grad_norm": 0.18088115751743317, + "learning_rate": 8.43459190915543e-05, + "loss": 0.015769003331661223, + "step": 110330 + }, + { + "epoch": 15.662171753016324, + "grad_norm": 0.052908070385456085, + "learning_rate": 8.434449964513839e-05, + "loss": 0.04344092309474945, + "step": 110340 + }, + { + "epoch": 15.663591199432222, + "grad_norm": 6.1623125076293945, + "learning_rate": 8.43430801987225e-05, + "loss": 0.004051884636282921, + "step": 110350 + }, + { + "epoch": 15.665010645848119, + "grad_norm": 5.770845890045166, + "learning_rate": 8.43416607523066e-05, + "loss": 0.004904124140739441, + "step": 110360 + }, + { + "epoch": 15.666430092264017, + "grad_norm": 0.3964264988899231, + "learning_rate": 8.434024130589071e-05, + "loss": 0.007972334325313569, + "step": 110370 + }, + { + "epoch": 15.667849538679915, + "grad_norm": 0.17308181524276733, + "learning_rate": 8.433882185947481e-05, + "loss": 0.043213242292404176, + "step": 110380 + }, + { + "epoch": 15.669268985095812, + "grad_norm": 0.36493051052093506, + "learning_rate": 8.43374024130589e-05, + "loss": 0.03352370262145996, + "step": 110390 + }, + { + "epoch": 15.67068843151171, + "grad_norm": 0.02064143307507038, + "learning_rate": 8.433598296664302e-05, + "loss": 0.013609150052070617, + "step": 110400 + }, + { + "epoch": 
15.672107877927608, + "grad_norm": 8.8279390335083, + "learning_rate": 8.433456352022711e-05, + "loss": 0.017481517791748048, + "step": 110410 + }, + { + "epoch": 15.673527324343507, + "grad_norm": 3.44317364692688, + "learning_rate": 8.433314407381122e-05, + "loss": 0.05524869561195374, + "step": 110420 + }, + { + "epoch": 15.674946770759403, + "grad_norm": 0.13901682198047638, + "learning_rate": 8.433172462739531e-05, + "loss": 0.03441147804260254, + "step": 110430 + }, + { + "epoch": 15.676366217175302, + "grad_norm": 0.07394246757030487, + "learning_rate": 8.433030518097942e-05, + "loss": 0.018186067044734956, + "step": 110440 + }, + { + "epoch": 15.6777856635912, + "grad_norm": 6.032867908477783, + "learning_rate": 8.432888573456352e-05, + "loss": 0.017774075269699097, + "step": 110450 + }, + { + "epoch": 15.679205110007096, + "grad_norm": 2.218717336654663, + "learning_rate": 8.432746628814763e-05, + "loss": 0.04855410158634186, + "step": 110460 + }, + { + "epoch": 15.680624556422995, + "grad_norm": 0.3446206748485565, + "learning_rate": 8.432604684173172e-05, + "loss": 0.0166058674454689, + "step": 110470 + }, + { + "epoch": 15.682044002838893, + "grad_norm": 11.216866493225098, + "learning_rate": 8.432462739531584e-05, + "loss": 0.028833556175231933, + "step": 110480 + }, + { + "epoch": 15.683463449254791, + "grad_norm": 0.23951852321624756, + "learning_rate": 8.432320794889993e-05, + "loss": 0.043452754616737366, + "step": 110490 + }, + { + "epoch": 15.684882895670688, + "grad_norm": 9.91982650756836, + "learning_rate": 8.432178850248403e-05, + "loss": 0.04466983377933502, + "step": 110500 + }, + { + "epoch": 15.684882895670688, + "eval_accuracy": 0.9761556558784257, + "eval_loss": 0.08100114017724991, + "eval_runtime": 32.2852, + "eval_samples_per_second": 487.127, + "eval_steps_per_second": 15.239, + "step": 110500 + }, + { + "epoch": 15.686302342086586, + "grad_norm": 2.133854866027832, + "learning_rate": 8.432036905606814e-05, + "loss": 
0.04842991232872009, + "step": 110510 + }, + { + "epoch": 15.687721788502484, + "grad_norm": 3.1208901405334473, + "learning_rate": 8.431894960965224e-05, + "loss": 0.013027089834213256, + "step": 110520 + }, + { + "epoch": 15.689141234918381, + "grad_norm": 0.02794860303401947, + "learning_rate": 8.431753016323635e-05, + "loss": 0.04402931034564972, + "step": 110530 + }, + { + "epoch": 15.69056068133428, + "grad_norm": 0.09005889296531677, + "learning_rate": 8.431611071682043e-05, + "loss": 0.030572378635406496, + "step": 110540 + }, + { + "epoch": 15.691980127750178, + "grad_norm": 1.786181092262268, + "learning_rate": 8.431469127040454e-05, + "loss": 0.008524458110332488, + "step": 110550 + }, + { + "epoch": 15.693399574166076, + "grad_norm": 0.46687254309654236, + "learning_rate": 8.431327182398864e-05, + "loss": 0.035732558369636534, + "step": 110560 + }, + { + "epoch": 15.694819020581972, + "grad_norm": 1.8484487533569336, + "learning_rate": 8.431185237757275e-05, + "loss": 0.03911701440811157, + "step": 110570 + }, + { + "epoch": 15.69623846699787, + "grad_norm": 11.28540325164795, + "learning_rate": 8.431043293115686e-05, + "loss": 0.04078208804130554, + "step": 110580 + }, + { + "epoch": 15.697657913413769, + "grad_norm": 0.06755779683589935, + "learning_rate": 8.430901348474095e-05, + "loss": 0.01349416971206665, + "step": 110590 + }, + { + "epoch": 15.699077359829666, + "grad_norm": 8.60862922668457, + "learning_rate": 8.430759403832506e-05, + "loss": 0.06120396852493286, + "step": 110600 + }, + { + "epoch": 15.700496806245564, + "grad_norm": 1.085951328277588, + "learning_rate": 8.430617459190916e-05, + "loss": 0.041951301693916324, + "step": 110610 + }, + { + "epoch": 15.701916252661462, + "grad_norm": 4.475429058074951, + "learning_rate": 8.430475514549327e-05, + "loss": 0.053812021017074586, + "step": 110620 + }, + { + "epoch": 15.70333569907736, + "grad_norm": 5.355215549468994, + "learning_rate": 8.430333569907736e-05, + "loss": 
0.015490742027759552, + "step": 110630 + }, + { + "epoch": 15.704755145493257, + "grad_norm": 0.05852344259619713, + "learning_rate": 8.430191625266146e-05, + "loss": 0.002999766170978546, + "step": 110640 + }, + { + "epoch": 15.706174591909155, + "grad_norm": 5.2904767990112305, + "learning_rate": 8.430049680624556e-05, + "loss": 0.011166901141405106, + "step": 110650 + }, + { + "epoch": 15.707594038325054, + "grad_norm": 1.8793965578079224, + "learning_rate": 8.429907735982967e-05, + "loss": 0.035853844881057736, + "step": 110660 + }, + { + "epoch": 15.70901348474095, + "grad_norm": 0.28392934799194336, + "learning_rate": 8.429765791341378e-05, + "loss": 0.008197212964296341, + "step": 110670 + }, + { + "epoch": 15.710432931156848, + "grad_norm": 0.019327659159898758, + "learning_rate": 8.429623846699788e-05, + "loss": 0.02017320692539215, + "step": 110680 + }, + { + "epoch": 15.711852377572747, + "grad_norm": 0.040241993963718414, + "learning_rate": 8.429481902058198e-05, + "loss": 0.036533668637275696, + "step": 110690 + }, + { + "epoch": 15.713271823988645, + "grad_norm": 1.2111730575561523, + "learning_rate": 8.429339957416607e-05, + "loss": 0.052335488796234134, + "step": 110700 + }, + { + "epoch": 15.714691270404542, + "grad_norm": 0.16749830543994904, + "learning_rate": 8.429198012775018e-05, + "loss": 0.0016597557812929153, + "step": 110710 + }, + { + "epoch": 15.71611071682044, + "grad_norm": 0.04028880223631859, + "learning_rate": 8.429056068133428e-05, + "loss": 0.01130024939775467, + "step": 110720 + }, + { + "epoch": 15.717530163236338, + "grad_norm": 0.8011389970779419, + "learning_rate": 8.428914123491839e-05, + "loss": 0.0341774582862854, + "step": 110730 + }, + { + "epoch": 15.718949609652235, + "grad_norm": 0.1157180517911911, + "learning_rate": 8.428772178850248e-05, + "loss": 0.024665170907974245, + "step": 110740 + }, + { + "epoch": 15.720369056068133, + "grad_norm": 0.24222633242607117, + "learning_rate": 8.428630234208659e-05, + "loss": 
0.014342208206653596, + "step": 110750 + }, + { + "epoch": 15.721788502484031, + "grad_norm": 0.11375311762094498, + "learning_rate": 8.42848828956707e-05, + "loss": 0.025572729110717774, + "step": 110760 + }, + { + "epoch": 15.72320794889993, + "grad_norm": 0.2284621149301529, + "learning_rate": 8.42834634492548e-05, + "loss": 0.016583889722824097, + "step": 110770 + }, + { + "epoch": 15.724627395315826, + "grad_norm": 2.907301425933838, + "learning_rate": 8.42820440028389e-05, + "loss": 0.014733342826366425, + "step": 110780 + }, + { + "epoch": 15.726046841731725, + "grad_norm": 4.311647415161133, + "learning_rate": 8.4280624556423e-05, + "loss": 0.044314044713974, + "step": 110790 + }, + { + "epoch": 15.727466288147623, + "grad_norm": 1.3881518840789795, + "learning_rate": 8.42792051100071e-05, + "loss": 0.008146890997886657, + "step": 110800 + }, + { + "epoch": 15.72888573456352, + "grad_norm": 0.0465969555079937, + "learning_rate": 8.42777856635912e-05, + "loss": 0.016651517152786253, + "step": 110810 + }, + { + "epoch": 15.730305180979418, + "grad_norm": 0.1810646504163742, + "learning_rate": 8.427636621717531e-05, + "loss": 0.010874331742525101, + "step": 110820 + }, + { + "epoch": 15.731724627395316, + "grad_norm": 0.049988340586423874, + "learning_rate": 8.42749467707594e-05, + "loss": 0.0026690881699323655, + "step": 110830 + }, + { + "epoch": 15.733144073811214, + "grad_norm": 0.05745243653655052, + "learning_rate": 8.427352732434352e-05, + "loss": 0.007335717976093292, + "step": 110840 + }, + { + "epoch": 15.73456352022711, + "grad_norm": 0.04251491278409958, + "learning_rate": 8.427210787792762e-05, + "loss": 0.03565714657306671, + "step": 110850 + }, + { + "epoch": 15.735982966643009, + "grad_norm": 0.5023424625396729, + "learning_rate": 8.427068843151171e-05, + "loss": 0.0396359771490097, + "step": 110860 + }, + { + "epoch": 15.737402413058907, + "grad_norm": 0.19283650815486908, + "learning_rate": 8.426926898509582e-05, + "loss": 
0.0022236768156290053, + "step": 110870 + }, + { + "epoch": 15.738821859474804, + "grad_norm": 0.23359252512454987, + "learning_rate": 8.426784953867992e-05, + "loss": 0.030733969807624818, + "step": 110880 + }, + { + "epoch": 15.740241305890702, + "grad_norm": 0.6884561777114868, + "learning_rate": 8.426643009226403e-05, + "loss": 0.03513207733631134, + "step": 110890 + }, + { + "epoch": 15.7416607523066, + "grad_norm": 4.131224632263184, + "learning_rate": 8.426501064584812e-05, + "loss": 0.004008464515209198, + "step": 110900 + }, + { + "epoch": 15.743080198722499, + "grad_norm": 0.22556258738040924, + "learning_rate": 8.426359119943223e-05, + "loss": 0.003835783526301384, + "step": 110910 + }, + { + "epoch": 15.744499645138395, + "grad_norm": 2.7751762866973877, + "learning_rate": 8.426217175301632e-05, + "loss": 0.005967815220355987, + "step": 110920 + }, + { + "epoch": 15.745919091554294, + "grad_norm": 13.616484642028809, + "learning_rate": 8.426075230660043e-05, + "loss": 0.02839590609073639, + "step": 110930 + }, + { + "epoch": 15.747338537970192, + "grad_norm": 0.7434455752372742, + "learning_rate": 8.425933286018453e-05, + "loss": 0.011851108074188233, + "step": 110940 + }, + { + "epoch": 15.748757984386089, + "grad_norm": 0.3369200527667999, + "learning_rate": 8.425791341376863e-05, + "loss": 0.04642275869846344, + "step": 110950 + }, + { + "epoch": 15.750177430801987, + "grad_norm": 14.551253318786621, + "learning_rate": 8.425649396735274e-05, + "loss": 0.07257083654403687, + "step": 110960 + }, + { + "epoch": 15.751596877217885, + "grad_norm": 9.559237480163574, + "learning_rate": 8.425507452093684e-05, + "loss": 0.03088544011116028, + "step": 110970 + }, + { + "epoch": 15.753016323633783, + "grad_norm": 0.060590412467718124, + "learning_rate": 8.425365507452095e-05, + "loss": 0.09815815687179566, + "step": 110980 + }, + { + "epoch": 15.75443577004968, + "grad_norm": 9.443406105041504, + "learning_rate": 8.425223562810505e-05, + "loss": 
0.01634849011898041, + "step": 110990 + }, + { + "epoch": 15.755855216465578, + "grad_norm": 0.03370879590511322, + "learning_rate": 8.425081618168914e-05, + "loss": 0.040988501906394956, + "step": 111000 + }, + { + "epoch": 15.755855216465578, + "eval_accuracy": 0.9864564125389458, + "eval_loss": 0.05055028945207596, + "eval_runtime": 32.712, + "eval_samples_per_second": 480.772, + "eval_steps_per_second": 15.04, + "step": 111000 + }, + { + "epoch": 15.757274662881477, + "grad_norm": 0.006649512331932783, + "learning_rate": 8.424939673527324e-05, + "loss": 0.02285851240158081, + "step": 111010 + }, + { + "epoch": 15.758694109297373, + "grad_norm": 0.03384064882993698, + "learning_rate": 8.424797728885735e-05, + "loss": 0.01740650236606598, + "step": 111020 + }, + { + "epoch": 15.760113555713271, + "grad_norm": 6.754789352416992, + "learning_rate": 8.424655784244145e-05, + "loss": 0.02038225531578064, + "step": 111030 + }, + { + "epoch": 15.76153300212917, + "grad_norm": 0.211502805352211, + "learning_rate": 8.424513839602556e-05, + "loss": 0.027649855613708495, + "step": 111040 + }, + { + "epoch": 15.762952448545068, + "grad_norm": 0.04203588888049126, + "learning_rate": 8.424371894960966e-05, + "loss": 0.0319151371717453, + "step": 111050 + }, + { + "epoch": 15.764371894960965, + "grad_norm": 0.061523132026195526, + "learning_rate": 8.424229950319375e-05, + "loss": 0.007026679813861847, + "step": 111060 + }, + { + "epoch": 15.765791341376863, + "grad_norm": 1.0526913404464722, + "learning_rate": 8.424088005677787e-05, + "loss": 0.017617282271385194, + "step": 111070 + }, + { + "epoch": 15.767210787792761, + "grad_norm": 0.545382022857666, + "learning_rate": 8.423946061036196e-05, + "loss": 0.008751662820577622, + "step": 111080 + }, + { + "epoch": 15.768630234208658, + "grad_norm": 2.2254326343536377, + "learning_rate": 8.423804116394607e-05, + "loss": 0.004927302151918412, + "step": 111090 + }, + { + "epoch": 15.770049680624556, + "grad_norm": 
0.035762522369623184, + "learning_rate": 8.423662171753016e-05, + "loss": 0.015064448118209839, + "step": 111100 + }, + { + "epoch": 15.771469127040454, + "grad_norm": 0.004056834150105715, + "learning_rate": 8.423520227111427e-05, + "loss": 0.022363266348838805, + "step": 111110 + }, + { + "epoch": 15.772888573456353, + "grad_norm": 3.803673267364502, + "learning_rate": 8.423378282469837e-05, + "loss": 0.025861644744873048, + "step": 111120 + }, + { + "epoch": 15.77430801987225, + "grad_norm": 1.0381547212600708, + "learning_rate": 8.423236337828248e-05, + "loss": 0.03979891836643219, + "step": 111130 + }, + { + "epoch": 15.775727466288147, + "grad_norm": 2.0041587352752686, + "learning_rate": 8.423094393186657e-05, + "loss": 0.0236584410071373, + "step": 111140 + }, + { + "epoch": 15.777146912704046, + "grad_norm": 0.5491400361061096, + "learning_rate": 8.422952448545069e-05, + "loss": 0.06289567351341248, + "step": 111150 + }, + { + "epoch": 15.778566359119942, + "grad_norm": 0.2346036732196808, + "learning_rate": 8.422810503903478e-05, + "loss": 0.012413589656352997, + "step": 111160 + }, + { + "epoch": 15.77998580553584, + "grad_norm": 10.742231369018555, + "learning_rate": 8.422668559261888e-05, + "loss": 0.05024981498718262, + "step": 111170 + }, + { + "epoch": 15.781405251951739, + "grad_norm": 0.9036098718643188, + "learning_rate": 8.422526614620299e-05, + "loss": 0.006312942504882813, + "step": 111180 + }, + { + "epoch": 15.782824698367637, + "grad_norm": 0.039659857749938965, + "learning_rate": 8.422384669978709e-05, + "loss": 0.020886825025081636, + "step": 111190 + }, + { + "epoch": 15.784244144783534, + "grad_norm": 2.2897820472717285, + "learning_rate": 8.42224272533712e-05, + "loss": 0.040818363428115845, + "step": 111200 + }, + { + "epoch": 15.785663591199432, + "grad_norm": 0.9270223379135132, + "learning_rate": 8.422100780695528e-05, + "loss": 0.04686786830425262, + "step": 111210 + }, + { + "epoch": 15.78708303761533, + "grad_norm": 
6.989339828491211, + "learning_rate": 8.42195883605394e-05, + "loss": 0.016091875731945038, + "step": 111220 + }, + { + "epoch": 15.788502484031227, + "grad_norm": 8.478525161743164, + "learning_rate": 8.421816891412349e-05, + "loss": 0.005417653918266296, + "step": 111230 + }, + { + "epoch": 15.789921930447125, + "grad_norm": 0.11680683493614197, + "learning_rate": 8.42167494677076e-05, + "loss": 0.014682632684707642, + "step": 111240 + }, + { + "epoch": 15.791341376863024, + "grad_norm": 2.4658005237579346, + "learning_rate": 8.42153300212917e-05, + "loss": 0.01771555542945862, + "step": 111250 + }, + { + "epoch": 15.792760823278922, + "grad_norm": 7.7995195388793945, + "learning_rate": 8.42140525195174e-05, + "loss": 0.035976368188858035, + "step": 111260 + }, + { + "epoch": 15.794180269694818, + "grad_norm": 5.016642093658447, + "learning_rate": 8.42126330731015e-05, + "loss": 0.02658340334892273, + "step": 111270 + }, + { + "epoch": 15.795599716110717, + "grad_norm": 10.646329879760742, + "learning_rate": 8.421121362668559e-05, + "loss": 0.014648254215717315, + "step": 111280 + }, + { + "epoch": 15.797019162526615, + "grad_norm": 0.013211095705628395, + "learning_rate": 8.420979418026969e-05, + "loss": 0.002887158468365669, + "step": 111290 + }, + { + "epoch": 15.798438608942512, + "grad_norm": 0.05278421938419342, + "learning_rate": 8.42083747338538e-05, + "loss": 0.01711827516555786, + "step": 111300 + }, + { + "epoch": 15.79985805535841, + "grad_norm": 12.257071495056152, + "learning_rate": 8.42069552874379e-05, + "loss": 0.019732020795345306, + "step": 111310 + }, + { + "epoch": 15.801277501774308, + "grad_norm": 0.05365946516394615, + "learning_rate": 8.420553584102201e-05, + "loss": 0.03791945576667786, + "step": 111320 + }, + { + "epoch": 15.802696948190206, + "grad_norm": 0.23845407366752625, + "learning_rate": 8.420411639460611e-05, + "loss": 0.012302954494953156, + "step": 111330 + }, + { + "epoch": 15.804116394606103, + "grad_norm": 
3.534014940261841, + "learning_rate": 8.42026969481902e-05, + "loss": 0.024273604154586792, + "step": 111340 + }, + { + "epoch": 15.805535841022001, + "grad_norm": 0.020159900188446045, + "learning_rate": 8.420127750177432e-05, + "loss": 0.01508972942829132, + "step": 111350 + }, + { + "epoch": 15.8069552874379, + "grad_norm": 0.07559079676866531, + "learning_rate": 8.419985805535841e-05, + "loss": 0.030358341336250306, + "step": 111360 + }, + { + "epoch": 15.808374733853796, + "grad_norm": 0.23751536011695862, + "learning_rate": 8.419843860894252e-05, + "loss": 0.06312950253486634, + "step": 111370 + }, + { + "epoch": 15.809794180269694, + "grad_norm": 6.484631061553955, + "learning_rate": 8.419701916252661e-05, + "loss": 0.06035802960395813, + "step": 111380 + }, + { + "epoch": 15.811213626685593, + "grad_norm": 0.06401360034942627, + "learning_rate": 8.419559971611072e-05, + "loss": 0.014664597809314728, + "step": 111390 + }, + { + "epoch": 15.812633073101491, + "grad_norm": 4.50247859954834, + "learning_rate": 8.419418026969482e-05, + "loss": 0.017867615818977355, + "step": 111400 + }, + { + "epoch": 15.814052519517388, + "grad_norm": 4.960999488830566, + "learning_rate": 8.419276082327893e-05, + "loss": 0.04982729852199554, + "step": 111410 + }, + { + "epoch": 15.815471965933286, + "grad_norm": 0.12740544974803925, + "learning_rate": 8.419134137686304e-05, + "loss": 0.02763109505176544, + "step": 111420 + }, + { + "epoch": 15.816891412349184, + "grad_norm": 3.2318503856658936, + "learning_rate": 8.418992193044712e-05, + "loss": 0.08191608190536499, + "step": 111430 + }, + { + "epoch": 15.81831085876508, + "grad_norm": 0.348230242729187, + "learning_rate": 8.418850248403123e-05, + "loss": 0.012708775699138641, + "step": 111440 + }, + { + "epoch": 15.819730305180979, + "grad_norm": 2.7329788208007812, + "learning_rate": 8.418708303761533e-05, + "loss": 0.028577986359596252, + "step": 111450 + }, + { + "epoch": 15.821149751596877, + "grad_norm": 
9.365242004394531, + "learning_rate": 8.418566359119944e-05, + "loss": 0.030303579568862916, + "step": 111460 + }, + { + "epoch": 15.822569198012776, + "grad_norm": 0.8939119577407837, + "learning_rate": 8.418424414478354e-05, + "loss": 0.014215029776096344, + "step": 111470 + }, + { + "epoch": 15.823988644428672, + "grad_norm": 9.930445671081543, + "learning_rate": 8.418282469836765e-05, + "loss": 0.032778263092041016, + "step": 111480 + }, + { + "epoch": 15.82540809084457, + "grad_norm": 6.8321757316589355, + "learning_rate": 8.418140525195173e-05, + "loss": 0.012145863473415374, + "step": 111490 + }, + { + "epoch": 15.826827537260469, + "grad_norm": 0.24095939099788666, + "learning_rate": 8.417998580553584e-05, + "loss": 0.03848366439342499, + "step": 111500 + }, + { + "epoch": 15.826827537260469, + "eval_accuracy": 0.9883639600686717, + "eval_loss": 0.03906998410820961, + "eval_runtime": 32.8578, + "eval_samples_per_second": 478.639, + "eval_steps_per_second": 14.974, + "step": 111500 + }, + { + "epoch": 15.828246983676365, + "grad_norm": 0.6555007100105286, + "learning_rate": 8.417856635911995e-05, + "loss": 0.012145914137363434, + "step": 111510 + }, + { + "epoch": 15.829666430092264, + "grad_norm": 0.16343164443969727, + "learning_rate": 8.417714691270405e-05, + "loss": 0.02596222162246704, + "step": 111520 + }, + { + "epoch": 15.831085876508162, + "grad_norm": 0.07217422127723694, + "learning_rate": 8.417572746628816e-05, + "loss": 0.05710041522979736, + "step": 111530 + }, + { + "epoch": 15.83250532292406, + "grad_norm": 0.29869720339775085, + "learning_rate": 8.417430801987225e-05, + "loss": 0.025982874631881713, + "step": 111540 + }, + { + "epoch": 15.833924769339957, + "grad_norm": 6.981906890869141, + "learning_rate": 8.417288857345636e-05, + "loss": 0.014904718101024627, + "step": 111550 + }, + { + "epoch": 15.835344215755855, + "grad_norm": 1.1227699518203735, + "learning_rate": 8.417146912704046e-05, + "loss": 0.033892059326171876, + "step": 111560 
+ }, + { + "epoch": 15.836763662171753, + "grad_norm": 6.107216835021973, + "learning_rate": 8.417004968062457e-05, + "loss": 0.007829079031944275, + "step": 111570 + }, + { + "epoch": 15.83818310858765, + "grad_norm": 10.42241382598877, + "learning_rate": 8.416863023420866e-05, + "loss": 0.02919740676879883, + "step": 111580 + }, + { + "epoch": 15.839602555003548, + "grad_norm": 0.1118733212351799, + "learning_rate": 8.416721078779276e-05, + "loss": 0.009149536490440369, + "step": 111590 + }, + { + "epoch": 15.841022001419446, + "grad_norm": 1.1477084159851074, + "learning_rate": 8.416579134137687e-05, + "loss": 0.010149497538805008, + "step": 111600 + }, + { + "epoch": 15.842441447835345, + "grad_norm": 0.6257818341255188, + "learning_rate": 8.416437189496097e-05, + "loss": 0.01542675495147705, + "step": 111610 + }, + { + "epoch": 15.843860894251241, + "grad_norm": 0.05429164692759514, + "learning_rate": 8.416295244854508e-05, + "loss": 0.01368640512228012, + "step": 111620 + }, + { + "epoch": 15.84528034066714, + "grad_norm": 9.229668617248535, + "learning_rate": 8.416153300212918e-05, + "loss": 0.0817399799823761, + "step": 111630 + }, + { + "epoch": 15.846699787083038, + "grad_norm": 0.07244868576526642, + "learning_rate": 8.416011355571327e-05, + "loss": 0.05533000230789185, + "step": 111640 + }, + { + "epoch": 15.848119233498934, + "grad_norm": 0.11954519152641296, + "learning_rate": 8.415869410929737e-05, + "loss": 0.019126889109611512, + "step": 111650 + }, + { + "epoch": 15.849538679914833, + "grad_norm": 0.40038758516311646, + "learning_rate": 8.415727466288148e-05, + "loss": 0.004278429970145226, + "step": 111660 + }, + { + "epoch": 15.850958126330731, + "grad_norm": 0.4650089740753174, + "learning_rate": 8.415585521646558e-05, + "loss": 0.009340885281562804, + "step": 111670 + }, + { + "epoch": 15.85237757274663, + "grad_norm": 0.37171149253845215, + "learning_rate": 8.415443577004969e-05, + "loss": 0.015973356366157532, + "step": 111680 + }, + { + 
"epoch": 15.853797019162526, + "grad_norm": 0.15554748475551605, + "learning_rate": 8.415301632363379e-05, + "loss": 0.05459659695625305, + "step": 111690 + }, + { + "epoch": 15.855216465578424, + "grad_norm": 0.66119784116745, + "learning_rate": 8.415159687721789e-05, + "loss": 0.012813696265220642, + "step": 111700 + }, + { + "epoch": 15.856635911994323, + "grad_norm": 0.025645015761256218, + "learning_rate": 8.4150177430802e-05, + "loss": 0.008818748593330383, + "step": 111710 + }, + { + "epoch": 15.858055358410219, + "grad_norm": 3.9062538146972656, + "learning_rate": 8.41487579843861e-05, + "loss": 0.04479396939277649, + "step": 111720 + }, + { + "epoch": 15.859474804826117, + "grad_norm": 0.01201682724058628, + "learning_rate": 8.41473385379702e-05, + "loss": 0.012727364897727966, + "step": 111730 + }, + { + "epoch": 15.860894251242016, + "grad_norm": 0.7928599119186401, + "learning_rate": 8.414591909155429e-05, + "loss": 0.020368021726608277, + "step": 111740 + }, + { + "epoch": 15.862313697657914, + "grad_norm": 10.790556907653809, + "learning_rate": 8.41444996451384e-05, + "loss": 0.022309675812721252, + "step": 111750 + }, + { + "epoch": 15.86373314407381, + "grad_norm": 3.925905466079712, + "learning_rate": 8.41430801987225e-05, + "loss": 0.03202139735221863, + "step": 111760 + }, + { + "epoch": 15.865152590489709, + "grad_norm": 0.42782142758369446, + "learning_rate": 8.414166075230661e-05, + "loss": 0.014055775105953216, + "step": 111770 + }, + { + "epoch": 15.866572036905607, + "grad_norm": 1.0484387874603271, + "learning_rate": 8.41402413058907e-05, + "loss": 0.01409704089164734, + "step": 111780 + }, + { + "epoch": 15.867991483321505, + "grad_norm": 0.23534570634365082, + "learning_rate": 8.41388218594748e-05, + "loss": 0.07245479226112365, + "step": 111790 + }, + { + "epoch": 15.869410929737402, + "grad_norm": 8.457581520080566, + "learning_rate": 8.413740241305891e-05, + "loss": 0.05491371154785156, + "step": 111800 + }, + { + "epoch": 
15.8708303761533, + "grad_norm": 0.24857249855995178, + "learning_rate": 8.413598296664301e-05, + "loss": 0.03584108054637909, + "step": 111810 + }, + { + "epoch": 15.872249822569199, + "grad_norm": 0.17013956606388092, + "learning_rate": 8.413456352022712e-05, + "loss": 0.012269440293312072, + "step": 111820 + }, + { + "epoch": 15.873669268985095, + "grad_norm": 1.8806430101394653, + "learning_rate": 8.413314407381122e-05, + "loss": 0.03550401926040649, + "step": 111830 + }, + { + "epoch": 15.875088715400993, + "grad_norm": 0.308080792427063, + "learning_rate": 8.413172462739533e-05, + "loss": 0.04612856507301331, + "step": 111840 + }, + { + "epoch": 15.876508161816892, + "grad_norm": 0.014880211092531681, + "learning_rate": 8.413030518097941e-05, + "loss": 0.008867764472961425, + "step": 111850 + }, + { + "epoch": 15.87792760823279, + "grad_norm": 0.8367263674736023, + "learning_rate": 8.412888573456353e-05, + "loss": 0.010482820868492126, + "step": 111860 + }, + { + "epoch": 15.879347054648687, + "grad_norm": 0.00848662480711937, + "learning_rate": 8.412746628814762e-05, + "loss": 0.013878077268600464, + "step": 111870 + }, + { + "epoch": 15.880766501064585, + "grad_norm": 6.243743896484375, + "learning_rate": 8.412604684173173e-05, + "loss": 0.019290319085121153, + "step": 111880 + }, + { + "epoch": 15.882185947480483, + "grad_norm": 0.19164791703224182, + "learning_rate": 8.412462739531583e-05, + "loss": 0.07253921627998353, + "step": 111890 + }, + { + "epoch": 15.88360539389638, + "grad_norm": 0.2494039535522461, + "learning_rate": 8.412320794889993e-05, + "loss": 0.036090517044067384, + "step": 111900 + }, + { + "epoch": 15.885024840312278, + "grad_norm": 4.53750467300415, + "learning_rate": 8.412178850248404e-05, + "loss": 0.008243809640407562, + "step": 111910 + }, + { + "epoch": 15.886444286728176, + "grad_norm": 0.1850540190935135, + "learning_rate": 8.412036905606814e-05, + "loss": 0.029005610942840578, + "step": 111920 + }, + { + "epoch": 
15.887863733144075, + "grad_norm": 0.18927457928657532, + "learning_rate": 8.411894960965225e-05, + "loss": 0.03487345576286316, + "step": 111930 + }, + { + "epoch": 15.889283179559971, + "grad_norm": 0.005943607538938522, + "learning_rate": 8.411753016323635e-05, + "loss": 0.024997258186340333, + "step": 111940 + }, + { + "epoch": 15.89070262597587, + "grad_norm": 5.918054580688477, + "learning_rate": 8.411611071682044e-05, + "loss": 0.008443067967891692, + "step": 111950 + }, + { + "epoch": 15.892122072391768, + "grad_norm": 0.6412557363510132, + "learning_rate": 8.411469127040454e-05, + "loss": 0.0039185058325529095, + "step": 111960 + }, + { + "epoch": 15.893541518807664, + "grad_norm": 4.264925003051758, + "learning_rate": 8.411327182398865e-05, + "loss": 0.035122907161712645, + "step": 111970 + }, + { + "epoch": 15.894960965223563, + "grad_norm": 0.6918242573738098, + "learning_rate": 8.411185237757275e-05, + "loss": 0.02806202173233032, + "step": 111980 + }, + { + "epoch": 15.896380411639461, + "grad_norm": 0.370086133480072, + "learning_rate": 8.411043293115686e-05, + "loss": 0.013014155626296996, + "step": 111990 + }, + { + "epoch": 15.89779985805536, + "grad_norm": 0.08562023937702179, + "learning_rate": 8.410901348474096e-05, + "loss": 0.017157554626464844, + "step": 112000 + }, + { + "epoch": 15.89779985805536, + "eval_accuracy": 0.9874737712214663, + "eval_loss": 0.042419932782649994, + "eval_runtime": 32.0075, + "eval_samples_per_second": 491.354, + "eval_steps_per_second": 15.371, + "step": 112000 + }, + { + "epoch": 15.899219304471256, + "grad_norm": 0.32334455847740173, + "learning_rate": 8.410759403832505e-05, + "loss": 0.022023583948612212, + "step": 112010 + }, + { + "epoch": 15.900638750887154, + "grad_norm": 0.8937520980834961, + "learning_rate": 8.410617459190916e-05, + "loss": 0.033554989099502566, + "step": 112020 + }, + { + "epoch": 15.902058197303052, + "grad_norm": 4.915921688079834, + "learning_rate": 8.410475514549326e-05, + "loss": 
0.022143127024173738, + "step": 112030 + }, + { + "epoch": 15.903477643718949, + "grad_norm": 8.751575469970703, + "learning_rate": 8.410333569907737e-05, + "loss": 0.03376420438289642, + "step": 112040 + }, + { + "epoch": 15.904897090134847, + "grad_norm": 0.06762025505304337, + "learning_rate": 8.410191625266146e-05, + "loss": 0.011050128936767578, + "step": 112050 + }, + { + "epoch": 15.906316536550746, + "grad_norm": 0.14220504462718964, + "learning_rate": 8.410049680624557e-05, + "loss": 0.01688355803489685, + "step": 112060 + }, + { + "epoch": 15.907735982966644, + "grad_norm": 0.10968053340911865, + "learning_rate": 8.409907735982967e-05, + "loss": 0.052211225032806396, + "step": 112070 + }, + { + "epoch": 15.90915542938254, + "grad_norm": 4.428947925567627, + "learning_rate": 8.409765791341378e-05, + "loss": 0.014014440774917602, + "step": 112080 + }, + { + "epoch": 15.910574875798439, + "grad_norm": 0.0376085601747036, + "learning_rate": 8.409623846699787e-05, + "loss": 0.003753634914755821, + "step": 112090 + }, + { + "epoch": 15.911994322214337, + "grad_norm": 19.084617614746094, + "learning_rate": 8.409481902058197e-05, + "loss": 0.03427457511425018, + "step": 112100 + }, + { + "epoch": 15.913413768630233, + "grad_norm": 0.031321216374635696, + "learning_rate": 8.409339957416608e-05, + "loss": 0.04106015264987946, + "step": 112110 + }, + { + "epoch": 15.914833215046132, + "grad_norm": 0.0813651755452156, + "learning_rate": 8.409198012775018e-05, + "loss": 0.013805033266544342, + "step": 112120 + }, + { + "epoch": 15.91625266146203, + "grad_norm": 6.500750541687012, + "learning_rate": 8.409056068133429e-05, + "loss": 0.04296710193157196, + "step": 112130 + }, + { + "epoch": 15.917672107877928, + "grad_norm": 0.1737774908542633, + "learning_rate": 8.408914123491839e-05, + "loss": 0.027148693799972534, + "step": 112140 + }, + { + "epoch": 15.919091554293825, + "grad_norm": 0.02817084826529026, + "learning_rate": 8.408772178850248e-05, + "loss": 
0.041319137811660765, + "step": 112150 + }, + { + "epoch": 15.920511000709723, + "grad_norm": 0.17972466349601746, + "learning_rate": 8.408630234208658e-05, + "loss": 0.01709538549184799, + "step": 112160 + }, + { + "epoch": 15.921930447125622, + "grad_norm": 0.6958065032958984, + "learning_rate": 8.408488289567069e-05, + "loss": 0.025096040964126588, + "step": 112170 + }, + { + "epoch": 15.923349893541518, + "grad_norm": 4.633899211883545, + "learning_rate": 8.408346344925479e-05, + "loss": 0.03428847789764404, + "step": 112180 + }, + { + "epoch": 15.924769339957416, + "grad_norm": 0.2608836889266968, + "learning_rate": 8.40820440028389e-05, + "loss": 0.024020206928253175, + "step": 112190 + }, + { + "epoch": 15.926188786373315, + "grad_norm": 1.9271941184997559, + "learning_rate": 8.4080624556423e-05, + "loss": 0.014135521650314332, + "step": 112200 + }, + { + "epoch": 15.927608232789213, + "grad_norm": 1.365207552909851, + "learning_rate": 8.40792051100071e-05, + "loss": 0.01664539873600006, + "step": 112210 + }, + { + "epoch": 15.92902767920511, + "grad_norm": 0.03752214461565018, + "learning_rate": 8.407778566359121e-05, + "loss": 0.019473978877067567, + "step": 112220 + }, + { + "epoch": 15.930447125621008, + "grad_norm": 0.5360216498374939, + "learning_rate": 8.40763662171753e-05, + "loss": 0.042499488592147826, + "step": 112230 + }, + { + "epoch": 15.931866572036906, + "grad_norm": 0.14745332300662994, + "learning_rate": 8.407494677075942e-05, + "loss": 0.029511284828186036, + "step": 112240 + }, + { + "epoch": 15.933286018452803, + "grad_norm": 0.8141366243362427, + "learning_rate": 8.407352732434351e-05, + "loss": 0.017665939033031465, + "step": 112250 + }, + { + "epoch": 15.934705464868701, + "grad_norm": 0.15989312529563904, + "learning_rate": 8.407210787792761e-05, + "loss": 0.03332527875900269, + "step": 112260 + }, + { + "epoch": 15.9361249112846, + "grad_norm": 6.702155113220215, + "learning_rate": 8.407068843151171e-05, + "loss": 
0.025216048955917357, + "step": 112270 + }, + { + "epoch": 15.937544357700498, + "grad_norm": 4.99879264831543, + "learning_rate": 8.406926898509582e-05, + "loss": 0.010565738379955291, + "step": 112280 + }, + { + "epoch": 15.938963804116394, + "grad_norm": 1.4353564977645874, + "learning_rate": 8.406784953867992e-05, + "loss": 0.024430674314498902, + "step": 112290 + }, + { + "epoch": 15.940383250532292, + "grad_norm": 0.6167616248130798, + "learning_rate": 8.406643009226403e-05, + "loss": 0.056137692928314206, + "step": 112300 + }, + { + "epoch": 15.94180269694819, + "grad_norm": 1.0208706855773926, + "learning_rate": 8.406501064584812e-05, + "loss": 0.01831849366426468, + "step": 112310 + }, + { + "epoch": 15.943222143364087, + "grad_norm": 0.07933774590492249, + "learning_rate": 8.406359119943222e-05, + "loss": 0.04364119172096252, + "step": 112320 + }, + { + "epoch": 15.944641589779986, + "grad_norm": 0.0641828402876854, + "learning_rate": 8.406217175301633e-05, + "loss": 0.009057014435529708, + "step": 112330 + }, + { + "epoch": 15.946061036195884, + "grad_norm": 6.091492652893066, + "learning_rate": 8.406075230660043e-05, + "loss": 0.011662401258945465, + "step": 112340 + }, + { + "epoch": 15.947480482611782, + "grad_norm": 0.13847945630550385, + "learning_rate": 8.405933286018454e-05, + "loss": 0.02798805832862854, + "step": 112350 + }, + { + "epoch": 15.948899929027679, + "grad_norm": 0.2876303195953369, + "learning_rate": 8.405791341376862e-05, + "loss": 0.008951310813426972, + "step": 112360 + }, + { + "epoch": 15.950319375443577, + "grad_norm": 0.3807947039604187, + "learning_rate": 8.405649396735274e-05, + "loss": 0.018087761104106904, + "step": 112370 + }, + { + "epoch": 15.951738821859475, + "grad_norm": 0.19306473433971405, + "learning_rate": 8.405507452093683e-05, + "loss": 0.025654715299606324, + "step": 112380 + }, + { + "epoch": 15.953158268275372, + "grad_norm": 0.596372127532959, + "learning_rate": 8.405365507452094e-05, + "loss": 
0.03283799290657043, + "step": 112390 + }, + { + "epoch": 15.95457771469127, + "grad_norm": 10.068487167358398, + "learning_rate": 8.405223562810504e-05, + "loss": 0.010049515962600708, + "step": 112400 + }, + { + "epoch": 15.955997161107168, + "grad_norm": 6.676372051239014, + "learning_rate": 8.405081618168914e-05, + "loss": 0.032247743010520934, + "step": 112410 + }, + { + "epoch": 15.957416607523067, + "grad_norm": 0.18787598609924316, + "learning_rate": 8.404939673527325e-05, + "loss": 0.027509596943855286, + "step": 112420 + }, + { + "epoch": 15.958836053938963, + "grad_norm": 0.0962926521897316, + "learning_rate": 8.404797728885735e-05, + "loss": 0.03231292963027954, + "step": 112430 + }, + { + "epoch": 15.960255500354862, + "grad_norm": 1.5756398439407349, + "learning_rate": 8.404655784244146e-05, + "loss": 0.021877171099185945, + "step": 112440 + }, + { + "epoch": 15.96167494677076, + "grad_norm": 3.9619812965393066, + "learning_rate": 8.404513839602556e-05, + "loss": 0.0720575749874115, + "step": 112450 + }, + { + "epoch": 15.963094393186656, + "grad_norm": 1.577416181564331, + "learning_rate": 8.404371894960965e-05, + "loss": 0.02191317081451416, + "step": 112460 + }, + { + "epoch": 15.964513839602555, + "grad_norm": 0.6079181432723999, + "learning_rate": 8.404229950319375e-05, + "loss": 0.007672099024057388, + "step": 112470 + }, + { + "epoch": 15.965933286018453, + "grad_norm": 0.038492441177368164, + "learning_rate": 8.404088005677786e-05, + "loss": 0.03949972987174988, + "step": 112480 + }, + { + "epoch": 15.967352732434351, + "grad_norm": 1.8505480289459229, + "learning_rate": 8.403946061036196e-05, + "loss": 0.01702859401702881, + "step": 112490 + }, + { + "epoch": 15.968772178850248, + "grad_norm": 0.02366023324429989, + "learning_rate": 8.403804116394607e-05, + "loss": 0.014248864352703094, + "step": 112500 + }, + { + "epoch": 15.968772178850248, + "eval_accuracy": 0.9862020728683156, + "eval_loss": 0.048695117235183716, + "eval_runtime": 
33.2219, + "eval_samples_per_second": 473.392, + "eval_steps_per_second": 14.809, + "step": 112500 + }, + { + "epoch": 15.970191625266146, + "grad_norm": 0.09983468055725098, + "learning_rate": 8.403662171753017e-05, + "loss": 0.041495496034622194, + "step": 112510 + }, + { + "epoch": 15.971611071682045, + "grad_norm": 0.7310523390769958, + "learning_rate": 8.403520227111426e-05, + "loss": 0.004337532818317414, + "step": 112520 + }, + { + "epoch": 15.973030518097941, + "grad_norm": 0.7653382420539856, + "learning_rate": 8.403378282469837e-05, + "loss": 0.007619468867778778, + "step": 112530 + }, + { + "epoch": 15.97444996451384, + "grad_norm": 1.2560532093048096, + "learning_rate": 8.403236337828247e-05, + "loss": 0.00598950944840908, + "step": 112540 + }, + { + "epoch": 15.975869410929738, + "grad_norm": 0.12405021488666534, + "learning_rate": 8.403094393186658e-05, + "loss": 0.0030649252235889434, + "step": 112550 + }, + { + "epoch": 15.977288857345636, + "grad_norm": 0.9109636545181274, + "learning_rate": 8.402952448545068e-05, + "loss": 0.05473610758781433, + "step": 112560 + }, + { + "epoch": 15.978708303761533, + "grad_norm": 6.435018539428711, + "learning_rate": 8.402810503903478e-05, + "loss": 0.013474112749099732, + "step": 112570 + }, + { + "epoch": 15.98012775017743, + "grad_norm": 0.34421679377555847, + "learning_rate": 8.402668559261888e-05, + "loss": 0.05626802444458008, + "step": 112580 + }, + { + "epoch": 15.98154719659333, + "grad_norm": 5.662394046783447, + "learning_rate": 8.402526614620299e-05, + "loss": 0.03009917736053467, + "step": 112590 + }, + { + "epoch": 15.982966643009226, + "grad_norm": 0.11366448551416397, + "learning_rate": 8.402384669978708e-05, + "loss": 0.02108916491270065, + "step": 112600 + }, + { + "epoch": 15.984386089425124, + "grad_norm": 4.0829057693481445, + "learning_rate": 8.40224272533712e-05, + "loss": 0.006870292872190475, + "step": 112610 + }, + { + "epoch": 15.985805535841022, + "grad_norm": 8.846285820007324, + 
"learning_rate": 8.402100780695529e-05, + "loss": 0.028130248188972473, + "step": 112620 + }, + { + "epoch": 15.98722498225692, + "grad_norm": 0.16223247349262238, + "learning_rate": 8.401958836053939e-05, + "loss": 0.01694146990776062, + "step": 112630 + }, + { + "epoch": 15.988644428672817, + "grad_norm": 0.1122329905629158, + "learning_rate": 8.40181689141235e-05, + "loss": 0.042585551738739014, + "step": 112640 + }, + { + "epoch": 15.990063875088715, + "grad_norm": 7.043773651123047, + "learning_rate": 8.40167494677076e-05, + "loss": 0.027266567945480345, + "step": 112650 + }, + { + "epoch": 15.991483321504614, + "grad_norm": 3.80629301071167, + "learning_rate": 8.401533002129171e-05, + "loss": 0.017532944679260254, + "step": 112660 + }, + { + "epoch": 15.99290276792051, + "grad_norm": 10.180436134338379, + "learning_rate": 8.401391057487579e-05, + "loss": 0.04507697224617004, + "step": 112670 + }, + { + "epoch": 15.994322214336409, + "grad_norm": 1.181219458580017, + "learning_rate": 8.40124911284599e-05, + "loss": 0.002518647536635399, + "step": 112680 + }, + { + "epoch": 15.995741660752307, + "grad_norm": 2.4488043785095215, + "learning_rate": 8.4011071682044e-05, + "loss": 0.020063599944114684, + "step": 112690 + }, + { + "epoch": 15.997161107168205, + "grad_norm": 0.6078232526779175, + "learning_rate": 8.400965223562811e-05, + "loss": 0.00689728707075119, + "step": 112700 + }, + { + "epoch": 15.998580553584102, + "grad_norm": 0.18818716704845428, + "learning_rate": 8.400823278921221e-05, + "loss": 0.0035076819360256193, + "step": 112710 + }, + { + "epoch": 16.0, + "grad_norm": 3.3798446655273438, + "learning_rate": 8.40068133427963e-05, + "loss": 0.006935934722423554, + "step": 112720 + }, + { + "epoch": 16.0014194464159, + "grad_norm": 1.8569788932800293, + "learning_rate": 8.400539389638042e-05, + "loss": 0.014986465871334075, + "step": 112730 + }, + { + "epoch": 16.002838892831797, + "grad_norm": 0.38919079303741455, + "learning_rate": 
8.400397444996451e-05, + "loss": 0.018700113892555235, + "step": 112740 + }, + { + "epoch": 16.004258339247695, + "grad_norm": 0.20278839766979218, + "learning_rate": 8.400255500354863e-05, + "loss": 0.009249137341976165, + "step": 112750 + }, + { + "epoch": 16.00567778566359, + "grad_norm": 1.0998165607452393, + "learning_rate": 8.400113555713272e-05, + "loss": 0.02100822627544403, + "step": 112760 + }, + { + "epoch": 16.007097232079488, + "grad_norm": 0.06329260021448135, + "learning_rate": 8.399971611071682e-05, + "loss": 0.01535399556159973, + "step": 112770 + }, + { + "epoch": 16.008516678495386, + "grad_norm": 4.446816921234131, + "learning_rate": 8.399829666430092e-05, + "loss": 0.023000138998031616, + "step": 112780 + }, + { + "epoch": 16.009936124911285, + "grad_norm": 0.7698379755020142, + "learning_rate": 8.399687721788503e-05, + "loss": 0.00757470577955246, + "step": 112790 + }, + { + "epoch": 16.011355571327183, + "grad_norm": 1.464584231376648, + "learning_rate": 8.399545777146913e-05, + "loss": 0.003870783746242523, + "step": 112800 + }, + { + "epoch": 16.01277501774308, + "grad_norm": 4.501135349273682, + "learning_rate": 8.399403832505324e-05, + "loss": 0.0050099663436412815, + "step": 112810 + }, + { + "epoch": 16.01419446415898, + "grad_norm": 0.6903401613235474, + "learning_rate": 8.399261887863733e-05, + "loss": 0.011681679636240005, + "step": 112820 + }, + { + "epoch": 16.015613910574874, + "grad_norm": 0.8593311309814453, + "learning_rate": 8.399119943222143e-05, + "loss": 0.033016365766525266, + "step": 112830 + }, + { + "epoch": 16.017033356990773, + "grad_norm": 0.8904664516448975, + "learning_rate": 8.398977998580554e-05, + "loss": 0.008453948795795441, + "step": 112840 + }, + { + "epoch": 16.01845280340667, + "grad_norm": 0.12439213693141937, + "learning_rate": 8.398836053938964e-05, + "loss": 0.014540690183639526, + "step": 112850 + }, + { + "epoch": 16.01987224982257, + "grad_norm": 0.12126282602548599, + "learning_rate": 
8.398694109297375e-05, + "loss": 0.016869255900382997, + "step": 112860 + }, + { + "epoch": 16.021291696238467, + "grad_norm": 2.421494245529175, + "learning_rate": 8.398552164655783e-05, + "loss": 0.03948677182197571, + "step": 112870 + }, + { + "epoch": 16.022711142654366, + "grad_norm": 0.043827567249536514, + "learning_rate": 8.398410220014195e-05, + "loss": 0.00926891267299652, + "step": 112880 + }, + { + "epoch": 16.024130589070264, + "grad_norm": 0.08589991182088852, + "learning_rate": 8.398268275372604e-05, + "loss": 0.007034893333911896, + "step": 112890 + }, + { + "epoch": 16.02555003548616, + "grad_norm": 0.07396560162305832, + "learning_rate": 8.398126330731015e-05, + "loss": 0.005567807704210281, + "step": 112900 + }, + { + "epoch": 16.026969481902057, + "grad_norm": 0.05021880194544792, + "learning_rate": 8.397984386089426e-05, + "loss": 0.011187079548835754, + "step": 112910 + }, + { + "epoch": 16.028388928317955, + "grad_norm": 2.3082640171051025, + "learning_rate": 8.397842441447836e-05, + "loss": 0.002422238886356354, + "step": 112920 + }, + { + "epoch": 16.029808374733854, + "grad_norm": 0.029654445126652718, + "learning_rate": 8.397700496806246e-05, + "loss": 0.007260937988758087, + "step": 112930 + }, + { + "epoch": 16.031227821149752, + "grad_norm": 4.165407180786133, + "learning_rate": 8.397558552164656e-05, + "loss": 0.0049143187701702114, + "step": 112940 + }, + { + "epoch": 16.03264726756565, + "grad_norm": 0.09881002455949783, + "learning_rate": 8.397416607523067e-05, + "loss": 0.002365070208907127, + "step": 112950 + }, + { + "epoch": 16.03406671398155, + "grad_norm": 0.1122019812464714, + "learning_rate": 8.397274662881477e-05, + "loss": 0.002952999994158745, + "step": 112960 + }, + { + "epoch": 16.035486160397443, + "grad_norm": 0.09588045626878738, + "learning_rate": 8.397132718239888e-05, + "loss": 0.033624234795570376, + "step": 112970 + }, + { + "epoch": 16.03690560681334, + "grad_norm": 0.013734962791204453, + "learning_rate": 
8.396990773598296e-05, + "loss": 0.032681146264076234, + "step": 112980 + }, + { + "epoch": 16.03832505322924, + "grad_norm": 0.21410711109638214, + "learning_rate": 8.396848828956707e-05, + "loss": 0.006711065024137497, + "step": 112990 + }, + { + "epoch": 16.03974449964514, + "grad_norm": 13.848963737487793, + "learning_rate": 8.396706884315118e-05, + "loss": 0.049816185235977174, + "step": 113000 + }, + { + "epoch": 16.03974449964514, + "eval_accuracy": 0.988872639409932, + "eval_loss": 0.03962714597582817, + "eval_runtime": 32.0224, + "eval_samples_per_second": 491.124, + "eval_steps_per_second": 15.364, + "step": 113000 + }, + { + "epoch": 16.041163946061037, + "grad_norm": 0.010466187261044979, + "learning_rate": 8.396564939673528e-05, + "loss": 0.010902185738086701, + "step": 113010 + }, + { + "epoch": 16.042583392476935, + "grad_norm": 5.942933082580566, + "learning_rate": 8.396422995031939e-05, + "loss": 0.014651863276958466, + "step": 113020 + }, + { + "epoch": 16.044002838892833, + "grad_norm": 0.2957586348056793, + "learning_rate": 8.396281050390347e-05, + "loss": 0.04585053324699402, + "step": 113030 + }, + { + "epoch": 16.045422285308728, + "grad_norm": 11.693886756896973, + "learning_rate": 8.396139105748758e-05, + "loss": 0.023407797515392303, + "step": 113040 + }, + { + "epoch": 16.046841731724626, + "grad_norm": 11.953022956848145, + "learning_rate": 8.395997161107168e-05, + "loss": 0.029421928524971008, + "step": 113050 + }, + { + "epoch": 16.048261178140525, + "grad_norm": 1.568056344985962, + "learning_rate": 8.395855216465579e-05, + "loss": 0.0045596893876791, + "step": 113060 + }, + { + "epoch": 16.049680624556423, + "grad_norm": 1.8041973114013672, + "learning_rate": 8.395713271823989e-05, + "loss": 0.02693631947040558, + "step": 113070 + }, + { + "epoch": 16.05110007097232, + "grad_norm": 2.983215808868408, + "learning_rate": 8.395571327182399e-05, + "loss": 0.006398119032382965, + "step": 113080 + }, + { + "epoch": 16.05251951738822, + 
"grad_norm": 0.19344577193260193, + "learning_rate": 8.39542938254081e-05, + "loss": 0.014838898181915283, + "step": 113090 + }, + { + "epoch": 16.053938963804118, + "grad_norm": 12.71757984161377, + "learning_rate": 8.39528743789922e-05, + "loss": 0.039472135901451114, + "step": 113100 + }, + { + "epoch": 16.055358410220013, + "grad_norm": 0.5390371680259705, + "learning_rate": 8.395145493257631e-05, + "loss": 0.01392301321029663, + "step": 113110 + }, + { + "epoch": 16.05677785663591, + "grad_norm": 0.3036908805370331, + "learning_rate": 8.39500354861604e-05, + "loss": 0.0026406005024909975, + "step": 113120 + }, + { + "epoch": 16.05819730305181, + "grad_norm": 0.07947884500026703, + "learning_rate": 8.39486160397445e-05, + "loss": 0.008793811500072479, + "step": 113130 + }, + { + "epoch": 16.059616749467708, + "grad_norm": 0.04886677861213684, + "learning_rate": 8.39471965933286e-05, + "loss": 0.005639223754405976, + "step": 113140 + }, + { + "epoch": 16.061036195883606, + "grad_norm": 5.401880741119385, + "learning_rate": 8.394577714691271e-05, + "loss": 0.05478711128234863, + "step": 113150 + }, + { + "epoch": 16.062455642299504, + "grad_norm": 0.4508975148200989, + "learning_rate": 8.394435770049681e-05, + "loss": 0.026101893186569212, + "step": 113160 + }, + { + "epoch": 16.063875088715402, + "grad_norm": 0.22109004855155945, + "learning_rate": 8.394293825408092e-05, + "loss": 0.0165793314576149, + "step": 113170 + }, + { + "epoch": 16.065294535131297, + "grad_norm": 0.09848267585039139, + "learning_rate": 8.394151880766502e-05, + "loss": 0.03446832001209259, + "step": 113180 + }, + { + "epoch": 16.066713981547196, + "grad_norm": 0.04517025500535965, + "learning_rate": 8.394009936124911e-05, + "loss": 0.018778929114341737, + "step": 113190 + }, + { + "epoch": 16.068133427963094, + "grad_norm": 15.434242248535156, + "learning_rate": 8.393867991483322e-05, + "loss": 0.020848213136196135, + "step": 113200 + }, + { + "epoch": 16.069552874378992, + "grad_norm": 
0.043510545045137405, + "learning_rate": 8.393726046841732e-05, + "loss": 0.023297858238220216, + "step": 113210 + }, + { + "epoch": 16.07097232079489, + "grad_norm": 15.332098007202148, + "learning_rate": 8.393584102200143e-05, + "loss": 0.03652581572532654, + "step": 113220 + }, + { + "epoch": 16.07239176721079, + "grad_norm": 8.775132179260254, + "learning_rate": 8.393442157558553e-05, + "loss": 0.07113283276557922, + "step": 113230 + }, + { + "epoch": 16.073811213626687, + "grad_norm": 0.041509199887514114, + "learning_rate": 8.393300212916963e-05, + "loss": 0.014686094224452972, + "step": 113240 + }, + { + "epoch": 16.075230660042582, + "grad_norm": 0.4447268843650818, + "learning_rate": 8.393158268275372e-05, + "loss": 0.047878941893577574, + "step": 113250 + }, + { + "epoch": 16.07665010645848, + "grad_norm": 0.5964030027389526, + "learning_rate": 8.393016323633784e-05, + "loss": 0.005113707855343819, + "step": 113260 + }, + { + "epoch": 16.07806955287438, + "grad_norm": 2.3050570487976074, + "learning_rate": 8.392888573456352e-05, + "loss": 0.08484262824058533, + "step": 113270 + }, + { + "epoch": 16.079488999290277, + "grad_norm": 1.124589204788208, + "learning_rate": 8.392746628814763e-05, + "loss": 0.017958565056324004, + "step": 113280 + }, + { + "epoch": 16.080908445706175, + "grad_norm": 0.08736645430326462, + "learning_rate": 8.392604684173173e-05, + "loss": 0.029423543810844423, + "step": 113290 + }, + { + "epoch": 16.082327892122073, + "grad_norm": 0.009426097385585308, + "learning_rate": 8.392462739531584e-05, + "loss": 0.0018295619636774063, + "step": 113300 + }, + { + "epoch": 16.08374733853797, + "grad_norm": 1.1997662782669067, + "learning_rate": 8.392320794889992e-05, + "loss": 0.01612427681684494, + "step": 113310 + }, + { + "epoch": 16.085166784953866, + "grad_norm": 0.032743312418460846, + "learning_rate": 8.392178850248403e-05, + "loss": 0.02764788568019867, + "step": 113320 + }, + { + "epoch": 16.086586231369765, + "grad_norm": 
1.5302761793136597, + "learning_rate": 8.392036905606813e-05, + "loss": 0.05237635374069214, + "step": 113330 + }, + { + "epoch": 16.088005677785663, + "grad_norm": 0.0579666830599308, + "learning_rate": 8.391894960965224e-05, + "loss": 0.05110551118850708, + "step": 113340 + }, + { + "epoch": 16.08942512420156, + "grad_norm": 0.18044817447662354, + "learning_rate": 8.391753016323634e-05, + "loss": 0.02474549263715744, + "step": 113350 + }, + { + "epoch": 16.09084457061746, + "grad_norm": 0.3958684206008911, + "learning_rate": 8.391611071682044e-05, + "loss": 0.03521615266799927, + "step": 113360 + }, + { + "epoch": 16.092264017033358, + "grad_norm": 0.5362204313278198, + "learning_rate": 8.391469127040455e-05, + "loss": 0.051009106636047366, + "step": 113370 + }, + { + "epoch": 16.093683463449256, + "grad_norm": 0.08517732471227646, + "learning_rate": 8.391327182398865e-05, + "loss": 0.016589146852493287, + "step": 113380 + }, + { + "epoch": 16.09510290986515, + "grad_norm": 0.5726885795593262, + "learning_rate": 8.391185237757276e-05, + "loss": 0.012025222927331925, + "step": 113390 + }, + { + "epoch": 16.09652235628105, + "grad_norm": 0.1441572606563568, + "learning_rate": 8.391043293115685e-05, + "loss": 0.0675897479057312, + "step": 113400 + }, + { + "epoch": 16.097941802696948, + "grad_norm": 0.4573246240615845, + "learning_rate": 8.390901348474095e-05, + "loss": 0.0028651710599660873, + "step": 113410 + }, + { + "epoch": 16.099361249112846, + "grad_norm": 0.07784921675920486, + "learning_rate": 8.390759403832505e-05, + "loss": 0.02416677474975586, + "step": 113420 + }, + { + "epoch": 16.100780695528744, + "grad_norm": 11.768951416015625, + "learning_rate": 8.390617459190916e-05, + "loss": 0.022789698839187623, + "step": 113430 + }, + { + "epoch": 16.102200141944643, + "grad_norm": 0.08935142308473587, + "learning_rate": 8.390475514549326e-05, + "loss": 0.0012326732277870179, + "step": 113440 + }, + { + "epoch": 16.10361958836054, + "grad_norm": 
1.0763702392578125, + "learning_rate": 8.390333569907737e-05, + "loss": 0.010844753682613372, + "step": 113450 + }, + { + "epoch": 16.105039034776436, + "grad_norm": 0.42075884342193604, + "learning_rate": 8.390191625266147e-05, + "loss": 0.00529075525701046, + "step": 113460 + }, + { + "epoch": 16.106458481192334, + "grad_norm": 0.32909151911735535, + "learning_rate": 8.390049680624556e-05, + "loss": 0.007352690398693085, + "step": 113470 + }, + { + "epoch": 16.107877927608232, + "grad_norm": 0.017695745453238487, + "learning_rate": 8.389907735982967e-05, + "loss": 0.008752924203872681, + "step": 113480 + }, + { + "epoch": 16.10929737402413, + "grad_norm": 14.182565689086914, + "learning_rate": 8.389765791341377e-05, + "loss": 0.020782370865345002, + "step": 113490 + }, + { + "epoch": 16.11071682044003, + "grad_norm": 0.04523881524801254, + "learning_rate": 8.389623846699788e-05, + "loss": 0.02090078145265579, + "step": 113500 + }, + { + "epoch": 16.11071682044003, + "eval_accuracy": 0.9842309404209322, + "eval_loss": 0.058538712561130524, + "eval_runtime": 32.6752, + "eval_samples_per_second": 481.312, + "eval_steps_per_second": 15.057, + "step": 113500 + }, + { + "epoch": 16.112136266855927, + "grad_norm": 1.4609222412109375, + "learning_rate": 8.389481902058197e-05, + "loss": 0.06761115193367004, + "step": 113510 + }, + { + "epoch": 16.113555713271825, + "grad_norm": 0.02035006321966648, + "learning_rate": 8.389339957416608e-05, + "loss": 0.027057936787605284, + "step": 113520 + }, + { + "epoch": 16.11497515968772, + "grad_norm": 1.180684208869934, + "learning_rate": 8.389198012775017e-05, + "loss": 0.021567445993423463, + "step": 113530 + }, + { + "epoch": 16.11639460610362, + "grad_norm": 0.0720195323228836, + "learning_rate": 8.389056068133429e-05, + "loss": 0.022597649693489076, + "step": 113540 + }, + { + "epoch": 16.117814052519517, + "grad_norm": 3.2325596809387207, + "learning_rate": 8.388914123491838e-05, + "loss": 0.03432404398918152, + "step": 113550 
+ }, + { + "epoch": 16.119233498935415, + "grad_norm": 0.2862333655357361, + "learning_rate": 8.38877217885025e-05, + "loss": 0.04290721118450165, + "step": 113560 + }, + { + "epoch": 16.120652945351313, + "grad_norm": 0.8970960974693298, + "learning_rate": 8.388630234208659e-05, + "loss": 0.02014774680137634, + "step": 113570 + }, + { + "epoch": 16.12207239176721, + "grad_norm": 0.037683311849832535, + "learning_rate": 8.388488289567069e-05, + "loss": 0.050986915826797485, + "step": 113580 + }, + { + "epoch": 16.12349183818311, + "grad_norm": 6.2578959465026855, + "learning_rate": 8.38834634492548e-05, + "loss": 0.019598402082920074, + "step": 113590 + }, + { + "epoch": 16.124911284599005, + "grad_norm": 1.3776534795761108, + "learning_rate": 8.38820440028389e-05, + "loss": 0.04389630854129791, + "step": 113600 + }, + { + "epoch": 16.126330731014903, + "grad_norm": 3.2619071006774902, + "learning_rate": 8.388062455642301e-05, + "loss": 0.02302349656820297, + "step": 113610 + }, + { + "epoch": 16.1277501774308, + "grad_norm": 0.03281306102871895, + "learning_rate": 8.387920511000709e-05, + "loss": 0.03444663286209106, + "step": 113620 + }, + { + "epoch": 16.1291696238467, + "grad_norm": 0.4675664007663727, + "learning_rate": 8.38777856635912e-05, + "loss": 0.033840471506118776, + "step": 113630 + }, + { + "epoch": 16.130589070262598, + "grad_norm": 2.2576606273651123, + "learning_rate": 8.38763662171753e-05, + "loss": 0.012754887342453003, + "step": 113640 + }, + { + "epoch": 16.132008516678496, + "grad_norm": 0.26638638973236084, + "learning_rate": 8.387494677075941e-05, + "loss": 0.02814761996269226, + "step": 113650 + }, + { + "epoch": 16.133427963094395, + "grad_norm": 0.17130306363105774, + "learning_rate": 8.387352732434352e-05, + "loss": 0.009480338543653488, + "step": 113660 + }, + { + "epoch": 16.13484740951029, + "grad_norm": 1.0494219064712524, + "learning_rate": 8.38721078779276e-05, + "loss": 0.01807313859462738, + "step": 113670 + }, + { + "epoch": 
16.136266855926188, + "grad_norm": 4.024386405944824, + "learning_rate": 8.387068843151172e-05, + "loss": 0.018029528856277465, + "step": 113680 + }, + { + "epoch": 16.137686302342086, + "grad_norm": 0.07396601140499115, + "learning_rate": 8.386926898509581e-05, + "loss": 0.009591655433177948, + "step": 113690 + }, + { + "epoch": 16.139105748757984, + "grad_norm": 0.0040374561212956905, + "learning_rate": 8.386784953867992e-05, + "loss": 0.00962514728307724, + "step": 113700 + }, + { + "epoch": 16.140525195173883, + "grad_norm": 0.0394991971552372, + "learning_rate": 8.386643009226402e-05, + "loss": 0.01851954609155655, + "step": 113710 + }, + { + "epoch": 16.14194464158978, + "grad_norm": 10.12916088104248, + "learning_rate": 8.386501064584812e-05, + "loss": 0.0460242509841919, + "step": 113720 + }, + { + "epoch": 16.14336408800568, + "grad_norm": 1.8034098148345947, + "learning_rate": 8.386359119943222e-05, + "loss": 0.0432327538728714, + "step": 113730 + }, + { + "epoch": 16.144783534421574, + "grad_norm": 2.1534202098846436, + "learning_rate": 8.386217175301633e-05, + "loss": 0.02302303910255432, + "step": 113740 + }, + { + "epoch": 16.146202980837472, + "grad_norm": 0.3321599066257477, + "learning_rate": 8.386075230660044e-05, + "loss": 0.0073155477643013, + "step": 113750 + }, + { + "epoch": 16.14762242725337, + "grad_norm": 0.5808457732200623, + "learning_rate": 8.385933286018454e-05, + "loss": 0.016782888770103456, + "step": 113760 + }, + { + "epoch": 16.14904187366927, + "grad_norm": 6.599685192108154, + "learning_rate": 8.385791341376863e-05, + "loss": 0.0414692759513855, + "step": 113770 + }, + { + "epoch": 16.150461320085167, + "grad_norm": 5.038705348968506, + "learning_rate": 8.385649396735273e-05, + "loss": 0.022276198863983153, + "step": 113780 + }, + { + "epoch": 16.151880766501066, + "grad_norm": 0.011023037135601044, + "learning_rate": 8.385507452093684e-05, + "loss": 0.008303380012512207, + "step": 113790 + }, + { + "epoch": 16.153300212916964, 
+ "grad_norm": 0.7618648409843445, + "learning_rate": 8.385365507452094e-05, + "loss": 0.046433225274086, + "step": 113800 + }, + { + "epoch": 16.15471965933286, + "grad_norm": 0.15397347509860992, + "learning_rate": 8.385223562810505e-05, + "loss": 0.02768896520137787, + "step": 113810 + }, + { + "epoch": 16.156139105748757, + "grad_norm": 0.014487593434751034, + "learning_rate": 8.385081618168913e-05, + "loss": 0.05798448324203491, + "step": 113820 + }, + { + "epoch": 16.157558552164655, + "grad_norm": 0.8680633306503296, + "learning_rate": 8.384939673527324e-05, + "loss": 0.05073647499084473, + "step": 113830 + }, + { + "epoch": 16.158977998580554, + "grad_norm": 0.09245634078979492, + "learning_rate": 8.384797728885736e-05, + "loss": 0.016138990223407746, + "step": 113840 + }, + { + "epoch": 16.160397444996452, + "grad_norm": 0.0410037636756897, + "learning_rate": 8.384655784244145e-05, + "loss": 0.006852047145366668, + "step": 113850 + }, + { + "epoch": 16.16181689141235, + "grad_norm": 7.452047348022461, + "learning_rate": 8.384513839602556e-05, + "loss": 0.030913379788398743, + "step": 113860 + }, + { + "epoch": 16.16323633782825, + "grad_norm": 0.3431392312049866, + "learning_rate": 8.384371894960965e-05, + "loss": 0.010368605703115463, + "step": 113870 + }, + { + "epoch": 16.164655784244143, + "grad_norm": 0.02049533650279045, + "learning_rate": 8.384229950319376e-05, + "loss": 0.050056666135787964, + "step": 113880 + }, + { + "epoch": 16.16607523066004, + "grad_norm": 1.6124444007873535, + "learning_rate": 8.384088005677786e-05, + "loss": 0.006237022578716278, + "step": 113890 + }, + { + "epoch": 16.16749467707594, + "grad_norm": 0.058554697781801224, + "learning_rate": 8.383946061036197e-05, + "loss": 0.017270559072494508, + "step": 113900 + }, + { + "epoch": 16.168914123491838, + "grad_norm": 0.15180853009223938, + "learning_rate": 8.383804116394606e-05, + "loss": 0.014956575632095338, + "step": 113910 + }, + { + "epoch": 16.170333569907736, + 
"grad_norm": 0.9848091006278992, + "learning_rate": 8.383662171753018e-05, + "loss": 0.014708085358142853, + "step": 113920 + }, + { + "epoch": 16.171753016323635, + "grad_norm": 3.5492801666259766, + "learning_rate": 8.383520227111427e-05, + "loss": 0.017875519394874573, + "step": 113930 + }, + { + "epoch": 16.173172462739533, + "grad_norm": 0.6977704167366028, + "learning_rate": 8.383378282469837e-05, + "loss": 0.032053911685943605, + "step": 113940 + }, + { + "epoch": 16.174591909155428, + "grad_norm": 3.879185676574707, + "learning_rate": 8.383236337828248e-05, + "loss": 0.024631142616271973, + "step": 113950 + }, + { + "epoch": 16.176011355571326, + "grad_norm": 2.5624606609344482, + "learning_rate": 8.383094393186658e-05, + "loss": 0.028592270612716675, + "step": 113960 + }, + { + "epoch": 16.177430801987224, + "grad_norm": 0.04760686308145523, + "learning_rate": 8.382952448545069e-05, + "loss": 0.027992811799049378, + "step": 113970 + }, + { + "epoch": 16.178850248403123, + "grad_norm": 0.4733794033527374, + "learning_rate": 8.382810503903477e-05, + "loss": 0.004519284889101982, + "step": 113980 + }, + { + "epoch": 16.18026969481902, + "grad_norm": 4.214389324188232, + "learning_rate": 8.382668559261888e-05, + "loss": 0.033532992005348206, + "step": 113990 + }, + { + "epoch": 16.18168914123492, + "grad_norm": 0.059593670070171356, + "learning_rate": 8.382526614620298e-05, + "loss": 0.036214256286621095, + "step": 114000 + }, + { + "epoch": 16.18168914123492, + "eval_accuracy": 0.9774909391492338, + "eval_loss": 0.07867772877216339, + "eval_runtime": 32.61, + "eval_samples_per_second": 482.275, + "eval_steps_per_second": 15.087, + "step": 114000 + }, + { + "epoch": 16.183108587650818, + "grad_norm": 0.38623306155204773, + "learning_rate": 8.382384669978709e-05, + "loss": 0.05369272828102112, + "step": 114010 + }, + { + "epoch": 16.184528034066712, + "grad_norm": 2.5667450428009033, + "learning_rate": 8.382242725337119e-05, + "loss": 0.0825517237186432, + 
"step": 114020 + }, + { + "epoch": 16.18594748048261, + "grad_norm": 4.854561805725098, + "learning_rate": 8.382100780695529e-05, + "loss": 0.03476034104824066, + "step": 114030 + }, + { + "epoch": 16.18736692689851, + "grad_norm": 10.146092414855957, + "learning_rate": 8.38195883605394e-05, + "loss": 0.022970007359981538, + "step": 114040 + }, + { + "epoch": 16.188786373314407, + "grad_norm": 14.312685012817383, + "learning_rate": 8.38181689141235e-05, + "loss": 0.08640811443328858, + "step": 114050 + }, + { + "epoch": 16.190205819730306, + "grad_norm": 0.31443431973457336, + "learning_rate": 8.38167494677076e-05, + "loss": 0.0024741746485233305, + "step": 114060 + }, + { + "epoch": 16.191625266146204, + "grad_norm": 0.1664002686738968, + "learning_rate": 8.38153300212917e-05, + "loss": 0.014358796179294586, + "step": 114070 + }, + { + "epoch": 16.193044712562102, + "grad_norm": 0.33076465129852295, + "learning_rate": 8.38139105748758e-05, + "loss": 0.009447012841701508, + "step": 114080 + }, + { + "epoch": 16.194464158977997, + "grad_norm": 0.12226522713899612, + "learning_rate": 8.38124911284599e-05, + "loss": 0.050778812170028685, + "step": 114090 + }, + { + "epoch": 16.195883605393895, + "grad_norm": 2.3829030990600586, + "learning_rate": 8.381107168204401e-05, + "loss": 0.0029199857264757155, + "step": 114100 + }, + { + "epoch": 16.197303051809794, + "grad_norm": 0.021862417459487915, + "learning_rate": 8.38096522356281e-05, + "loss": 0.014643014967441558, + "step": 114110 + }, + { + "epoch": 16.198722498225692, + "grad_norm": 5.501955986022949, + "learning_rate": 8.380823278921222e-05, + "loss": 0.06353362798690795, + "step": 114120 + }, + { + "epoch": 16.20014194464159, + "grad_norm": 0.030185649171471596, + "learning_rate": 8.380681334279631e-05, + "loss": 0.007039766758680344, + "step": 114130 + }, + { + "epoch": 16.20156139105749, + "grad_norm": 14.978668212890625, + "learning_rate": 8.380539389638041e-05, + "loss": 0.021280562877655028, + "step": 114140 
+ }, + { + "epoch": 16.202980837473387, + "grad_norm": 0.18859194219112396, + "learning_rate": 8.380397444996452e-05, + "loss": 0.04804631173610687, + "step": 114150 + }, + { + "epoch": 16.20440028388928, + "grad_norm": 0.3008385896682739, + "learning_rate": 8.380255500354862e-05, + "loss": 0.020325048267841338, + "step": 114160 + }, + { + "epoch": 16.20581973030518, + "grad_norm": 0.050892528146505356, + "learning_rate": 8.380113555713273e-05, + "loss": 0.006553132832050323, + "step": 114170 + }, + { + "epoch": 16.207239176721078, + "grad_norm": 17.227819442749023, + "learning_rate": 8.379971611071682e-05, + "loss": 0.04515390396118164, + "step": 114180 + }, + { + "epoch": 16.208658623136976, + "grad_norm": 0.6563828587532043, + "learning_rate": 8.379829666430093e-05, + "loss": 0.04622732698917389, + "step": 114190 + }, + { + "epoch": 16.210078069552875, + "grad_norm": 0.5986719131469727, + "learning_rate": 8.379687721788502e-05, + "loss": 0.008560654520988465, + "step": 114200 + }, + { + "epoch": 16.211497515968773, + "grad_norm": 5.077998161315918, + "learning_rate": 8.379545777146913e-05, + "loss": 0.01993117332458496, + "step": 114210 + }, + { + "epoch": 16.21291696238467, + "grad_norm": 1.5840833187103271, + "learning_rate": 8.379403832505323e-05, + "loss": 0.040464064478874205, + "step": 114220 + }, + { + "epoch": 16.214336408800566, + "grad_norm": 1.6033178567886353, + "learning_rate": 8.379261887863733e-05, + "loss": 0.03339195549488068, + "step": 114230 + }, + { + "epoch": 16.215755855216464, + "grad_norm": 0.9904631972312927, + "learning_rate": 8.379119943222144e-05, + "loss": 0.054726976156234744, + "step": 114240 + }, + { + "epoch": 16.217175301632363, + "grad_norm": 5.803867340087891, + "learning_rate": 8.378977998580554e-05, + "loss": 0.014914526045322419, + "step": 114250 + }, + { + "epoch": 16.21859474804826, + "grad_norm": 9.74636459350586, + "learning_rate": 8.378836053938965e-05, + "loss": 0.054322832822799684, + "step": 114260 + }, + { + 
"epoch": 16.22001419446416, + "grad_norm": 0.028677405789494514, + "learning_rate": 8.378694109297375e-05, + "loss": 0.03889646232128143, + "step": 114270 + }, + { + "epoch": 16.221433640880058, + "grad_norm": 5.0605149269104, + "learning_rate": 8.378552164655786e-05, + "loss": 0.017887987196445465, + "step": 114280 + }, + { + "epoch": 16.222853087295956, + "grad_norm": 0.1755291372537613, + "learning_rate": 8.378410220014194e-05, + "loss": 0.01712374985218048, + "step": 114290 + }, + { + "epoch": 16.22427253371185, + "grad_norm": 0.1283310055732727, + "learning_rate": 8.378268275372605e-05, + "loss": 0.03941631317138672, + "step": 114300 + }, + { + "epoch": 16.22569198012775, + "grad_norm": 0.12080294638872147, + "learning_rate": 8.378126330731015e-05, + "loss": 0.03501139581203461, + "step": 114310 + }, + { + "epoch": 16.227111426543647, + "grad_norm": 0.21980896592140198, + "learning_rate": 8.377984386089426e-05, + "loss": 0.01078852266073227, + "step": 114320 + }, + { + "epoch": 16.228530872959546, + "grad_norm": 3.0699965953826904, + "learning_rate": 8.377842441447836e-05, + "loss": 0.09886000752449035, + "step": 114330 + }, + { + "epoch": 16.229950319375444, + "grad_norm": 0.06542843580245972, + "learning_rate": 8.377700496806245e-05, + "loss": 0.029817229509353636, + "step": 114340 + }, + { + "epoch": 16.231369765791342, + "grad_norm": 0.34764522314071655, + "learning_rate": 8.377558552164657e-05, + "loss": 0.0022600889205932617, + "step": 114350 + }, + { + "epoch": 16.23278921220724, + "grad_norm": 0.052265383303165436, + "learning_rate": 8.377416607523066e-05, + "loss": 0.047760218381881714, + "step": 114360 + }, + { + "epoch": 16.234208658623135, + "grad_norm": 6.4861931800842285, + "learning_rate": 8.377274662881477e-05, + "loss": 0.027695602178573607, + "step": 114370 + }, + { + "epoch": 16.235628105039034, + "grad_norm": 3.700791358947754, + "learning_rate": 8.377132718239887e-05, + "loss": 0.01770424097776413, + "step": 114380 + }, + { + "epoch": 
16.237047551454932, + "grad_norm": 5.992948055267334, + "learning_rate": 8.376990773598297e-05, + "loss": 0.03541705012321472, + "step": 114390 + }, + { + "epoch": 16.23846699787083, + "grad_norm": 2.1976397037506104, + "learning_rate": 8.376848828956707e-05, + "loss": 0.022951729595661163, + "step": 114400 + }, + { + "epoch": 16.23988644428673, + "grad_norm": 1.044998288154602, + "learning_rate": 8.376706884315118e-05, + "loss": 0.01119830459356308, + "step": 114410 + }, + { + "epoch": 16.241305890702627, + "grad_norm": 0.030515599995851517, + "learning_rate": 8.376564939673527e-05, + "loss": 0.024385225772857667, + "step": 114420 + }, + { + "epoch": 16.242725337118525, + "grad_norm": 2.9850716590881348, + "learning_rate": 8.376422995031939e-05, + "loss": 0.011873158812522887, + "step": 114430 + }, + { + "epoch": 16.24414478353442, + "grad_norm": 8.733880043029785, + "learning_rate": 8.376281050390348e-05, + "loss": 0.05018941760063171, + "step": 114440 + }, + { + "epoch": 16.24556422995032, + "grad_norm": 3.0520923137664795, + "learning_rate": 8.376139105748758e-05, + "loss": 0.029634937644004822, + "step": 114450 + }, + { + "epoch": 16.246983676366217, + "grad_norm": 0.42049431800842285, + "learning_rate": 8.375997161107169e-05, + "loss": 0.009971027076244355, + "step": 114460 + }, + { + "epoch": 16.248403122782115, + "grad_norm": 0.03036337159574032, + "learning_rate": 8.375855216465579e-05, + "loss": 0.01775699257850647, + "step": 114470 + }, + { + "epoch": 16.249822569198013, + "grad_norm": 5.403947353363037, + "learning_rate": 8.37571327182399e-05, + "loss": 0.041061696410179135, + "step": 114480 + }, + { + "epoch": 16.25124201561391, + "grad_norm": 0.026970932260155678, + "learning_rate": 8.375571327182398e-05, + "loss": 0.0037163462489843368, + "step": 114490 + }, + { + "epoch": 16.25266146202981, + "grad_norm": 0.009657211601734161, + "learning_rate": 8.37542938254081e-05, + "loss": 0.005383031442761421, + "step": 114500 + }, + { + "epoch": 
16.25266146202981, + "eval_accuracy": 0.9855662236917403, + "eval_loss": 0.053308937698602676, + "eval_runtime": 32.8268, + "eval_samples_per_second": 479.091, + "eval_steps_per_second": 14.988, + "step": 114500 + }, + { + "epoch": 16.254080908445705, + "grad_norm": 3.289928436279297, + "learning_rate": 8.375287437899219e-05, + "loss": 0.04177338480949402, + "step": 114510 + }, + { + "epoch": 16.255500354861603, + "grad_norm": 1.0733915567398071, + "learning_rate": 8.37514549325763e-05, + "loss": 0.006131677702069282, + "step": 114520 + }, + { + "epoch": 16.2569198012775, + "grad_norm": 5.283725261688232, + "learning_rate": 8.37500354861604e-05, + "loss": 0.0118058480322361, + "step": 114530 + }, + { + "epoch": 16.2583392476934, + "grad_norm": 0.19665656983852386, + "learning_rate": 8.37486160397445e-05, + "loss": 0.0037314273416996003, + "step": 114540 + }, + { + "epoch": 16.259758694109298, + "grad_norm": 0.6442742347717285, + "learning_rate": 8.374719659332861e-05, + "loss": 0.017784593999385832, + "step": 114550 + }, + { + "epoch": 16.261178140525196, + "grad_norm": 0.9198021292686462, + "learning_rate": 8.37457771469127e-05, + "loss": 0.027311056852340698, + "step": 114560 + }, + { + "epoch": 16.262597586941094, + "grad_norm": 0.5428899526596069, + "learning_rate": 8.374435770049682e-05, + "loss": 0.008149975538253784, + "step": 114570 + }, + { + "epoch": 16.26401703335699, + "grad_norm": 0.33625754714012146, + "learning_rate": 8.374293825408091e-05, + "loss": 0.014088863134384155, + "step": 114580 + }, + { + "epoch": 16.265436479772887, + "grad_norm": 5.278527736663818, + "learning_rate": 8.374151880766501e-05, + "loss": 0.02593829035758972, + "step": 114590 + }, + { + "epoch": 16.266855926188786, + "grad_norm": 0.8755744695663452, + "learning_rate": 8.374009936124911e-05, + "loss": 0.01804676800966263, + "step": 114600 + }, + { + "epoch": 16.268275372604684, + "grad_norm": 3.1491434574127197, + "learning_rate": 8.373867991483322e-05, + "loss": 
0.008193667232990264, + "step": 114610 + }, + { + "epoch": 16.269694819020582, + "grad_norm": 0.10345172882080078, + "learning_rate": 8.373726046841732e-05, + "loss": 0.03468181788921356, + "step": 114620 + }, + { + "epoch": 16.27111426543648, + "grad_norm": 10.353991508483887, + "learning_rate": 8.373584102200143e-05, + "loss": 0.02742985785007477, + "step": 114630 + }, + { + "epoch": 16.27253371185238, + "grad_norm": 0.13133634626865387, + "learning_rate": 8.373442157558553e-05, + "loss": 0.012948381900787353, + "step": 114640 + }, + { + "epoch": 16.273953158268274, + "grad_norm": 0.13561446964740753, + "learning_rate": 8.373300212916962e-05, + "loss": 0.03252634406089783, + "step": 114650 + }, + { + "epoch": 16.275372604684172, + "grad_norm": 0.01328427903354168, + "learning_rate": 8.373158268275373e-05, + "loss": 0.05154916048049927, + "step": 114660 + }, + { + "epoch": 16.27679205110007, + "grad_norm": 4.751542091369629, + "learning_rate": 8.373016323633783e-05, + "loss": 0.03669569492340088, + "step": 114670 + }, + { + "epoch": 16.27821149751597, + "grad_norm": 0.4849165976047516, + "learning_rate": 8.372874378992194e-05, + "loss": 0.006884780526161194, + "step": 114680 + }, + { + "epoch": 16.279630943931867, + "grad_norm": 2.379690647125244, + "learning_rate": 8.372732434350604e-05, + "loss": 0.009375137090682984, + "step": 114690 + }, + { + "epoch": 16.281050390347765, + "grad_norm": 1.0215786695480347, + "learning_rate": 8.372590489709014e-05, + "loss": 0.025673750042915344, + "step": 114700 + }, + { + "epoch": 16.282469836763664, + "grad_norm": 6.123466491699219, + "learning_rate": 8.372448545067423e-05, + "loss": 0.02046940624713898, + "step": 114710 + }, + { + "epoch": 16.28388928317956, + "grad_norm": 0.04557579383254051, + "learning_rate": 8.372306600425834e-05, + "loss": 0.011663874983787537, + "step": 114720 + }, + { + "epoch": 16.285308729595457, + "grad_norm": 2.435971736907959, + "learning_rate": 8.372164655784244e-05, + "loss": 
0.02507122457027435, + "step": 114730 + }, + { + "epoch": 16.286728176011355, + "grad_norm": 0.46252408623695374, + "learning_rate": 8.372022711142655e-05, + "loss": 0.0026042815297842026, + "step": 114740 + }, + { + "epoch": 16.288147622427253, + "grad_norm": 0.07082727551460266, + "learning_rate": 8.371880766501065e-05, + "loss": 0.014566010236740113, + "step": 114750 + }, + { + "epoch": 16.28956706884315, + "grad_norm": 3.0574235916137695, + "learning_rate": 8.371738821859475e-05, + "loss": 0.02677365243434906, + "step": 114760 + }, + { + "epoch": 16.29098651525905, + "grad_norm": 0.7447109222412109, + "learning_rate": 8.371596877217886e-05, + "loss": 0.007968011498451232, + "step": 114770 + }, + { + "epoch": 16.292405961674948, + "grad_norm": 5.180814743041992, + "learning_rate": 8.371454932576296e-05, + "loss": 0.020020869374275208, + "step": 114780 + }, + { + "epoch": 16.293825408090843, + "grad_norm": 3.603285789489746, + "learning_rate": 8.371312987934707e-05, + "loss": 0.029456543922424316, + "step": 114790 + }, + { + "epoch": 16.29524485450674, + "grad_norm": 0.4434371888637543, + "learning_rate": 8.371171043293115e-05, + "loss": 0.04266427755355835, + "step": 114800 + }, + { + "epoch": 16.29666430092264, + "grad_norm": 0.6044859886169434, + "learning_rate": 8.371029098651526e-05, + "loss": 0.04656971991062164, + "step": 114810 + }, + { + "epoch": 16.298083747338538, + "grad_norm": 0.04528965428471565, + "learning_rate": 8.370887154009936e-05, + "loss": 0.01092740148305893, + "step": 114820 + }, + { + "epoch": 16.299503193754436, + "grad_norm": 0.0253436379134655, + "learning_rate": 8.370745209368347e-05, + "loss": 0.04149647951126099, + "step": 114830 + }, + { + "epoch": 16.300922640170334, + "grad_norm": 0.33020728826522827, + "learning_rate": 8.370603264726757e-05, + "loss": 0.029522156715393065, + "step": 114840 + }, + { + "epoch": 16.302342086586233, + "grad_norm": 3.6205527782440186, + "learning_rate": 8.370461320085166e-05, + "loss": 
0.02586008906364441, + "step": 114850 + }, + { + "epoch": 16.303761533002127, + "grad_norm": 0.2821308970451355, + "learning_rate": 8.370319375443578e-05, + "loss": 0.013489672541618347, + "step": 114860 + }, + { + "epoch": 16.305180979418026, + "grad_norm": 0.031373679637908936, + "learning_rate": 8.370177430801987e-05, + "loss": 0.013849031925201417, + "step": 114870 + }, + { + "epoch": 16.306600425833924, + "grad_norm": 5.040038108825684, + "learning_rate": 8.370035486160398e-05, + "loss": 0.02151748687028885, + "step": 114880 + }, + { + "epoch": 16.308019872249822, + "grad_norm": 0.03968564793467522, + "learning_rate": 8.369893541518808e-05, + "loss": 0.024423137307167053, + "step": 114890 + }, + { + "epoch": 16.30943931866572, + "grad_norm": 0.42807242274284363, + "learning_rate": 8.369751596877218e-05, + "loss": 0.025084248185157774, + "step": 114900 + }, + { + "epoch": 16.31085876508162, + "grad_norm": 9.006585121154785, + "learning_rate": 8.369609652235628e-05, + "loss": 0.016592279076576233, + "step": 114910 + }, + { + "epoch": 16.312278211497517, + "grad_norm": 0.06665924936532974, + "learning_rate": 8.369467707594039e-05, + "loss": 0.020814579725265504, + "step": 114920 + }, + { + "epoch": 16.313697657913412, + "grad_norm": 7.368483066558838, + "learning_rate": 8.369325762952448e-05, + "loss": 0.022772680222988128, + "step": 114930 + }, + { + "epoch": 16.31511710432931, + "grad_norm": 0.04184337332844734, + "learning_rate": 8.36918381831086e-05, + "loss": 0.003722415864467621, + "step": 114940 + }, + { + "epoch": 16.31653655074521, + "grad_norm": 0.12398586422204971, + "learning_rate": 8.369041873669269e-05, + "loss": 0.014785408973693848, + "step": 114950 + }, + { + "epoch": 16.317955997161107, + "grad_norm": 7.1915764808654785, + "learning_rate": 8.368899929027679e-05, + "loss": 0.04960554540157318, + "step": 114960 + }, + { + "epoch": 16.319375443577005, + "grad_norm": 0.316967248916626, + "learning_rate": 8.36875798438609e-05, + "loss": 
0.08032472729682923, + "step": 114970 + }, + { + "epoch": 16.320794889992904, + "grad_norm": 0.14770057797431946, + "learning_rate": 8.3686160397445e-05, + "loss": 0.012001116573810578, + "step": 114980 + }, + { + "epoch": 16.322214336408802, + "grad_norm": 0.02392687276005745, + "learning_rate": 8.368474095102911e-05, + "loss": 0.03899045586585999, + "step": 114990 + }, + { + "epoch": 16.323633782824697, + "grad_norm": 0.5080669522285461, + "learning_rate": 8.36833215046132e-05, + "loss": 0.06272426843643189, + "step": 115000 + }, + { + "epoch": 16.323633782824697, + "eval_accuracy": 0.9827684873148089, + "eval_loss": 0.05823565647006035, + "eval_runtime": 33.363, + "eval_samples_per_second": 471.39, + "eval_steps_per_second": 14.747, + "step": 115000 + }, + { + "epoch": 16.325053229240595, + "grad_norm": 1.3705179691314697, + "learning_rate": 8.36819020581973e-05, + "loss": 0.03353260755538941, + "step": 115010 + }, + { + "epoch": 16.326472675656493, + "grad_norm": 0.10776552557945251, + "learning_rate": 8.36804826117814e-05, + "loss": 0.017255675792694092, + "step": 115020 + }, + { + "epoch": 16.32789212207239, + "grad_norm": 0.04744476079940796, + "learning_rate": 8.367906316536551e-05, + "loss": 0.025518766045570372, + "step": 115030 + }, + { + "epoch": 16.32931156848829, + "grad_norm": 1.8002848625183105, + "learning_rate": 8.367764371894961e-05, + "loss": 0.013162341713905335, + "step": 115040 + }, + { + "epoch": 16.330731014904188, + "grad_norm": 7.3443732261657715, + "learning_rate": 8.367622427253372e-05, + "loss": 0.01212470680475235, + "step": 115050 + }, + { + "epoch": 16.332150461320087, + "grad_norm": 2.0661675930023193, + "learning_rate": 8.367480482611782e-05, + "loss": 0.033673611283302304, + "step": 115060 + }, + { + "epoch": 16.33356990773598, + "grad_norm": 0.07441291958093643, + "learning_rate": 8.367338537970192e-05, + "loss": 0.0027853518724441527, + "step": 115070 + }, + { + "epoch": 16.33498935415188, + "grad_norm": 1.5034619569778442, + 
"learning_rate": 8.367196593328603e-05, + "loss": 0.008223128318786622, + "step": 115080 + }, + { + "epoch": 16.336408800567778, + "grad_norm": 0.20786134898662567, + "learning_rate": 8.367054648687012e-05, + "loss": 0.004812454804778099, + "step": 115090 + }, + { + "epoch": 16.337828246983676, + "grad_norm": 0.5978586673736572, + "learning_rate": 8.366912704045423e-05, + "loss": 0.027519500255584715, + "step": 115100 + }, + { + "epoch": 16.339247693399575, + "grad_norm": 0.2658475339412689, + "learning_rate": 8.366770759403832e-05, + "loss": 0.026401248574256898, + "step": 115110 + }, + { + "epoch": 16.340667139815473, + "grad_norm": 0.026537051424384117, + "learning_rate": 8.366628814762243e-05, + "loss": 0.026840895414352417, + "step": 115120 + }, + { + "epoch": 16.34208658623137, + "grad_norm": 0.2847868800163269, + "learning_rate": 8.366486870120653e-05, + "loss": 0.007615000009536743, + "step": 115130 + }, + { + "epoch": 16.343506032647266, + "grad_norm": 8.656590461730957, + "learning_rate": 8.366344925479064e-05, + "loss": 0.03938590884208679, + "step": 115140 + }, + { + "epoch": 16.344925479063164, + "grad_norm": 0.22449363768100739, + "learning_rate": 8.366202980837475e-05, + "loss": 0.00203959122300148, + "step": 115150 + }, + { + "epoch": 16.346344925479062, + "grad_norm": 0.11375907808542252, + "learning_rate": 8.366061036195883e-05, + "loss": 0.006970361620187759, + "step": 115160 + }, + { + "epoch": 16.34776437189496, + "grad_norm": 0.22546268999576569, + "learning_rate": 8.365919091554294e-05, + "loss": 0.00478266179561615, + "step": 115170 + }, + { + "epoch": 16.34918381831086, + "grad_norm": 0.8449538946151733, + "learning_rate": 8.365777146912704e-05, + "loss": 0.0027214929461479185, + "step": 115180 + }, + { + "epoch": 16.350603264726757, + "grad_norm": 0.7154591083526611, + "learning_rate": 8.365635202271115e-05, + "loss": 0.04068278968334198, + "step": 115190 + }, + { + "epoch": 16.352022711142656, + "grad_norm": 3.3361713886260986, + 
"learning_rate": 8.365493257629525e-05, + "loss": 0.006870243698358536, + "step": 115200 + }, + { + "epoch": 16.35344215755855, + "grad_norm": 11.387276649475098, + "learning_rate": 8.365351312987935e-05, + "loss": 0.024857263267040252, + "step": 115210 + }, + { + "epoch": 16.35486160397445, + "grad_norm": 1.0348623991012573, + "learning_rate": 8.365209368346344e-05, + "loss": 0.007273174822330475, + "step": 115220 + }, + { + "epoch": 16.356281050390347, + "grad_norm": 0.0277334563434124, + "learning_rate": 8.365067423704755e-05, + "loss": 0.005058244615793228, + "step": 115230 + }, + { + "epoch": 16.357700496806245, + "grad_norm": 0.16035868227481842, + "learning_rate": 8.364925479063167e-05, + "loss": 0.024697315692901612, + "step": 115240 + }, + { + "epoch": 16.359119943222144, + "grad_norm": 8.151751518249512, + "learning_rate": 8.364783534421576e-05, + "loss": 0.01324802339076996, + "step": 115250 + }, + { + "epoch": 16.360539389638042, + "grad_norm": 0.006740411277860403, + "learning_rate": 8.364641589779986e-05, + "loss": 0.009046432375907899, + "step": 115260 + }, + { + "epoch": 16.36195883605394, + "grad_norm": 9.100384712219238, + "learning_rate": 8.364499645138396e-05, + "loss": 0.03189237117767334, + "step": 115270 + }, + { + "epoch": 16.363378282469835, + "grad_norm": 0.5900508761405945, + "learning_rate": 8.364357700496807e-05, + "loss": 0.004971956834197044, + "step": 115280 + }, + { + "epoch": 16.364797728885733, + "grad_norm": 2.169576406478882, + "learning_rate": 8.364215755855217e-05, + "loss": 0.012622570991516114, + "step": 115290 + }, + { + "epoch": 16.36621717530163, + "grad_norm": 3.9675159454345703, + "learning_rate": 8.364073811213628e-05, + "loss": 0.04543294906616211, + "step": 115300 + }, + { + "epoch": 16.36763662171753, + "grad_norm": 0.21474087238311768, + "learning_rate": 8.363931866572036e-05, + "loss": 0.07644522786140442, + "step": 115310 + }, + { + "epoch": 16.36905606813343, + "grad_norm": 11.529156684875488, + "learning_rate": 
8.363789921930447e-05, + "loss": 0.08131443858146667, + "step": 115320 + }, + { + "epoch": 16.370475514549327, + "grad_norm": 0.2404913604259491, + "learning_rate": 8.363647977288858e-05, + "loss": 0.034796294569969174, + "step": 115330 + }, + { + "epoch": 16.371894960965225, + "grad_norm": 0.11170510947704315, + "learning_rate": 8.363506032647268e-05, + "loss": 0.0060304529964923855, + "step": 115340 + }, + { + "epoch": 16.37331440738112, + "grad_norm": 0.08581695705652237, + "learning_rate": 8.363364088005679e-05, + "loss": 0.018119427561759948, + "step": 115350 + }, + { + "epoch": 16.374733853797018, + "grad_norm": 0.657220184803009, + "learning_rate": 8.363222143364089e-05, + "loss": 0.015167847275733948, + "step": 115360 + }, + { + "epoch": 16.376153300212916, + "grad_norm": 0.07838866859674454, + "learning_rate": 8.363080198722499e-05, + "loss": 0.011263452470302582, + "step": 115370 + }, + { + "epoch": 16.377572746628815, + "grad_norm": 1.0153034925460815, + "learning_rate": 8.362938254080908e-05, + "loss": 0.005640817433595657, + "step": 115380 + }, + { + "epoch": 16.378992193044713, + "grad_norm": 0.6309021711349487, + "learning_rate": 8.36279630943932e-05, + "loss": 0.017265193164348602, + "step": 115390 + }, + { + "epoch": 16.38041163946061, + "grad_norm": 0.010063917376101017, + "learning_rate": 8.362654364797729e-05, + "loss": 0.019005651772022247, + "step": 115400 + }, + { + "epoch": 16.38183108587651, + "grad_norm": 0.26570451259613037, + "learning_rate": 8.36251242015614e-05, + "loss": 0.009477096050977707, + "step": 115410 + }, + { + "epoch": 16.383250532292404, + "grad_norm": 0.41002532839775085, + "learning_rate": 8.36237047551455e-05, + "loss": 0.008529486507177353, + "step": 115420 + }, + { + "epoch": 16.384669978708303, + "grad_norm": 14.01142692565918, + "learning_rate": 8.36222853087296e-05, + "loss": 0.028902316093444826, + "step": 115430 + }, + { + "epoch": 16.3860894251242, + "grad_norm": 0.2603977918624878, + "learning_rate": 
8.362086586231371e-05, + "loss": 0.004179652780294418, + "step": 115440 + }, + { + "epoch": 16.3875088715401, + "grad_norm": 1.5551576614379883, + "learning_rate": 8.36194464158978e-05, + "loss": 0.008667966723442078, + "step": 115450 + }, + { + "epoch": 16.388928317955997, + "grad_norm": 0.09633925557136536, + "learning_rate": 8.361802696948192e-05, + "loss": 0.031859517097473145, + "step": 115460 + }, + { + "epoch": 16.390347764371896, + "grad_norm": 7.917778491973877, + "learning_rate": 8.3616607523066e-05, + "loss": 0.019880032539367674, + "step": 115470 + }, + { + "epoch": 16.391767210787794, + "grad_norm": 0.0779050812125206, + "learning_rate": 8.36153300212917e-05, + "loss": 0.03968299627304077, + "step": 115480 + }, + { + "epoch": 16.39318665720369, + "grad_norm": 0.15087741613388062, + "learning_rate": 8.36139105748758e-05, + "loss": 0.016289661824703216, + "step": 115490 + }, + { + "epoch": 16.394606103619587, + "grad_norm": 0.2024311125278473, + "learning_rate": 8.361249112845991e-05, + "loss": 0.00186014324426651, + "step": 115500 + }, + { + "epoch": 16.394606103619587, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.049540504813194275, + "eval_runtime": 32.6337, + "eval_samples_per_second": 481.925, + "eval_steps_per_second": 15.076, + "step": 115500 + }, + { + "epoch": 16.396025550035485, + "grad_norm": 4.084429740905762, + "learning_rate": 8.3611071682044e-05, + "loss": 0.030351912975311278, + "step": 115510 + }, + { + "epoch": 16.397444996451384, + "grad_norm": 2.075824022293091, + "learning_rate": 8.360965223562812e-05, + "loss": 0.022257208824157715, + "step": 115520 + }, + { + "epoch": 16.398864442867282, + "grad_norm": 0.05323343724012375, + "learning_rate": 8.360823278921221e-05, + "loss": 0.009775599837303162, + "step": 115530 + }, + { + "epoch": 16.40028388928318, + "grad_norm": 0.227037250995636, + "learning_rate": 8.360681334279631e-05, + "loss": 0.01040622591972351, + "step": 115540 + }, + { + "epoch": 16.40170333569908, + 
"grad_norm": 0.36200225353240967, + "learning_rate": 8.360539389638041e-05, + "loss": 0.03365554213523865, + "step": 115550 + }, + { + "epoch": 16.403122782114973, + "grad_norm": 12.0408296585083, + "learning_rate": 8.360397444996452e-05, + "loss": 0.023565518856048583, + "step": 115560 + }, + { + "epoch": 16.40454222853087, + "grad_norm": 14.00751781463623, + "learning_rate": 8.360255500354862e-05, + "loss": 0.024163755774497985, + "step": 115570 + }, + { + "epoch": 16.40596167494677, + "grad_norm": 0.039277564734220505, + "learning_rate": 8.360113555713273e-05, + "loss": 0.03428987562656403, + "step": 115580 + }, + { + "epoch": 16.40738112136267, + "grad_norm": 9.152158737182617, + "learning_rate": 8.359971611071682e-05, + "loss": 0.06883146166801453, + "step": 115590 + }, + { + "epoch": 16.408800567778567, + "grad_norm": 0.1360839605331421, + "learning_rate": 8.359829666430092e-05, + "loss": 0.00723818838596344, + "step": 115600 + }, + { + "epoch": 16.410220014194465, + "grad_norm": 7.627287864685059, + "learning_rate": 8.359687721788503e-05, + "loss": 0.021432147920131685, + "step": 115610 + }, + { + "epoch": 16.411639460610363, + "grad_norm": 0.02474578469991684, + "learning_rate": 8.359545777146913e-05, + "loss": 0.011796319484710693, + "step": 115620 + }, + { + "epoch": 16.413058907026258, + "grad_norm": 2.2613930702209473, + "learning_rate": 8.359403832505324e-05, + "loss": 0.007085627317428589, + "step": 115630 + }, + { + "epoch": 16.414478353442156, + "grad_norm": 0.021743113175034523, + "learning_rate": 8.359261887863734e-05, + "loss": 0.011453683674335479, + "step": 115640 + }, + { + "epoch": 16.415897799858055, + "grad_norm": 0.01169249601662159, + "learning_rate": 8.359119943222144e-05, + "loss": 0.034659892320632935, + "step": 115650 + }, + { + "epoch": 16.417317246273953, + "grad_norm": 10.435478210449219, + "learning_rate": 8.358977998580553e-05, + "loss": 0.01766133904457092, + "step": 115660 + }, + { + "epoch": 16.41873669268985, + "grad_norm": 
0.17260213196277618, + "learning_rate": 8.358836053938964e-05, + "loss": 0.00581606812775135, + "step": 115670 + }, + { + "epoch": 16.42015613910575, + "grad_norm": 2.1862330436706543, + "learning_rate": 8.358694109297374e-05, + "loss": 0.04490639269351959, + "step": 115680 + }, + { + "epoch": 16.421575585521648, + "grad_norm": 0.038389623165130615, + "learning_rate": 8.358552164655785e-05, + "loss": 0.014942404627799988, + "step": 115690 + }, + { + "epoch": 16.422995031937543, + "grad_norm": 0.029459958896040916, + "learning_rate": 8.358410220014195e-05, + "loss": 0.03503639698028564, + "step": 115700 + }, + { + "epoch": 16.42441447835344, + "grad_norm": 0.2350374162197113, + "learning_rate": 8.358268275372605e-05, + "loss": 0.012763646245002747, + "step": 115710 + }, + { + "epoch": 16.42583392476934, + "grad_norm": 0.03455529734492302, + "learning_rate": 8.358126330731016e-05, + "loss": 0.022325176000595092, + "step": 115720 + }, + { + "epoch": 16.427253371185238, + "grad_norm": 0.20192380249500275, + "learning_rate": 8.357984386089426e-05, + "loss": 0.0248690664768219, + "step": 115730 + }, + { + "epoch": 16.428672817601136, + "grad_norm": 12.080421447753906, + "learning_rate": 8.357842441447837e-05, + "loss": 0.026702883839607238, + "step": 115740 + }, + { + "epoch": 16.430092264017034, + "grad_norm": 7.304847717285156, + "learning_rate": 8.357700496806245e-05, + "loss": 0.036014878749847413, + "step": 115750 + }, + { + "epoch": 16.431511710432932, + "grad_norm": 0.007642616052180529, + "learning_rate": 8.357558552164656e-05, + "loss": 0.023060834407806395, + "step": 115760 + }, + { + "epoch": 16.432931156848827, + "grad_norm": 1.2195990085601807, + "learning_rate": 8.357416607523066e-05, + "loss": 0.01461809128522873, + "step": 115770 + }, + { + "epoch": 16.434350603264726, + "grad_norm": 0.3150467574596405, + "learning_rate": 8.357274662881477e-05, + "loss": 0.015410137176513673, + "step": 115780 + }, + { + "epoch": 16.435770049680624, + "grad_norm": 
1.0256836414337158, + "learning_rate": 8.357132718239887e-05, + "loss": 0.03227716088294983, + "step": 115790 + }, + { + "epoch": 16.437189496096522, + "grad_norm": 0.051455672830343246, + "learning_rate": 8.356990773598296e-05, + "loss": 0.02644713819026947, + "step": 115800 + }, + { + "epoch": 16.43860894251242, + "grad_norm": 0.05469426512718201, + "learning_rate": 8.356848828956707e-05, + "loss": 0.02017551362514496, + "step": 115810 + }, + { + "epoch": 16.44002838892832, + "grad_norm": 0.6227673888206482, + "learning_rate": 8.356706884315117e-05, + "loss": 0.006663136184215546, + "step": 115820 + }, + { + "epoch": 16.441447835344217, + "grad_norm": 0.05501323565840721, + "learning_rate": 8.356564939673528e-05, + "loss": 0.03132602572441101, + "step": 115830 + }, + { + "epoch": 16.442867281760112, + "grad_norm": 6.319305419921875, + "learning_rate": 8.356422995031938e-05, + "loss": 0.024266751110553743, + "step": 115840 + }, + { + "epoch": 16.44428672817601, + "grad_norm": 1.0513020753860474, + "learning_rate": 8.356281050390348e-05, + "loss": 0.004631191864609719, + "step": 115850 + }, + { + "epoch": 16.44570617459191, + "grad_norm": 6.053133487701416, + "learning_rate": 8.356139105748758e-05, + "loss": 0.019884093105792998, + "step": 115860 + }, + { + "epoch": 16.447125621007807, + "grad_norm": 0.01819560118019581, + "learning_rate": 8.355997161107169e-05, + "loss": 0.009723077714443206, + "step": 115870 + }, + { + "epoch": 16.448545067423705, + "grad_norm": 0.7310961484909058, + "learning_rate": 8.355855216465578e-05, + "loss": 0.010221529006958007, + "step": 115880 + }, + { + "epoch": 16.449964513839603, + "grad_norm": 0.6194095611572266, + "learning_rate": 8.35571327182399e-05, + "loss": 0.0022616587579250337, + "step": 115890 + }, + { + "epoch": 16.4513839602555, + "grad_norm": 15.329816818237305, + "learning_rate": 8.355571327182399e-05, + "loss": 0.03254573047161102, + "step": 115900 + }, + { + "epoch": 16.4528034066714, + "grad_norm": 
0.5103831887245178, + "learning_rate": 8.355429382540809e-05, + "loss": 0.0019066516309976579, + "step": 115910 + }, + { + "epoch": 16.454222853087295, + "grad_norm": 0.43424859642982483, + "learning_rate": 8.35528743789922e-05, + "loss": 0.009562914073467255, + "step": 115920 + }, + { + "epoch": 16.455642299503193, + "grad_norm": 0.04635334387421608, + "learning_rate": 8.35514549325763e-05, + "loss": 0.019181308150291444, + "step": 115930 + }, + { + "epoch": 16.45706174591909, + "grad_norm": 5.84125280380249, + "learning_rate": 8.355003548616041e-05, + "loss": 0.06330277919769287, + "step": 115940 + }, + { + "epoch": 16.45848119233499, + "grad_norm": 0.1608128547668457, + "learning_rate": 8.354861603974449e-05, + "loss": 0.027855148911476134, + "step": 115950 + }, + { + "epoch": 16.459900638750888, + "grad_norm": 0.13251209259033203, + "learning_rate": 8.35471965933286e-05, + "loss": 0.011945770680904388, + "step": 115960 + }, + { + "epoch": 16.461320085166786, + "grad_norm": 0.12929560244083405, + "learning_rate": 8.35457771469127e-05, + "loss": 0.006080722063779831, + "step": 115970 + }, + { + "epoch": 16.462739531582685, + "grad_norm": 4.530337333679199, + "learning_rate": 8.354435770049681e-05, + "loss": 0.0706814706325531, + "step": 115980 + }, + { + "epoch": 16.46415897799858, + "grad_norm": 0.32479044795036316, + "learning_rate": 8.354293825408092e-05, + "loss": 0.05216479897499084, + "step": 115990 + }, + { + "epoch": 16.465578424414478, + "grad_norm": 0.01489038486033678, + "learning_rate": 8.354151880766502e-05, + "loss": 0.018127787113189697, + "step": 116000 + }, + { + "epoch": 16.465578424414478, + "eval_accuracy": 0.9861384879506581, + "eval_loss": 0.04969051852822304, + "eval_runtime": 32.3768, + "eval_samples_per_second": 485.749, + "eval_steps_per_second": 15.196, + "step": 116000 + }, + { + "epoch": 16.466997870830376, + "grad_norm": 0.02967519871890545, + "learning_rate": 8.354009936124912e-05, + "loss": 0.02046469897031784, + "step": 116010 + 
}, + { + "epoch": 16.468417317246274, + "grad_norm": 3.218904733657837, + "learning_rate": 8.353867991483321e-05, + "loss": 0.020984625816345213, + "step": 116020 + }, + { + "epoch": 16.469836763662173, + "grad_norm": 0.4643838703632355, + "learning_rate": 8.353726046841733e-05, + "loss": 0.06885659098625183, + "step": 116030 + }, + { + "epoch": 16.47125621007807, + "grad_norm": 14.610981941223145, + "learning_rate": 8.353584102200142e-05, + "loss": 0.07456240057945251, + "step": 116040 + }, + { + "epoch": 16.47267565649397, + "grad_norm": 11.100933074951172, + "learning_rate": 8.353442157558553e-05, + "loss": 0.04586609303951263, + "step": 116050 + }, + { + "epoch": 16.474095102909864, + "grad_norm": 0.10718601942062378, + "learning_rate": 8.353300212916962e-05, + "loss": 0.016160254180431367, + "step": 116060 + }, + { + "epoch": 16.475514549325762, + "grad_norm": 0.06104143708944321, + "learning_rate": 8.353158268275373e-05, + "loss": 0.023186489939689636, + "step": 116070 + }, + { + "epoch": 16.47693399574166, + "grad_norm": 6.335537433624268, + "learning_rate": 8.353016323633784e-05, + "loss": 0.016118551790714263, + "step": 116080 + }, + { + "epoch": 16.47835344215756, + "grad_norm": 0.003421552013605833, + "learning_rate": 8.352874378992194e-05, + "loss": 0.0062081929296255115, + "step": 116090 + }, + { + "epoch": 16.479772888573457, + "grad_norm": 4.670344829559326, + "learning_rate": 8.352732434350605e-05, + "loss": 0.021376237273216248, + "step": 116100 + }, + { + "epoch": 16.481192334989355, + "grad_norm": 5.064271926879883, + "learning_rate": 8.352590489709013e-05, + "loss": 0.055617785453796385, + "step": 116110 + }, + { + "epoch": 16.482611781405254, + "grad_norm": 10.445257186889648, + "learning_rate": 8.352448545067424e-05, + "loss": 0.029583734273910523, + "step": 116120 + }, + { + "epoch": 16.48403122782115, + "grad_norm": 0.3211866319179535, + "learning_rate": 8.352306600425834e-05, + "loss": 0.013366077840328217, + "step": 116130 + }, + { + 
"epoch": 16.485450674237047, + "grad_norm": 1.0787125825881958, + "learning_rate": 8.352164655784245e-05, + "loss": 0.03924538791179657, + "step": 116140 + }, + { + "epoch": 16.486870120652945, + "grad_norm": 0.2747179865837097, + "learning_rate": 8.352022711142655e-05, + "loss": 0.05656103491783142, + "step": 116150 + }, + { + "epoch": 16.488289567068843, + "grad_norm": 0.15979036688804626, + "learning_rate": 8.351880766501065e-05, + "loss": 0.032468026876449584, + "step": 116160 + }, + { + "epoch": 16.48970901348474, + "grad_norm": 5.031485080718994, + "learning_rate": 8.351738821859476e-05, + "loss": 0.006507077068090439, + "step": 116170 + }, + { + "epoch": 16.49112845990064, + "grad_norm": 0.0589463971555233, + "learning_rate": 8.351596877217885e-05, + "loss": 0.009158772230148316, + "step": 116180 + }, + { + "epoch": 16.49254790631654, + "grad_norm": 0.01717454195022583, + "learning_rate": 8.351454932576296e-05, + "loss": 0.014610305428504944, + "step": 116190 + }, + { + "epoch": 16.493967352732433, + "grad_norm": 0.45543742179870605, + "learning_rate": 8.351312987934706e-05, + "loss": 0.027995049953460693, + "step": 116200 + }, + { + "epoch": 16.49538679914833, + "grad_norm": 0.3700833320617676, + "learning_rate": 8.351171043293116e-05, + "loss": 0.040512260794639585, + "step": 116210 + }, + { + "epoch": 16.49680624556423, + "grad_norm": 1.1924240589141846, + "learning_rate": 8.351029098651526e-05, + "loss": 0.03392375409603119, + "step": 116220 + }, + { + "epoch": 16.498225691980128, + "grad_norm": 1.7394284009933472, + "learning_rate": 8.350887154009937e-05, + "loss": 0.010041067004203796, + "step": 116230 + }, + { + "epoch": 16.499645138396026, + "grad_norm": 0.04915410280227661, + "learning_rate": 8.350745209368347e-05, + "loss": 0.020868843793869017, + "step": 116240 + }, + { + "epoch": 16.501064584811925, + "grad_norm": 0.034044016152620316, + "learning_rate": 8.350603264726758e-05, + "loss": 0.013935981690883637, + "step": 116250 + }, + { + "epoch": 
16.502484031227823, + "grad_norm": 0.4427124261856079, + "learning_rate": 8.350461320085167e-05, + "loss": 0.03237035572528839, + "step": 116260 + }, + { + "epoch": 16.503903477643718, + "grad_norm": 0.012194049544632435, + "learning_rate": 8.350319375443577e-05, + "loss": 0.017374065518379212, + "step": 116270 + }, + { + "epoch": 16.505322924059616, + "grad_norm": 0.20120149850845337, + "learning_rate": 8.350177430801988e-05, + "loss": 0.02145771086215973, + "step": 116280 + }, + { + "epoch": 16.506742370475514, + "grad_norm": 0.022378822788596153, + "learning_rate": 8.350035486160398e-05, + "loss": 0.02700048089027405, + "step": 116290 + }, + { + "epoch": 16.508161816891413, + "grad_norm": 0.8439290523529053, + "learning_rate": 8.349893541518809e-05, + "loss": 0.02738071084022522, + "step": 116300 + }, + { + "epoch": 16.50958126330731, + "grad_norm": 0.1249089315533638, + "learning_rate": 8.349751596877217e-05, + "loss": 0.01525331735610962, + "step": 116310 + }, + { + "epoch": 16.51100070972321, + "grad_norm": 1.2921068668365479, + "learning_rate": 8.349609652235628e-05, + "loss": 0.022152553498744964, + "step": 116320 + }, + { + "epoch": 16.512420156139108, + "grad_norm": 4.518233776092529, + "learning_rate": 8.349467707594038e-05, + "loss": 0.030002409219741823, + "step": 116330 + }, + { + "epoch": 16.513839602555002, + "grad_norm": 3.4533886909484863, + "learning_rate": 8.349325762952449e-05, + "loss": 0.03558555543422699, + "step": 116340 + }, + { + "epoch": 16.5152590489709, + "grad_norm": 0.011736358515918255, + "learning_rate": 8.349183818310859e-05, + "loss": 0.05337393879890442, + "step": 116350 + }, + { + "epoch": 16.5166784953868, + "grad_norm": 0.0143458042293787, + "learning_rate": 8.34904187366927e-05, + "loss": 0.022696526348590852, + "step": 116360 + }, + { + "epoch": 16.518097941802697, + "grad_norm": 0.0822390615940094, + "learning_rate": 8.34889992902768e-05, + "loss": 0.023125678300857544, + "step": 116370 + }, + { + "epoch": 
16.519517388218595, + "grad_norm": 0.06752567738294601, + "learning_rate": 8.34875798438609e-05, + "loss": 0.02257518470287323, + "step": 116380 + }, + { + "epoch": 16.520936834634494, + "grad_norm": 11.136147499084473, + "learning_rate": 8.348616039744501e-05, + "loss": 0.023729005455970766, + "step": 116390 + }, + { + "epoch": 16.522356281050392, + "grad_norm": 0.02640192024409771, + "learning_rate": 8.34847409510291e-05, + "loss": 0.028013849258422853, + "step": 116400 + }, + { + "epoch": 16.523775727466287, + "grad_norm": 1.2313165664672852, + "learning_rate": 8.348332150461322e-05, + "loss": 0.004355183988809586, + "step": 116410 + }, + { + "epoch": 16.525195173882185, + "grad_norm": 0.11673381179571152, + "learning_rate": 8.34819020581973e-05, + "loss": 0.03289211392402649, + "step": 116420 + }, + { + "epoch": 16.526614620298083, + "grad_norm": 1.9656383991241455, + "learning_rate": 8.348048261178141e-05, + "loss": 0.03792242407798767, + "step": 116430 + }, + { + "epoch": 16.528034066713982, + "grad_norm": 0.4912014305591583, + "learning_rate": 8.347906316536551e-05, + "loss": 0.03354101479053497, + "step": 116440 + }, + { + "epoch": 16.52945351312988, + "grad_norm": 10.189749717712402, + "learning_rate": 8.347764371894962e-05, + "loss": 0.008781054615974426, + "step": 116450 + }, + { + "epoch": 16.53087295954578, + "grad_norm": 0.001697646570391953, + "learning_rate": 8.347622427253372e-05, + "loss": 0.004727036133408547, + "step": 116460 + }, + { + "epoch": 16.532292405961677, + "grad_norm": 0.20702528953552246, + "learning_rate": 8.347480482611781e-05, + "loss": 0.00879761427640915, + "step": 116470 + }, + { + "epoch": 16.53371185237757, + "grad_norm": 5.2699713706970215, + "learning_rate": 8.347338537970192e-05, + "loss": 0.012068004161119462, + "step": 116480 + }, + { + "epoch": 16.53513129879347, + "grad_norm": 2.320582389831543, + "learning_rate": 8.347196593328602e-05, + "loss": 0.005346919223666191, + "step": 116490 + }, + { + "epoch": 
16.536550745209368, + "grad_norm": 0.1566736102104187, + "learning_rate": 8.347054648687013e-05, + "loss": 0.010593122243881226, + "step": 116500 + }, + { + "epoch": 16.536550745209368, + "eval_accuracy": 0.9840401856679596, + "eval_loss": 0.061827003955841064, + "eval_runtime": 32.9233, + "eval_samples_per_second": 477.686, + "eval_steps_per_second": 14.944, + "step": 116500 + }, + { + "epoch": 16.537970191625266, + "grad_norm": 0.008619280532002449, + "learning_rate": 8.346912704045423e-05, + "loss": 0.0030029378831386566, + "step": 116510 + }, + { + "epoch": 16.539389638041165, + "grad_norm": 0.008758191019296646, + "learning_rate": 8.346770759403833e-05, + "loss": 0.02375074625015259, + "step": 116520 + }, + { + "epoch": 16.540809084457063, + "grad_norm": 1.1835864782333374, + "learning_rate": 8.346628814762242e-05, + "loss": 0.0320775032043457, + "step": 116530 + }, + { + "epoch": 16.54222853087296, + "grad_norm": 0.5120196342468262, + "learning_rate": 8.346486870120654e-05, + "loss": 0.009070900827646255, + "step": 116540 + }, + { + "epoch": 16.543647977288856, + "grad_norm": 0.06194985657930374, + "learning_rate": 8.346344925479063e-05, + "loss": 0.014806480705738067, + "step": 116550 + }, + { + "epoch": 16.545067423704754, + "grad_norm": 0.012789204716682434, + "learning_rate": 8.346202980837474e-05, + "loss": 0.014970606565475464, + "step": 116560 + }, + { + "epoch": 16.546486870120653, + "grad_norm": 0.2259586751461029, + "learning_rate": 8.346061036195884e-05, + "loss": 0.019840967655181885, + "step": 116570 + }, + { + "epoch": 16.54790631653655, + "grad_norm": 0.03640298917889595, + "learning_rate": 8.345919091554294e-05, + "loss": 0.0335048109292984, + "step": 116580 + }, + { + "epoch": 16.54932576295245, + "grad_norm": 1.2783691883087158, + "learning_rate": 8.345777146912705e-05, + "loss": 0.018758539855480195, + "step": 116590 + }, + { + "epoch": 16.550745209368348, + "grad_norm": 0.260236531496048, + "learning_rate": 8.345635202271115e-05, + "loss": 
0.04677286446094513, + "step": 116600 + }, + { + "epoch": 16.552164655784246, + "grad_norm": 0.854559063911438, + "learning_rate": 8.345493257629526e-05, + "loss": 0.01778472363948822, + "step": 116610 + }, + { + "epoch": 16.55358410220014, + "grad_norm": 17.11147117614746, + "learning_rate": 8.345351312987934e-05, + "loss": 0.057949680089950564, + "step": 116620 + }, + { + "epoch": 16.55500354861604, + "grad_norm": 4.914608001708984, + "learning_rate": 8.345209368346345e-05, + "loss": 0.006153997406363488, + "step": 116630 + }, + { + "epoch": 16.556422995031937, + "grad_norm": 0.3897269070148468, + "learning_rate": 8.345067423704755e-05, + "loss": 0.004015297442674637, + "step": 116640 + }, + { + "epoch": 16.557842441447836, + "grad_norm": 2.7007765769958496, + "learning_rate": 8.344925479063166e-05, + "loss": 0.006022479012608528, + "step": 116650 + }, + { + "epoch": 16.559261887863734, + "grad_norm": 3.9975502490997314, + "learning_rate": 8.344783534421576e-05, + "loss": 0.02496832013130188, + "step": 116660 + }, + { + "epoch": 16.560681334279632, + "grad_norm": 0.02255144529044628, + "learning_rate": 8.344641589779986e-05, + "loss": 0.0035631228238344193, + "step": 116670 + }, + { + "epoch": 16.56210078069553, + "grad_norm": 0.02827240154147148, + "learning_rate": 8.344499645138397e-05, + "loss": 0.006084645912051201, + "step": 116680 + }, + { + "epoch": 16.563520227111425, + "grad_norm": 1.6112371683120728, + "learning_rate": 8.344357700496806e-05, + "loss": 0.048271042108535764, + "step": 116690 + }, + { + "epoch": 16.564939673527324, + "grad_norm": 0.9190977811813354, + "learning_rate": 8.344215755855217e-05, + "loss": 0.03557129204273224, + "step": 116700 + }, + { + "epoch": 16.566359119943222, + "grad_norm": 0.5334121584892273, + "learning_rate": 8.344073811213627e-05, + "loss": 0.03682603240013123, + "step": 116710 + }, + { + "epoch": 16.56777856635912, + "grad_norm": 2.058285713195801, + "learning_rate": 8.343931866572038e-05, + "loss": 
0.012582701444625855, + "step": 116720 + }, + { + "epoch": 16.56919801277502, + "grad_norm": 7.025942325592041, + "learning_rate": 8.343789921930447e-05, + "loss": 0.03197660446166992, + "step": 116730 + }, + { + "epoch": 16.570617459190917, + "grad_norm": 3.6591897010803223, + "learning_rate": 8.343647977288858e-05, + "loss": 0.027104607224464415, + "step": 116740 + }, + { + "epoch": 16.572036905606815, + "grad_norm": 0.9631967544555664, + "learning_rate": 8.343506032647268e-05, + "loss": 0.015047216415405273, + "step": 116750 + }, + { + "epoch": 16.57345635202271, + "grad_norm": 0.07745859771966934, + "learning_rate": 8.343364088005679e-05, + "loss": 0.008823959529399872, + "step": 116760 + }, + { + "epoch": 16.574875798438608, + "grad_norm": 0.19734224677085876, + "learning_rate": 8.343222143364088e-05, + "loss": 0.02915046215057373, + "step": 116770 + }, + { + "epoch": 16.576295244854506, + "grad_norm": 0.09668063372373581, + "learning_rate": 8.343080198722498e-05, + "loss": 0.026794278621673585, + "step": 116780 + }, + { + "epoch": 16.577714691270405, + "grad_norm": 0.08512010425329208, + "learning_rate": 8.342938254080909e-05, + "loss": 0.009084032475948333, + "step": 116790 + }, + { + "epoch": 16.579134137686303, + "grad_norm": 0.07173248380422592, + "learning_rate": 8.342796309439319e-05, + "loss": 0.009995944797992706, + "step": 116800 + }, + { + "epoch": 16.5805535841022, + "grad_norm": 0.018395403400063515, + "learning_rate": 8.342668559261887e-05, + "loss": 0.049830347299575806, + "step": 116810 + }, + { + "epoch": 16.5819730305181, + "grad_norm": 0.7037346363067627, + "learning_rate": 8.342526614620299e-05, + "loss": 0.011580074578523636, + "step": 116820 + }, + { + "epoch": 16.583392476933994, + "grad_norm": 0.91822350025177, + "learning_rate": 8.34238466997871e-05, + "loss": 0.003994914516806602, + "step": 116830 + }, + { + "epoch": 16.584811923349893, + "grad_norm": 0.8074606657028198, + "learning_rate": 8.34224272533712e-05, + "loss": 
0.012307696789503098, + "step": 116840 + }, + { + "epoch": 16.58623136976579, + "grad_norm": 2.2989635467529297, + "learning_rate": 8.342100780695529e-05, + "loss": 0.013350567221641541, + "step": 116850 + }, + { + "epoch": 16.58765081618169, + "grad_norm": 1.6210821866989136, + "learning_rate": 8.341958836053939e-05, + "loss": 0.01092873364686966, + "step": 116860 + }, + { + "epoch": 16.589070262597588, + "grad_norm": 1.5248768329620361, + "learning_rate": 8.34181689141235e-05, + "loss": 0.014216583967208863, + "step": 116870 + }, + { + "epoch": 16.590489709013486, + "grad_norm": 0.028146639466285706, + "learning_rate": 8.34167494677076e-05, + "loss": 0.055502718687057494, + "step": 116880 + }, + { + "epoch": 16.591909155429384, + "grad_norm": 7.138301372528076, + "learning_rate": 8.341533002129171e-05, + "loss": 0.0446750670671463, + "step": 116890 + }, + { + "epoch": 16.59332860184528, + "grad_norm": 10.649222373962402, + "learning_rate": 8.341391057487579e-05, + "loss": 0.019469711184501647, + "step": 116900 + }, + { + "epoch": 16.594748048261177, + "grad_norm": 4.18618106842041, + "learning_rate": 8.34124911284599e-05, + "loss": 0.020049738883972167, + "step": 116910 + }, + { + "epoch": 16.596167494677076, + "grad_norm": 0.057533327490091324, + "learning_rate": 8.341107168204401e-05, + "loss": 0.0431816428899765, + "step": 116920 + }, + { + "epoch": 16.597586941092974, + "grad_norm": 0.42292356491088867, + "learning_rate": 8.340965223562811e-05, + "loss": 0.0034976493567228316, + "step": 116930 + }, + { + "epoch": 16.599006387508872, + "grad_norm": 0.04697134718298912, + "learning_rate": 8.340823278921222e-05, + "loss": 0.004483159631490707, + "step": 116940 + }, + { + "epoch": 16.60042583392477, + "grad_norm": 0.044272731989622116, + "learning_rate": 8.34068133427963e-05, + "loss": 0.038423961400985716, + "step": 116950 + }, + { + "epoch": 16.60184528034067, + "grad_norm": 3.4408552646636963, + "learning_rate": 8.340539389638042e-05, + "loss": 
0.07658275365829467, + "step": 116960 + }, + { + "epoch": 16.603264726756564, + "grad_norm": 12.716185569763184, + "learning_rate": 8.340397444996451e-05, + "loss": 0.04419266879558563, + "step": 116970 + }, + { + "epoch": 16.604684173172462, + "grad_norm": 5.369838714599609, + "learning_rate": 8.340255500354862e-05, + "loss": 0.02143881618976593, + "step": 116980 + }, + { + "epoch": 16.60610361958836, + "grad_norm": 0.009581176564097404, + "learning_rate": 8.340113555713272e-05, + "loss": 0.011463432759046554, + "step": 116990 + }, + { + "epoch": 16.60752306600426, + "grad_norm": 13.691011428833008, + "learning_rate": 8.339971611071682e-05, + "loss": 0.010283533483743668, + "step": 117000 + }, + { + "epoch": 16.60752306600426, + "eval_accuracy": 0.982895657150124, + "eval_loss": 0.06755528599023819, + "eval_runtime": 33.6337, + "eval_samples_per_second": 467.596, + "eval_steps_per_second": 14.628, + "step": 117000 + }, + { + "epoch": 16.608942512420157, + "grad_norm": 0.05318843573331833, + "learning_rate": 8.339829666430092e-05, + "loss": 0.00797543078660965, + "step": 117010 + }, + { + "epoch": 16.610361958836055, + "grad_norm": 8.886881828308105, + "learning_rate": 8.339687721788503e-05, + "loss": 0.04408339262008667, + "step": 117020 + }, + { + "epoch": 16.611781405251953, + "grad_norm": 12.320178031921387, + "learning_rate": 8.339545777146914e-05, + "loss": 0.04468807280063629, + "step": 117030 + }, + { + "epoch": 16.613200851667848, + "grad_norm": 0.2367558479309082, + "learning_rate": 8.339403832505324e-05, + "loss": 0.06928732991218567, + "step": 117040 + }, + { + "epoch": 16.614620298083747, + "grad_norm": 0.6578727960586548, + "learning_rate": 8.339261887863735e-05, + "loss": 0.006959295272827149, + "step": 117050 + }, + { + "epoch": 16.616039744499645, + "grad_norm": 4.500197410583496, + "learning_rate": 8.339119943222143e-05, + "loss": 0.008282274007797241, + "step": 117060 + }, + { + "epoch": 16.617459190915543, + "grad_norm": 0.07957390695810318, + 
"learning_rate": 8.338977998580554e-05, + "loss": 0.018632544577121733, + "step": 117070 + }, + { + "epoch": 16.61887863733144, + "grad_norm": 0.03085346519947052, + "learning_rate": 8.338836053938964e-05, + "loss": 0.018058757483959197, + "step": 117080 + }, + { + "epoch": 16.62029808374734, + "grad_norm": 2.6925551891326904, + "learning_rate": 8.338694109297375e-05, + "loss": 0.03852428793907166, + "step": 117090 + }, + { + "epoch": 16.621717530163238, + "grad_norm": 1.5794312953948975, + "learning_rate": 8.338552164655785e-05, + "loss": 0.012525925040245056, + "step": 117100 + }, + { + "epoch": 16.623136976579133, + "grad_norm": 0.02748379483819008, + "learning_rate": 8.338410220014194e-05, + "loss": 0.03372379541397095, + "step": 117110 + }, + { + "epoch": 16.62455642299503, + "grad_norm": 7.666720390319824, + "learning_rate": 8.338268275372606e-05, + "loss": 0.015265947580337525, + "step": 117120 + }, + { + "epoch": 16.62597586941093, + "grad_norm": 3.351346254348755, + "learning_rate": 8.338126330731015e-05, + "loss": 0.03214417695999146, + "step": 117130 + }, + { + "epoch": 16.627395315826828, + "grad_norm": 2.961798667907715, + "learning_rate": 8.337984386089426e-05, + "loss": 0.0034553803503513335, + "step": 117140 + }, + { + "epoch": 16.628814762242726, + "grad_norm": 0.037895407527685165, + "learning_rate": 8.337842441447836e-05, + "loss": 0.0046453546732664105, + "step": 117150 + }, + { + "epoch": 16.630234208658624, + "grad_norm": 0.02167833223938942, + "learning_rate": 8.337700496806246e-05, + "loss": 0.03771307468414307, + "step": 117160 + }, + { + "epoch": 16.631653655074523, + "grad_norm": 0.37683844566345215, + "learning_rate": 8.337558552164656e-05, + "loss": 0.014065866172313691, + "step": 117170 + }, + { + "epoch": 16.633073101490417, + "grad_norm": 0.1941373646259308, + "learning_rate": 8.337416607523067e-05, + "loss": 0.026754018664360047, + "step": 117180 + }, + { + "epoch": 16.634492547906316, + "grad_norm": 0.07540564239025116, + 
"learning_rate": 8.337274662881476e-05, + "loss": 0.029977530241012573, + "step": 117190 + }, + { + "epoch": 16.635911994322214, + "grad_norm": 0.07340825349092484, + "learning_rate": 8.337132718239888e-05, + "loss": 0.011456934362649917, + "step": 117200 + }, + { + "epoch": 16.637331440738112, + "grad_norm": 3.494420289993286, + "learning_rate": 8.336990773598297e-05, + "loss": 0.022322097420692445, + "step": 117210 + }, + { + "epoch": 16.63875088715401, + "grad_norm": 1.0809097290039062, + "learning_rate": 8.336848828956707e-05, + "loss": 0.01884945034980774, + "step": 117220 + }, + { + "epoch": 16.64017033356991, + "grad_norm": 3.570888042449951, + "learning_rate": 8.336706884315118e-05, + "loss": 0.009533677250146866, + "step": 117230 + }, + { + "epoch": 16.641589779985807, + "grad_norm": 0.03743152692914009, + "learning_rate": 8.336564939673528e-05, + "loss": 0.02686296999454498, + "step": 117240 + }, + { + "epoch": 16.643009226401702, + "grad_norm": 0.36875879764556885, + "learning_rate": 8.336422995031939e-05, + "loss": 0.03535675704479217, + "step": 117250 + }, + { + "epoch": 16.6444286728176, + "grad_norm": 1.0940033197402954, + "learning_rate": 8.336281050390347e-05, + "loss": 0.0029364045709371566, + "step": 117260 + }, + { + "epoch": 16.6458481192335, + "grad_norm": 4.938434600830078, + "learning_rate": 8.336139105748758e-05, + "loss": 0.023037466406822204, + "step": 117270 + }, + { + "epoch": 16.647267565649397, + "grad_norm": 0.1725301295518875, + "learning_rate": 8.335997161107168e-05, + "loss": 0.05853109955787659, + "step": 117280 + }, + { + "epoch": 16.648687012065295, + "grad_norm": 0.9875553846359253, + "learning_rate": 8.335855216465579e-05, + "loss": 0.028866416215896605, + "step": 117290 + }, + { + "epoch": 16.650106458481194, + "grad_norm": 0.014165399596095085, + "learning_rate": 8.335713271823989e-05, + "loss": 0.04174271821975708, + "step": 117300 + }, + { + "epoch": 16.651525904897092, + "grad_norm": 3.5790436267852783, + 
"learning_rate": 8.335571327182399e-05, + "loss": 0.015095795691013335, + "step": 117310 + }, + { + "epoch": 16.652945351312987, + "grad_norm": 0.6211684346199036, + "learning_rate": 8.33542938254081e-05, + "loss": 0.011040177941322327, + "step": 117320 + }, + { + "epoch": 16.654364797728885, + "grad_norm": 0.011647253297269344, + "learning_rate": 8.33528743789922e-05, + "loss": 0.03260062634944916, + "step": 117330 + }, + { + "epoch": 16.655784244144783, + "grad_norm": 3.1049857139587402, + "learning_rate": 8.33514549325763e-05, + "loss": 0.04729237854480743, + "step": 117340 + }, + { + "epoch": 16.65720369056068, + "grad_norm": 2.3427257537841797, + "learning_rate": 8.33500354861604e-05, + "loss": 0.04432548880577088, + "step": 117350 + }, + { + "epoch": 16.65862313697658, + "grad_norm": 5.978562355041504, + "learning_rate": 8.33486160397445e-05, + "loss": 0.03646363019943237, + "step": 117360 + }, + { + "epoch": 16.660042583392478, + "grad_norm": 12.11841106414795, + "learning_rate": 8.33471965933286e-05, + "loss": 0.0496336430311203, + "step": 117370 + }, + { + "epoch": 16.661462029808376, + "grad_norm": 11.358500480651855, + "learning_rate": 8.334577714691271e-05, + "loss": 0.03017066717147827, + "step": 117380 + }, + { + "epoch": 16.66288147622427, + "grad_norm": 0.1127539798617363, + "learning_rate": 8.33443577004968e-05, + "loss": 0.02873893976211548, + "step": 117390 + }, + { + "epoch": 16.66430092264017, + "grad_norm": 0.4423753023147583, + "learning_rate": 8.334293825408092e-05, + "loss": 0.004543831199407577, + "step": 117400 + }, + { + "epoch": 16.665720369056068, + "grad_norm": 0.3045527935028076, + "learning_rate": 8.334151880766501e-05, + "loss": 0.005070832371711731, + "step": 117410 + }, + { + "epoch": 16.667139815471966, + "grad_norm": 1.01546049118042, + "learning_rate": 8.334009936124911e-05, + "loss": 0.020494504272937773, + "step": 117420 + }, + { + "epoch": 16.668559261887864, + "grad_norm": 0.43640536069869995, + "learning_rate": 
8.333867991483322e-05, + "loss": 0.0030459832400083543, + "step": 117430 + }, + { + "epoch": 16.669978708303763, + "grad_norm": 4.950283527374268, + "learning_rate": 8.333726046841732e-05, + "loss": 0.05004417896270752, + "step": 117440 + }, + { + "epoch": 16.67139815471966, + "grad_norm": 8.34631061553955, + "learning_rate": 8.333584102200143e-05, + "loss": 0.009309016168117523, + "step": 117450 + }, + { + "epoch": 16.672817601135556, + "grad_norm": 1.3374550342559814, + "learning_rate": 8.333442157558553e-05, + "loss": 0.013670036196708679, + "step": 117460 + }, + { + "epoch": 16.674237047551454, + "grad_norm": 4.7788801193237305, + "learning_rate": 8.333300212916963e-05, + "loss": 0.03752616047859192, + "step": 117470 + }, + { + "epoch": 16.675656493967352, + "grad_norm": 0.3163778483867645, + "learning_rate": 8.333158268275372e-05, + "loss": 0.017105105519294738, + "step": 117480 + }, + { + "epoch": 16.67707594038325, + "grad_norm": 0.020877504721283913, + "learning_rate": 8.333016323633783e-05, + "loss": 0.03657255172729492, + "step": 117490 + }, + { + "epoch": 16.67849538679915, + "grad_norm": 0.011093349196016788, + "learning_rate": 8.332874378992193e-05, + "loss": 0.009079907834529877, + "step": 117500 + }, + { + "epoch": 16.67849538679915, + "eval_accuracy": 0.9876645259744389, + "eval_loss": 0.04434996098279953, + "eval_runtime": 32.6727, + "eval_samples_per_second": 481.35, + "eval_steps_per_second": 15.058, + "step": 117500 + }, + { + "epoch": 16.679914833215047, + "grad_norm": 0.21959349513053894, + "learning_rate": 8.332732434350604e-05, + "loss": 0.010873357951641082, + "step": 117510 + }, + { + "epoch": 16.681334279630946, + "grad_norm": 0.028821425512433052, + "learning_rate": 8.332590489709014e-05, + "loss": 0.01830779165029526, + "step": 117520 + }, + { + "epoch": 16.68275372604684, + "grad_norm": 0.00650831637904048, + "learning_rate": 8.332448545067424e-05, + "loss": 0.02057510018348694, + "step": 117530 + }, + { + "epoch": 16.68417317246274, + 
"grad_norm": 0.07420149445533752, + "learning_rate": 8.332306600425835e-05, + "loss": 0.02898067533969879, + "step": 117540 + }, + { + "epoch": 16.685592618878637, + "grad_norm": 0.13417570292949677, + "learning_rate": 8.332164655784245e-05, + "loss": 0.04370847940444946, + "step": 117550 + }, + { + "epoch": 16.687012065294535, + "grad_norm": 0.3044644892215729, + "learning_rate": 8.332022711142656e-05, + "loss": 0.040468630194664, + "step": 117560 + }, + { + "epoch": 16.688431511710434, + "grad_norm": 0.005468321498483419, + "learning_rate": 8.331880766501064e-05, + "loss": 0.03988224864006042, + "step": 117570 + }, + { + "epoch": 16.689850958126332, + "grad_norm": 1.0364768505096436, + "learning_rate": 8.331738821859475e-05, + "loss": 0.0045741118490695955, + "step": 117580 + }, + { + "epoch": 16.69127040454223, + "grad_norm": 9.827880859375, + "learning_rate": 8.331596877217885e-05, + "loss": 0.020103031396865846, + "step": 117590 + }, + { + "epoch": 16.692689850958125, + "grad_norm": 4.745016098022461, + "learning_rate": 8.331454932576296e-05, + "loss": 0.04226047396659851, + "step": 117600 + }, + { + "epoch": 16.694109297374023, + "grad_norm": 0.5448516607284546, + "learning_rate": 8.331312987934706e-05, + "loss": 0.016260065138339996, + "step": 117610 + }, + { + "epoch": 16.69552874378992, + "grad_norm": 0.011220871470868587, + "learning_rate": 8.331171043293115e-05, + "loss": 0.020905345678329468, + "step": 117620 + }, + { + "epoch": 16.69694819020582, + "grad_norm": 0.45287594199180603, + "learning_rate": 8.331029098651527e-05, + "loss": 0.01711665540933609, + "step": 117630 + }, + { + "epoch": 16.698367636621718, + "grad_norm": 3.3729403018951416, + "learning_rate": 8.330887154009936e-05, + "loss": 0.01988566517829895, + "step": 117640 + }, + { + "epoch": 16.699787083037616, + "grad_norm": 0.024243533611297607, + "learning_rate": 8.330745209368347e-05, + "loss": 0.0944155514240265, + "step": 117650 + }, + { + "epoch": 16.701206529453515, + "grad_norm": 
3.3614089488983154, + "learning_rate": 8.330603264726757e-05, + "loss": 0.013256210088729858, + "step": 117660 + }, + { + "epoch": 16.70262597586941, + "grad_norm": 0.16608507931232452, + "learning_rate": 8.330461320085167e-05, + "loss": 0.029009857773780824, + "step": 117670 + }, + { + "epoch": 16.704045422285308, + "grad_norm": 4.247665882110596, + "learning_rate": 8.330319375443577e-05, + "loss": 0.03191390335559845, + "step": 117680 + }, + { + "epoch": 16.705464868701206, + "grad_norm": 4.687053203582764, + "learning_rate": 8.330177430801988e-05, + "loss": 0.07247737050056458, + "step": 117690 + }, + { + "epoch": 16.706884315117104, + "grad_norm": 1.3678719997406006, + "learning_rate": 8.330035486160397e-05, + "loss": 0.016752901673316955, + "step": 117700 + }, + { + "epoch": 16.708303761533003, + "grad_norm": 2.391916275024414, + "learning_rate": 8.329893541518809e-05, + "loss": 0.015759438276290894, + "step": 117710 + }, + { + "epoch": 16.7097232079489, + "grad_norm": 0.06527828425168991, + "learning_rate": 8.329751596877218e-05, + "loss": 0.043265300989151004, + "step": 117720 + }, + { + "epoch": 16.7111426543648, + "grad_norm": 0.5923253297805786, + "learning_rate": 8.329609652235628e-05, + "loss": 0.02035187929868698, + "step": 117730 + }, + { + "epoch": 16.712562100780694, + "grad_norm": 9.604898452758789, + "learning_rate": 8.329467707594039e-05, + "loss": 0.01412811279296875, + "step": 117740 + }, + { + "epoch": 16.713981547196592, + "grad_norm": 2.5967063903808594, + "learning_rate": 8.329325762952449e-05, + "loss": 0.005281927064061165, + "step": 117750 + }, + { + "epoch": 16.71540099361249, + "grad_norm": 0.33082273602485657, + "learning_rate": 8.32918381831086e-05, + "loss": 0.017080453038215638, + "step": 117760 + }, + { + "epoch": 16.71682044002839, + "grad_norm": 0.6532198786735535, + "learning_rate": 8.32904187366927e-05, + "loss": 0.029032516479492187, + "step": 117770 + }, + { + "epoch": 16.718239886444287, + "grad_norm": 0.05124543979763985, 
+ "learning_rate": 8.32889992902768e-05, + "loss": 0.013310250639915467, + "step": 117780 + }, + { + "epoch": 16.719659332860186, + "grad_norm": 6.831470966339111, + "learning_rate": 8.328757984386089e-05, + "loss": 0.007946215569972992, + "step": 117790 + }, + { + "epoch": 16.721078779276084, + "grad_norm": 9.196264266967773, + "learning_rate": 8.3286160397445e-05, + "loss": 0.04549582004547119, + "step": 117800 + }, + { + "epoch": 16.72249822569198, + "grad_norm": 0.03787418454885483, + "learning_rate": 8.32847409510291e-05, + "loss": 0.018013326823711394, + "step": 117810 + }, + { + "epoch": 16.723917672107877, + "grad_norm": 0.1705215722322464, + "learning_rate": 8.328332150461321e-05, + "loss": 0.009039975702762604, + "step": 117820 + }, + { + "epoch": 16.725337118523775, + "grad_norm": 0.023820318281650543, + "learning_rate": 8.328190205819731e-05, + "loss": 0.010485945641994477, + "step": 117830 + }, + { + "epoch": 16.726756564939674, + "grad_norm": 0.03546424210071564, + "learning_rate": 8.32804826117814e-05, + "loss": 0.04599918127059936, + "step": 117840 + }, + { + "epoch": 16.728176011355572, + "grad_norm": 0.09232477098703384, + "learning_rate": 8.327906316536552e-05, + "loss": 0.005126936361193657, + "step": 117850 + }, + { + "epoch": 16.72959545777147, + "grad_norm": 0.22951701283454895, + "learning_rate": 8.327764371894961e-05, + "loss": 0.055786031484603885, + "step": 117860 + }, + { + "epoch": 16.73101490418737, + "grad_norm": 0.005998775362968445, + "learning_rate": 8.327622427253372e-05, + "loss": 0.04999669194221497, + "step": 117870 + }, + { + "epoch": 16.732434350603263, + "grad_norm": 0.004882345907390118, + "learning_rate": 8.327480482611781e-05, + "loss": 0.011063285171985626, + "step": 117880 + }, + { + "epoch": 16.73385379701916, + "grad_norm": 0.29720327258110046, + "learning_rate": 8.327338537970192e-05, + "loss": 0.0025852181017398832, + "step": 117890 + }, + { + "epoch": 16.73527324343506, + "grad_norm": 0.08615440875291824, + 
"learning_rate": 8.327196593328602e-05, + "loss": 0.0022923082113265993, + "step": 117900 + }, + { + "epoch": 16.73669268985096, + "grad_norm": 0.2930206060409546, + "learning_rate": 8.327054648687013e-05, + "loss": 0.028328916430473326, + "step": 117910 + }, + { + "epoch": 16.738112136266857, + "grad_norm": 0.052065297961235046, + "learning_rate": 8.326912704045423e-05, + "loss": 0.006777378171682358, + "step": 117920 + }, + { + "epoch": 16.739531582682755, + "grad_norm": 7.862561225891113, + "learning_rate": 8.326770759403832e-05, + "loss": 0.01529969722032547, + "step": 117930 + }, + { + "epoch": 16.740951029098653, + "grad_norm": 0.03088100254535675, + "learning_rate": 8.326628814762243e-05, + "loss": 0.053841644525527955, + "step": 117940 + }, + { + "epoch": 16.742370475514548, + "grad_norm": 3.4934494495391846, + "learning_rate": 8.326486870120653e-05, + "loss": 0.014754070341587067, + "step": 117950 + }, + { + "epoch": 16.743789921930446, + "grad_norm": 17.251235961914062, + "learning_rate": 8.326344925479064e-05, + "loss": 0.051452040672302246, + "step": 117960 + }, + { + "epoch": 16.745209368346345, + "grad_norm": 5.5792951583862305, + "learning_rate": 8.326202980837474e-05, + "loss": 0.03119921386241913, + "step": 117970 + }, + { + "epoch": 16.746628814762243, + "grad_norm": 0.1415969580411911, + "learning_rate": 8.326061036195884e-05, + "loss": 0.009391264617443084, + "step": 117980 + }, + { + "epoch": 16.74804826117814, + "grad_norm": 0.07763611525297165, + "learning_rate": 8.325919091554293e-05, + "loss": 0.022009353339672088, + "step": 117990 + }, + { + "epoch": 16.74946770759404, + "grad_norm": 0.29674217104911804, + "learning_rate": 8.325777146912704e-05, + "loss": 0.01453334093093872, + "step": 118000 + }, + { + "epoch": 16.74946770759404, + "eval_accuracy": 0.984103770585617, + "eval_loss": 0.06307372450828552, + "eval_runtime": 34.4236, + "eval_samples_per_second": 456.866, + "eval_steps_per_second": 14.293, + "step": 118000 + }, + { + "epoch": 
16.750887154009938, + "grad_norm": 1.6057296991348267, + "learning_rate": 8.325635202271114e-05, + "loss": 0.010459934175014497, + "step": 118010 + }, + { + "epoch": 16.752306600425833, + "grad_norm": 0.00853043794631958, + "learning_rate": 8.325493257629525e-05, + "loss": 0.017493090033531188, + "step": 118020 + }, + { + "epoch": 16.75372604684173, + "grad_norm": 0.8278746604919434, + "learning_rate": 8.325351312987935e-05, + "loss": 0.024293732643127442, + "step": 118030 + }, + { + "epoch": 16.75514549325763, + "grad_norm": 0.06697604060173035, + "learning_rate": 8.325209368346345e-05, + "loss": 0.014129532873630524, + "step": 118040 + }, + { + "epoch": 16.756564939673527, + "grad_norm": 1.1142808198928833, + "learning_rate": 8.325067423704756e-05, + "loss": 0.008899647742509842, + "step": 118050 + }, + { + "epoch": 16.757984386089426, + "grad_norm": 0.6534438729286194, + "learning_rate": 8.324925479063166e-05, + "loss": 0.0418839693069458, + "step": 118060 + }, + { + "epoch": 16.759403832505324, + "grad_norm": 0.8519339561462402, + "learning_rate": 8.324783534421577e-05, + "loss": 0.014938108623027802, + "step": 118070 + }, + { + "epoch": 16.760823278921222, + "grad_norm": 0.3277834951877594, + "learning_rate": 8.324641589779985e-05, + "loss": 0.02368403822183609, + "step": 118080 + }, + { + "epoch": 16.762242725337117, + "grad_norm": 0.11566946655511856, + "learning_rate": 8.324499645138396e-05, + "loss": 0.04180347919464111, + "step": 118090 + }, + { + "epoch": 16.763662171753015, + "grad_norm": 1.1494996547698975, + "learning_rate": 8.324357700496806e-05, + "loss": 0.0094342440366745, + "step": 118100 + }, + { + "epoch": 16.765081618168914, + "grad_norm": 0.013919823803007603, + "learning_rate": 8.324215755855217e-05, + "loss": 0.06293154954910278, + "step": 118110 + }, + { + "epoch": 16.766501064584812, + "grad_norm": 4.400853157043457, + "learning_rate": 8.324073811213627e-05, + "loss": 0.027945888042449952, + "step": 118120 + }, + { + "epoch": 
16.76792051100071, + "grad_norm": 2.673232078552246, + "learning_rate": 8.323931866572038e-05, + "loss": 0.02772565484046936, + "step": 118130 + }, + { + "epoch": 16.76933995741661, + "grad_norm": 0.4370087683200836, + "learning_rate": 8.323789921930448e-05, + "loss": 0.04705857038497925, + "step": 118140 + }, + { + "epoch": 16.770759403832507, + "grad_norm": 0.4404630661010742, + "learning_rate": 8.323647977288857e-05, + "loss": 0.01638329178094864, + "step": 118150 + }, + { + "epoch": 16.7721788502484, + "grad_norm": 14.777122497558594, + "learning_rate": 8.323506032647268e-05, + "loss": 0.015768852829933167, + "step": 118160 + }, + { + "epoch": 16.7735982966643, + "grad_norm": 0.024510599672794342, + "learning_rate": 8.323364088005678e-05, + "loss": 0.03490220904350281, + "step": 118170 + }, + { + "epoch": 16.7750177430802, + "grad_norm": 0.20726227760314941, + "learning_rate": 8.323222143364089e-05, + "loss": 0.017764410376548766, + "step": 118180 + }, + { + "epoch": 16.776437189496097, + "grad_norm": 0.646827757358551, + "learning_rate": 8.323080198722498e-05, + "loss": 0.006039583683013916, + "step": 118190 + }, + { + "epoch": 16.777856635911995, + "grad_norm": 7.485454082489014, + "learning_rate": 8.322938254080909e-05, + "loss": 0.014593130350112915, + "step": 118200 + }, + { + "epoch": 16.779276082327893, + "grad_norm": 0.013658017851412296, + "learning_rate": 8.322796309439318e-05, + "loss": 0.0015197057276964188, + "step": 118210 + }, + { + "epoch": 16.78069552874379, + "grad_norm": 0.10143446922302246, + "learning_rate": 8.32265436479773e-05, + "loss": 0.036323907971382144, + "step": 118220 + }, + { + "epoch": 16.782114975159686, + "grad_norm": 0.6719693541526794, + "learning_rate": 8.32251242015614e-05, + "loss": 0.013683655858039856, + "step": 118230 + }, + { + "epoch": 16.783534421575585, + "grad_norm": 2.3426082134246826, + "learning_rate": 8.322370475514549e-05, + "loss": 0.028924247622489928, + "step": 118240 + }, + { + "epoch": 
16.784953867991483, + "grad_norm": 18.29952621459961, + "learning_rate": 8.32222853087296e-05, + "loss": 0.030454546213150024, + "step": 118250 + }, + { + "epoch": 16.78637331440738, + "grad_norm": 0.7703476548194885, + "learning_rate": 8.32208658623137e-05, + "loss": 0.021138817071914673, + "step": 118260 + }, + { + "epoch": 16.78779276082328, + "grad_norm": 0.6782881617546082, + "learning_rate": 8.321944641589781e-05, + "loss": 0.05559571981430054, + "step": 118270 + }, + { + "epoch": 16.789212207239178, + "grad_norm": 3.4025721549987793, + "learning_rate": 8.32180269694819e-05, + "loss": 0.03414316773414612, + "step": 118280 + }, + { + "epoch": 16.790631653655076, + "grad_norm": 9.724266052246094, + "learning_rate": 8.3216607523066e-05, + "loss": 0.014724119007587433, + "step": 118290 + }, + { + "epoch": 16.79205110007097, + "grad_norm": 0.03437022492289543, + "learning_rate": 8.32151880766501e-05, + "loss": 0.025532713532447814, + "step": 118300 + }, + { + "epoch": 16.79347054648687, + "grad_norm": 4.5364766120910645, + "learning_rate": 8.321376863023421e-05, + "loss": 0.009097173064947128, + "step": 118310 + }, + { + "epoch": 16.794889992902768, + "grad_norm": 0.1253851056098938, + "learning_rate": 8.321234918381832e-05, + "loss": 0.014451992511749268, + "step": 118320 + }, + { + "epoch": 16.796309439318666, + "grad_norm": 1.6803902387619019, + "learning_rate": 8.321092973740242e-05, + "loss": 0.039790353178977965, + "step": 118330 + }, + { + "epoch": 16.797728885734564, + "grad_norm": 10.777732849121094, + "learning_rate": 8.320951029098652e-05, + "loss": 0.026776975393295287, + "step": 118340 + }, + { + "epoch": 16.799148332150462, + "grad_norm": 0.556053876876831, + "learning_rate": 8.320809084457062e-05, + "loss": 0.04346528351306915, + "step": 118350 + }, + { + "epoch": 16.80056777856636, + "grad_norm": 1.746227741241455, + "learning_rate": 8.320667139815473e-05, + "loss": 0.017362989485263824, + "step": 118360 + }, + { + "epoch": 16.801987224982255, + 
"grad_norm": 0.28359484672546387, + "learning_rate": 8.320525195173882e-05, + "loss": 0.05255056619644165, + "step": 118370 + }, + { + "epoch": 16.803406671398154, + "grad_norm": 0.07087396085262299, + "learning_rate": 8.320383250532293e-05, + "loss": 0.0291361004114151, + "step": 118380 + }, + { + "epoch": 16.804826117814052, + "grad_norm": 6.125046730041504, + "learning_rate": 8.320241305890702e-05, + "loss": 0.0548758864402771, + "step": 118390 + }, + { + "epoch": 16.80624556422995, + "grad_norm": 0.7101525068283081, + "learning_rate": 8.320099361249113e-05, + "loss": 0.015115344524383545, + "step": 118400 + }, + { + "epoch": 16.80766501064585, + "grad_norm": 1.2516734600067139, + "learning_rate": 8.319957416607524e-05, + "loss": 0.03866508603096008, + "step": 118410 + }, + { + "epoch": 16.809084457061747, + "grad_norm": 0.5947251319885254, + "learning_rate": 8.319815471965934e-05, + "loss": 0.025516587495803832, + "step": 118420 + }, + { + "epoch": 16.810503903477645, + "grad_norm": 0.09165836125612259, + "learning_rate": 8.319673527324345e-05, + "loss": 0.018566188216209412, + "step": 118430 + }, + { + "epoch": 16.81192334989354, + "grad_norm": 0.45714905858039856, + "learning_rate": 8.319531582682755e-05, + "loss": 0.026043158769607545, + "step": 118440 + }, + { + "epoch": 16.81334279630944, + "grad_norm": 0.05650230124592781, + "learning_rate": 8.319389638041164e-05, + "loss": 0.0274466872215271, + "step": 118450 + }, + { + "epoch": 16.814762242725337, + "grad_norm": 0.006217160262167454, + "learning_rate": 8.319247693399574e-05, + "loss": 0.03689888119697571, + "step": 118460 + }, + { + "epoch": 16.816181689141235, + "grad_norm": 1.209546685218811, + "learning_rate": 8.319105748757985e-05, + "loss": 0.004774202033877373, + "step": 118470 + }, + { + "epoch": 16.817601135557133, + "grad_norm": 1.517468810081482, + "learning_rate": 8.318963804116395e-05, + "loss": 0.016903454065322877, + "step": 118480 + }, + { + "epoch": 16.81902058197303, + "grad_norm": 
0.11526894569396973, + "learning_rate": 8.318821859474806e-05, + "loss": 0.039901772141456605, + "step": 118490 + }, + { + "epoch": 16.82044002838893, + "grad_norm": 0.47346436977386475, + "learning_rate": 8.318679914833216e-05, + "loss": 0.04827032089233398, + "step": 118500 + }, + { + "epoch": 16.82044002838893, + "eval_accuracy": 0.9791441470083296, + "eval_loss": 0.08274804055690765, + "eval_runtime": 32.5356, + "eval_samples_per_second": 483.378, + "eval_steps_per_second": 15.122, + "step": 118500 + }, + { + "epoch": 16.821859474804825, + "grad_norm": 0.843011736869812, + "learning_rate": 8.318537970191625e-05, + "loss": 0.029598337411880494, + "step": 118510 + }, + { + "epoch": 16.823278921220723, + "grad_norm": 7.9473371505737305, + "learning_rate": 8.318396025550037e-05, + "loss": 0.02198069393634796, + "step": 118520 + }, + { + "epoch": 16.82469836763662, + "grad_norm": 0.07339850068092346, + "learning_rate": 8.318254080908446e-05, + "loss": 0.011041966825723648, + "step": 118530 + }, + { + "epoch": 16.82611781405252, + "grad_norm": 0.6384608745574951, + "learning_rate": 8.318112136266857e-05, + "loss": 0.04286980330944061, + "step": 118540 + }, + { + "epoch": 16.827537260468418, + "grad_norm": 10.388303756713867, + "learning_rate": 8.317970191625266e-05, + "loss": 0.032366585731506345, + "step": 118550 + }, + { + "epoch": 16.828956706884316, + "grad_norm": 0.1697257161140442, + "learning_rate": 8.317828246983677e-05, + "loss": 0.06269558072090149, + "step": 118560 + }, + { + "epoch": 16.830376153300215, + "grad_norm": 0.03282211720943451, + "learning_rate": 8.317686302342087e-05, + "loss": 0.005208947509527206, + "step": 118570 + }, + { + "epoch": 16.83179559971611, + "grad_norm": 0.026220278814435005, + "learning_rate": 8.317544357700498e-05, + "loss": 0.010369515419006348, + "step": 118580 + }, + { + "epoch": 16.833215046132008, + "grad_norm": 0.057037319988012314, + "learning_rate": 8.317402413058907e-05, + "loss": 0.05828549861907959, + "step": 118590 
+ }, + { + "epoch": 16.834634492547906, + "grad_norm": 0.006431225221604109, + "learning_rate": 8.317260468417317e-05, + "loss": 0.00878979116678238, + "step": 118600 + }, + { + "epoch": 16.836053938963804, + "grad_norm": 3.5843164920806885, + "learning_rate": 8.317118523775728e-05, + "loss": 0.007160958647727966, + "step": 118610 + }, + { + "epoch": 16.837473385379703, + "grad_norm": 8.680359840393066, + "learning_rate": 8.316976579134138e-05, + "loss": 0.04745030403137207, + "step": 118620 + }, + { + "epoch": 16.8388928317956, + "grad_norm": 0.028621600940823555, + "learning_rate": 8.316834634492549e-05, + "loss": 0.01384253203868866, + "step": 118630 + }, + { + "epoch": 16.8403122782115, + "grad_norm": 6.854605197906494, + "learning_rate": 8.316692689850959e-05, + "loss": 0.0036096815019845963, + "step": 118640 + }, + { + "epoch": 16.841731724627394, + "grad_norm": 0.07637093961238861, + "learning_rate": 8.316550745209369e-05, + "loss": 0.0033883821219205857, + "step": 118650 + }, + { + "epoch": 16.843151171043292, + "grad_norm": 0.10442734509706497, + "learning_rate": 8.316408800567778e-05, + "loss": 0.002956448122859001, + "step": 118660 + }, + { + "epoch": 16.84457061745919, + "grad_norm": 0.02588530443608761, + "learning_rate": 8.31626685592619e-05, + "loss": 0.03580034673213959, + "step": 118670 + }, + { + "epoch": 16.84599006387509, + "grad_norm": 0.3507205545902252, + "learning_rate": 8.316124911284599e-05, + "loss": 0.02349122762680054, + "step": 118680 + }, + { + "epoch": 16.847409510290987, + "grad_norm": 0.3434644937515259, + "learning_rate": 8.31598296664301e-05, + "loss": 0.01267555058002472, + "step": 118690 + }, + { + "epoch": 16.848828956706885, + "grad_norm": 6.556180000305176, + "learning_rate": 8.31584102200142e-05, + "loss": 0.006739020347595215, + "step": 118700 + }, + { + "epoch": 16.850248403122784, + "grad_norm": 0.01769874058663845, + "learning_rate": 8.31569907735983e-05, + "loss": 0.001453077420592308, + "step": 118710 + }, + { + 
"epoch": 16.85166784953868, + "grad_norm": 0.01593855768442154, + "learning_rate": 8.315557132718241e-05, + "loss": 0.02313033491373062, + "step": 118720 + }, + { + "epoch": 16.853087295954577, + "grad_norm": 0.7602181434631348, + "learning_rate": 8.31541518807665e-05, + "loss": 0.005456413701176643, + "step": 118730 + }, + { + "epoch": 16.854506742370475, + "grad_norm": 7.872560024261475, + "learning_rate": 8.315273243435062e-05, + "loss": 0.007511251419782638, + "step": 118740 + }, + { + "epoch": 16.855926188786373, + "grad_norm": 0.27230358123779297, + "learning_rate": 8.31513129879347e-05, + "loss": 0.02225078046321869, + "step": 118750 + }, + { + "epoch": 16.85734563520227, + "grad_norm": 1.2540390491485596, + "learning_rate": 8.314989354151881e-05, + "loss": 0.010367317497730255, + "step": 118760 + }, + { + "epoch": 16.85876508161817, + "grad_norm": 0.20370694994926453, + "learning_rate": 8.314847409510291e-05, + "loss": 0.007272961735725403, + "step": 118770 + }, + { + "epoch": 16.86018452803407, + "grad_norm": 0.1761959195137024, + "learning_rate": 8.314705464868702e-05, + "loss": 0.004762361571192741, + "step": 118780 + }, + { + "epoch": 16.861603974449963, + "grad_norm": 0.09174513816833496, + "learning_rate": 8.314563520227112e-05, + "loss": 0.014287218451499939, + "step": 118790 + }, + { + "epoch": 16.86302342086586, + "grad_norm": 1.3199766874313354, + "learning_rate": 8.314421575585523e-05, + "loss": 0.012278030812740325, + "step": 118800 + }, + { + "epoch": 16.86444286728176, + "grad_norm": 0.07524289190769196, + "learning_rate": 8.314279630943933e-05, + "loss": 0.0019908275455236433, + "step": 118810 + }, + { + "epoch": 16.865862313697658, + "grad_norm": 0.8612643480300903, + "learning_rate": 8.314137686302342e-05, + "loss": 0.010582181811332702, + "step": 118820 + }, + { + "epoch": 16.867281760113556, + "grad_norm": 0.04525861144065857, + "learning_rate": 8.313995741660753e-05, + "loss": 0.0019207149744033813, + "step": 118830 + }, + { + "epoch": 
16.868701206529455, + "grad_norm": 0.05441594496369362, + "learning_rate": 8.313853797019163e-05, + "loss": 0.0021228346973657607, + "step": 118840 + }, + { + "epoch": 16.870120652945353, + "grad_norm": 2.334911584854126, + "learning_rate": 8.313711852377574e-05, + "loss": 0.004405818507075309, + "step": 118850 + }, + { + "epoch": 16.871540099361248, + "grad_norm": 9.895645141601562, + "learning_rate": 8.313569907735983e-05, + "loss": 0.018842428922653198, + "step": 118860 + }, + { + "epoch": 16.872959545777146, + "grad_norm": 1.2320467233657837, + "learning_rate": 8.313427963094394e-05, + "loss": 0.022146573662757872, + "step": 118870 + }, + { + "epoch": 16.874378992193044, + "grad_norm": 0.09316658228635788, + "learning_rate": 8.313286018452803e-05, + "loss": 0.014387600123882294, + "step": 118880 + }, + { + "epoch": 16.875798438608943, + "grad_norm": 0.4701049029827118, + "learning_rate": 8.313144073811214e-05, + "loss": 0.02150590270757675, + "step": 118890 + }, + { + "epoch": 16.87721788502484, + "grad_norm": 0.18994688987731934, + "learning_rate": 8.313002129169624e-05, + "loss": 0.06476402878761292, + "step": 118900 + }, + { + "epoch": 16.87863733144074, + "grad_norm": 0.664664089679718, + "learning_rate": 8.312860184528034e-05, + "loss": 0.06443645358085633, + "step": 118910 + }, + { + "epoch": 16.880056777856637, + "grad_norm": 11.022554397583008, + "learning_rate": 8.312718239886445e-05, + "loss": 0.010656381398439408, + "step": 118920 + }, + { + "epoch": 16.881476224272532, + "grad_norm": 0.023860175162553787, + "learning_rate": 8.312576295244855e-05, + "loss": 0.009106354415416717, + "step": 118930 + }, + { + "epoch": 16.88289567068843, + "grad_norm": 10.94786548614502, + "learning_rate": 8.312434350603266e-05, + "loss": 0.026509439945220946, + "step": 118940 + }, + { + "epoch": 16.88431511710433, + "grad_norm": 4.937229156494141, + "learning_rate": 8.312292405961676e-05, + "loss": 0.018260036408901215, + "step": 118950 + }, + { + "epoch": 
16.885734563520227, + "grad_norm": 0.05745793506503105, + "learning_rate": 8.312150461320085e-05, + "loss": 0.09326847195625305, + "step": 118960 + }, + { + "epoch": 16.887154009936125, + "grad_norm": 6.062938690185547, + "learning_rate": 8.312008516678495e-05, + "loss": 0.044204458594322205, + "step": 118970 + }, + { + "epoch": 16.888573456352024, + "grad_norm": 8.90611743927002, + "learning_rate": 8.311866572036906e-05, + "loss": 0.018737226724624634, + "step": 118980 + }, + { + "epoch": 16.889992902767922, + "grad_norm": 0.03786252439022064, + "learning_rate": 8.311724627395316e-05, + "loss": 0.07848379015922546, + "step": 118990 + }, + { + "epoch": 16.891412349183817, + "grad_norm": 0.3104473054409027, + "learning_rate": 8.311582682753727e-05, + "loss": 0.01446186900138855, + "step": 119000 + }, + { + "epoch": 16.891412349183817, + "eval_accuracy": 0.9876009410567813, + "eval_loss": 0.045681487768888474, + "eval_runtime": 32.6866, + "eval_samples_per_second": 481.146, + "eval_steps_per_second": 15.052, + "step": 119000 + }, + { + "epoch": 16.892831795599715, + "grad_norm": 1.2676396369934082, + "learning_rate": 8.311440738112137e-05, + "loss": 0.02440263032913208, + "step": 119010 + }, + { + "epoch": 16.894251242015613, + "grad_norm": 0.10896630585193634, + "learning_rate": 8.311298793470546e-05, + "loss": 0.04403060376644134, + "step": 119020 + }, + { + "epoch": 16.89567068843151, + "grad_norm": 0.6196576952934265, + "learning_rate": 8.311156848828958e-05, + "loss": 0.01849239319562912, + "step": 119030 + }, + { + "epoch": 16.89709013484741, + "grad_norm": 4.4326090812683105, + "learning_rate": 8.311014904187367e-05, + "loss": 0.044346022605896, + "step": 119040 + }, + { + "epoch": 16.89850958126331, + "grad_norm": 0.3181878626346588, + "learning_rate": 8.310872959545778e-05, + "loss": 0.06927153468132019, + "step": 119050 + }, + { + "epoch": 16.899929027679207, + "grad_norm": 0.17974849045276642, + "learning_rate": 8.310731014904187e-05, + "loss": 
0.005778995156288147, + "step": 119060 + }, + { + "epoch": 16.9013484740951, + "grad_norm": 0.14680950343608856, + "learning_rate": 8.310589070262598e-05, + "loss": 0.030992990732192992, + "step": 119070 + }, + { + "epoch": 16.902767920511, + "grad_norm": 0.28509706258773804, + "learning_rate": 8.310447125621008e-05, + "loss": 0.008585918694734573, + "step": 119080 + }, + { + "epoch": 16.904187366926898, + "grad_norm": 1.0589877367019653, + "learning_rate": 8.310305180979419e-05, + "loss": 0.014909610152244568, + "step": 119090 + }, + { + "epoch": 16.905606813342796, + "grad_norm": 5.249059200286865, + "learning_rate": 8.310163236337828e-05, + "loss": 0.026605024933815002, + "step": 119100 + }, + { + "epoch": 16.907026259758695, + "grad_norm": 0.2209336906671524, + "learning_rate": 8.310021291696238e-05, + "loss": 0.03418198227882385, + "step": 119110 + }, + { + "epoch": 16.908445706174593, + "grad_norm": 0.33446890115737915, + "learning_rate": 8.309879347054649e-05, + "loss": 0.0033166084438562395, + "step": 119120 + }, + { + "epoch": 16.90986515259049, + "grad_norm": 3.7871241569519043, + "learning_rate": 8.309737402413059e-05, + "loss": 0.06138277053833008, + "step": 119130 + }, + { + "epoch": 16.911284599006386, + "grad_norm": 1.6204237937927246, + "learning_rate": 8.30959545777147e-05, + "loss": 0.027744191884994506, + "step": 119140 + }, + { + "epoch": 16.912704045422284, + "grad_norm": 0.0447792150080204, + "learning_rate": 8.30945351312988e-05, + "loss": 0.022081327438354493, + "step": 119150 + }, + { + "epoch": 16.914123491838183, + "grad_norm": 1.4444817304611206, + "learning_rate": 8.309311568488291e-05, + "loss": 0.020131246745586397, + "step": 119160 + }, + { + "epoch": 16.91554293825408, + "grad_norm": 9.670873641967773, + "learning_rate": 8.3091696238467e-05, + "loss": 0.04307814538478851, + "step": 119170 + }, + { + "epoch": 16.91696238466998, + "grad_norm": 3.049792766571045, + "learning_rate": 8.30902767920511e-05, + "loss": 0.013657885789871215, 
+ "step": 119180 + }, + { + "epoch": 16.918381831085878, + "grad_norm": 0.13626520335674286, + "learning_rate": 8.30888573456352e-05, + "loss": 0.03500192165374756, + "step": 119190 + }, + { + "epoch": 16.919801277501776, + "grad_norm": 3.698606252670288, + "learning_rate": 8.308743789921931e-05, + "loss": 0.02214457541704178, + "step": 119200 + }, + { + "epoch": 16.92122072391767, + "grad_norm": 0.14914394915103912, + "learning_rate": 8.308601845280341e-05, + "loss": 0.034185728430747984, + "step": 119210 + }, + { + "epoch": 16.92264017033357, + "grad_norm": 0.009660141542553902, + "learning_rate": 8.308459900638751e-05, + "loss": 0.019360674917697905, + "step": 119220 + }, + { + "epoch": 16.924059616749467, + "grad_norm": 1.695876121520996, + "learning_rate": 8.308317955997162e-05, + "loss": 0.0037443704903125765, + "step": 119230 + }, + { + "epoch": 16.925479063165366, + "grad_norm": 1.125134825706482, + "learning_rate": 8.308176011355572e-05, + "loss": 0.04052403867244721, + "step": 119240 + }, + { + "epoch": 16.926898509581264, + "grad_norm": 0.0727100819349289, + "learning_rate": 8.308034066713983e-05, + "loss": 0.030857852101325987, + "step": 119250 + }, + { + "epoch": 16.928317955997162, + "grad_norm": 5.554221153259277, + "learning_rate": 8.307892122072392e-05, + "loss": 0.03768602609634399, + "step": 119260 + }, + { + "epoch": 16.92973740241306, + "grad_norm": 0.8900468349456787, + "learning_rate": 8.307750177430802e-05, + "loss": 0.009594323486089707, + "step": 119270 + }, + { + "epoch": 16.931156848828955, + "grad_norm": 1.195831537246704, + "learning_rate": 8.307608232789212e-05, + "loss": 0.026123252511024476, + "step": 119280 + }, + { + "epoch": 16.932576295244854, + "grad_norm": 0.023000942543148994, + "learning_rate": 8.307466288147623e-05, + "loss": 0.004097868502140045, + "step": 119290 + }, + { + "epoch": 16.933995741660752, + "grad_norm": 0.07576500624418259, + "learning_rate": 8.307324343506033e-05, + "loss": 0.018370094895362853, + "step": 
119300 + }, + { + "epoch": 16.93541518807665, + "grad_norm": 1.6272691488265991, + "learning_rate": 8.307182398864444e-05, + "loss": 0.007767494022846222, + "step": 119310 + }, + { + "epoch": 16.93683463449255, + "grad_norm": 0.062356818467378616, + "learning_rate": 8.307040454222854e-05, + "loss": 0.005623319372534752, + "step": 119320 + }, + { + "epoch": 16.938254080908447, + "grad_norm": 0.7122419476509094, + "learning_rate": 8.306898509581263e-05, + "loss": 0.02900628447532654, + "step": 119330 + }, + { + "epoch": 16.939673527324345, + "grad_norm": 0.7076064944267273, + "learning_rate": 8.306756564939674e-05, + "loss": 0.07354017496109008, + "step": 119340 + }, + { + "epoch": 16.94109297374024, + "grad_norm": 0.0743996724486351, + "learning_rate": 8.306614620298084e-05, + "loss": 0.007228873670101166, + "step": 119350 + }, + { + "epoch": 16.942512420156138, + "grad_norm": 0.05068621039390564, + "learning_rate": 8.306472675656495e-05, + "loss": 0.03541290760040283, + "step": 119360 + }, + { + "epoch": 16.943931866572036, + "grad_norm": 0.4136679768562317, + "learning_rate": 8.306330731014904e-05, + "loss": 0.003943739831447602, + "step": 119370 + }, + { + "epoch": 16.945351312987935, + "grad_norm": 0.21719232201576233, + "learning_rate": 8.306188786373315e-05, + "loss": 0.005219864472746849, + "step": 119380 + }, + { + "epoch": 16.946770759403833, + "grad_norm": 4.703658580780029, + "learning_rate": 8.306046841731724e-05, + "loss": 0.014817482233047486, + "step": 119390 + }, + { + "epoch": 16.94819020581973, + "grad_norm": 0.1651671677827835, + "learning_rate": 8.305904897090135e-05, + "loss": 0.009183306992053986, + "step": 119400 + }, + { + "epoch": 16.94960965223563, + "grad_norm": 3.381152391433716, + "learning_rate": 8.305762952448545e-05, + "loss": 0.02705332338809967, + "step": 119410 + }, + { + "epoch": 16.951029098651524, + "grad_norm": 0.05078170821070671, + "learning_rate": 8.305621007806955e-05, + "loss": 0.0509130597114563, + "step": 119420 + }, + { 
+ "epoch": 16.952448545067423, + "grad_norm": 6.6379313468933105, + "learning_rate": 8.305479063165366e-05, + "loss": 0.028801146149635314, + "step": 119430 + }, + { + "epoch": 16.95386799148332, + "grad_norm": 6.881738662719727, + "learning_rate": 8.305337118523776e-05, + "loss": 0.010584786534309387, + "step": 119440 + }, + { + "epoch": 16.95528743789922, + "grad_norm": 0.21296283602714539, + "learning_rate": 8.305195173882187e-05, + "loss": 0.009670565277338028, + "step": 119450 + }, + { + "epoch": 16.956706884315118, + "grad_norm": 0.04307686537504196, + "learning_rate": 8.305053229240597e-05, + "loss": 0.008084750920534133, + "step": 119460 + }, + { + "epoch": 16.958126330731016, + "grad_norm": 3.313858985900879, + "learning_rate": 8.304911284599006e-05, + "loss": 0.005667193233966828, + "step": 119470 + }, + { + "epoch": 16.959545777146914, + "grad_norm": 0.04059341922402382, + "learning_rate": 8.304769339957416e-05, + "loss": 0.007192540168762207, + "step": 119480 + }, + { + "epoch": 16.96096522356281, + "grad_norm": 0.09984651952981949, + "learning_rate": 8.304627395315827e-05, + "loss": 0.006505205482244492, + "step": 119490 + }, + { + "epoch": 16.962384669978707, + "grad_norm": 0.019779440015554428, + "learning_rate": 8.304485450674237e-05, + "loss": 0.008879543095827103, + "step": 119500 + }, + { + "epoch": 16.962384669978707, + "eval_accuracy": 0.9835950912443568, + "eval_loss": 0.06713375449180603, + "eval_runtime": 33.0669, + "eval_samples_per_second": 475.611, + "eval_steps_per_second": 14.879, + "step": 119500 + }, + { + "epoch": 16.963804116394606, + "grad_norm": 0.07041562348604202, + "learning_rate": 8.304343506032648e-05, + "loss": 0.04628815948963165, + "step": 119510 + }, + { + "epoch": 16.965223562810504, + "grad_norm": 0.5285801887512207, + "learning_rate": 8.304201561391058e-05, + "loss": 0.025107762217521666, + "step": 119520 + }, + { + "epoch": 16.966643009226402, + "grad_norm": 12.144177436828613, + "learning_rate": 
8.304059616749467e-05, + "loss": 0.06175790429115295, + "step": 119530 + }, + { + "epoch": 16.9680624556423, + "grad_norm": 0.12573617696762085, + "learning_rate": 8.303917672107879e-05, + "loss": 0.0293417751789093, + "step": 119540 + }, + { + "epoch": 16.9694819020582, + "grad_norm": 0.13848525285720825, + "learning_rate": 8.303775727466288e-05, + "loss": 0.006557562947273254, + "step": 119550 + }, + { + "epoch": 16.970901348474094, + "grad_norm": 0.01964346133172512, + "learning_rate": 8.3036337828247e-05, + "loss": 0.010669238865375519, + "step": 119560 + }, + { + "epoch": 16.972320794889992, + "grad_norm": 0.2593211829662323, + "learning_rate": 8.303491838183109e-05, + "loss": 0.0015080250799655915, + "step": 119570 + }, + { + "epoch": 16.97374024130589, + "grad_norm": 15.115253448486328, + "learning_rate": 8.303349893541519e-05, + "loss": 0.023484209179878236, + "step": 119580 + }, + { + "epoch": 16.97515968772179, + "grad_norm": 0.018677575513720512, + "learning_rate": 8.303207948899929e-05, + "loss": 0.004597761482000351, + "step": 119590 + }, + { + "epoch": 16.976579134137687, + "grad_norm": 2.686915874481201, + "learning_rate": 8.30306600425834e-05, + "loss": 0.0300980269908905, + "step": 119600 + }, + { + "epoch": 16.977998580553585, + "grad_norm": 0.6160420179367065, + "learning_rate": 8.30292405961675e-05, + "loss": 0.024341486394405365, + "step": 119610 + }, + { + "epoch": 16.979418026969483, + "grad_norm": 0.10150213539600372, + "learning_rate": 8.30278211497516e-05, + "loss": 0.013426159322261811, + "step": 119620 + }, + { + "epoch": 16.980837473385378, + "grad_norm": 0.00996475201100111, + "learning_rate": 8.30264017033357e-05, + "loss": 0.019675298035144805, + "step": 119630 + }, + { + "epoch": 16.982256919801276, + "grad_norm": 0.14315427839756012, + "learning_rate": 8.30249822569198e-05, + "loss": 0.003599084168672562, + "step": 119640 + }, + { + "epoch": 16.983676366217175, + "grad_norm": 0.23158226907253265, + "learning_rate": 
8.302356281050391e-05, + "loss": 0.006026803702116013, + "step": 119650 + }, + { + "epoch": 16.985095812633073, + "grad_norm": 0.622970700263977, + "learning_rate": 8.302214336408801e-05, + "loss": 0.0015147797763347625, + "step": 119660 + }, + { + "epoch": 16.98651525904897, + "grad_norm": 0.9638655781745911, + "learning_rate": 8.302072391767212e-05, + "loss": 0.002849790453910828, + "step": 119670 + }, + { + "epoch": 16.98793470546487, + "grad_norm": 2.5214967727661133, + "learning_rate": 8.30193044712562e-05, + "loss": 0.0074197396636009215, + "step": 119680 + }, + { + "epoch": 16.989354151880768, + "grad_norm": 0.2692882716655731, + "learning_rate": 8.301788502484031e-05, + "loss": 0.005155200883746147, + "step": 119690 + }, + { + "epoch": 16.990773598296663, + "grad_norm": 0.41026079654693604, + "learning_rate": 8.301646557842441e-05, + "loss": 0.0035352624952793123, + "step": 119700 + }, + { + "epoch": 16.99219304471256, + "grad_norm": 3.5877914428710938, + "learning_rate": 8.301504613200852e-05, + "loss": 0.009292619675397873, + "step": 119710 + }, + { + "epoch": 16.99361249112846, + "grad_norm": 0.016373714432120323, + "learning_rate": 8.301362668559263e-05, + "loss": 0.004624640569090843, + "step": 119720 + }, + { + "epoch": 16.995031937544358, + "grad_norm": 0.15189428627490997, + "learning_rate": 8.301220723917672e-05, + "loss": 0.03775623440742493, + "step": 119730 + }, + { + "epoch": 16.996451383960256, + "grad_norm": 0.027784127742052078, + "learning_rate": 8.301078779276083e-05, + "loss": 0.010945260524749756, + "step": 119740 + }, + { + "epoch": 16.997870830376154, + "grad_norm": 0.5772379636764526, + "learning_rate": 8.300936834634493e-05, + "loss": 0.020091539621353148, + "step": 119750 + }, + { + "epoch": 16.999290276792053, + "grad_norm": 12.586730003356934, + "learning_rate": 8.300794889992904e-05, + "loss": 0.03808550238609314, + "step": 119760 + }, + { + "epoch": 17.000709723207947, + "grad_norm": 9.696110725402832, + "learning_rate": 
8.300652945351313e-05, + "loss": 0.044254863262176515, + "step": 119770 + }, + { + "epoch": 17.002129169623846, + "grad_norm": 0.37043657898902893, + "learning_rate": 8.300511000709723e-05, + "loss": 0.0012788783758878707, + "step": 119780 + }, + { + "epoch": 17.003548616039744, + "grad_norm": 8.824217796325684, + "learning_rate": 8.300369056068133e-05, + "loss": 0.017943207919597626, + "step": 119790 + }, + { + "epoch": 17.004968062455642, + "grad_norm": 0.37432098388671875, + "learning_rate": 8.300227111426544e-05, + "loss": 0.0019177347421646118, + "step": 119800 + }, + { + "epoch": 17.00638750887154, + "grad_norm": 0.8555291295051575, + "learning_rate": 8.300085166784955e-05, + "loss": 0.014973253011703491, + "step": 119810 + }, + { + "epoch": 17.00780695528744, + "grad_norm": 4.908260822296143, + "learning_rate": 8.299943222143365e-05, + "loss": 0.047904562950134275, + "step": 119820 + }, + { + "epoch": 17.009226401703337, + "grad_norm": 1.7123541831970215, + "learning_rate": 8.299801277501775e-05, + "loss": 0.05410314798355102, + "step": 119830 + }, + { + "epoch": 17.010645848119232, + "grad_norm": 0.4361095726490021, + "learning_rate": 8.299659332860184e-05, + "loss": 0.027262836694717407, + "step": 119840 + }, + { + "epoch": 17.01206529453513, + "grad_norm": 0.17793868482112885, + "learning_rate": 8.299517388218595e-05, + "loss": 0.04731735289096832, + "step": 119850 + }, + { + "epoch": 17.01348474095103, + "grad_norm": 1.0978947877883911, + "learning_rate": 8.299375443577005e-05, + "loss": 0.022935733199119568, + "step": 119860 + }, + { + "epoch": 17.014904187366927, + "grad_norm": 2.6851255893707275, + "learning_rate": 8.299233498935416e-05, + "loss": 0.014493119716644288, + "step": 119870 + }, + { + "epoch": 17.016323633782825, + "grad_norm": 0.12189056724309921, + "learning_rate": 8.299091554293826e-05, + "loss": 0.008826699852943421, + "step": 119880 + }, + { + "epoch": 17.017743080198724, + "grad_norm": 4.789248943328857, + "learning_rate": 
8.298949609652236e-05, + "loss": 0.04857603013515473, + "step": 119890 + }, + { + "epoch": 17.019162526614622, + "grad_norm": 0.46851640939712524, + "learning_rate": 8.298807665010647e-05, + "loss": 0.01318027526140213, + "step": 119900 + }, + { + "epoch": 17.020581973030517, + "grad_norm": 7.147706031799316, + "learning_rate": 8.298665720369056e-05, + "loss": 0.03265936076641083, + "step": 119910 + }, + { + "epoch": 17.022001419446415, + "grad_norm": 6.2478861808776855, + "learning_rate": 8.298523775727468e-05, + "loss": 0.01385502964258194, + "step": 119920 + }, + { + "epoch": 17.023420865862313, + "grad_norm": 0.4883221983909607, + "learning_rate": 8.298381831085877e-05, + "loss": 0.014128404855728149, + "step": 119930 + }, + { + "epoch": 17.02484031227821, + "grad_norm": 0.1252172440290451, + "learning_rate": 8.298239886444287e-05, + "loss": 0.024860450625419618, + "step": 119940 + }, + { + "epoch": 17.02625975869411, + "grad_norm": 4.0285491943359375, + "learning_rate": 8.298097941802697e-05, + "loss": 0.02474023848772049, + "step": 119950 + }, + { + "epoch": 17.027679205110008, + "grad_norm": 10.738550186157227, + "learning_rate": 8.297955997161108e-05, + "loss": 0.029977038502693176, + "step": 119960 + }, + { + "epoch": 17.029098651525906, + "grad_norm": 1.833249807357788, + "learning_rate": 8.297814052519518e-05, + "loss": 0.03657858073711395, + "step": 119970 + }, + { + "epoch": 17.0305180979418, + "grad_norm": 0.35576480627059937, + "learning_rate": 8.297672107877929e-05, + "loss": 0.012883573770523071, + "step": 119980 + }, + { + "epoch": 17.0319375443577, + "grad_norm": 0.8515356779098511, + "learning_rate": 8.297530163236338e-05, + "loss": 0.0216094046831131, + "step": 119990 + }, + { + "epoch": 17.033356990773598, + "grad_norm": 10.729890823364258, + "learning_rate": 8.297388218594748e-05, + "loss": 0.026598137617111207, + "step": 120000 + }, + { + "epoch": 17.033356990773598, + "eval_accuracy": 0.9808609397850829, + "eval_loss": 0.07485253363847733, 
+ "eval_runtime": 33.4946, + "eval_samples_per_second": 469.539, + "eval_steps_per_second": 14.689, + "step": 120000 + }, + { + "epoch": 17.034776437189496, + "grad_norm": 0.18618199229240417, + "learning_rate": 8.297246273953159e-05, + "loss": 0.043621528148651126, + "step": 120010 + }, + { + "epoch": 17.036195883605394, + "grad_norm": 9.199254989624023, + "learning_rate": 8.297104329311569e-05, + "loss": 0.04960590302944183, + "step": 120020 + }, + { + "epoch": 17.037615330021293, + "grad_norm": 0.9070085287094116, + "learning_rate": 8.29696238466998e-05, + "loss": 0.008391667902469636, + "step": 120030 + }, + { + "epoch": 17.03903477643719, + "grad_norm": 4.971753120422363, + "learning_rate": 8.296820440028389e-05, + "loss": 0.02203463315963745, + "step": 120040 + }, + { + "epoch": 17.040454222853086, + "grad_norm": 0.009288261644542217, + "learning_rate": 8.2966784953868e-05, + "loss": 0.030891206860542298, + "step": 120050 + }, + { + "epoch": 17.041873669268984, + "grad_norm": 0.1559710055589676, + "learning_rate": 8.29653655074521e-05, + "loss": 0.03783779442310333, + "step": 120060 + }, + { + "epoch": 17.043293115684882, + "grad_norm": 0.01857968419790268, + "learning_rate": 8.29639460610362e-05, + "loss": 0.0034990094602108, + "step": 120070 + }, + { + "epoch": 17.04471256210078, + "grad_norm": 0.101988784968853, + "learning_rate": 8.29625266146203e-05, + "loss": 0.020602494478225708, + "step": 120080 + }, + { + "epoch": 17.04613200851668, + "grad_norm": 1.6410483121871948, + "learning_rate": 8.29611071682044e-05, + "loss": 0.005050593242049217, + "step": 120090 + }, + { + "epoch": 17.047551454932577, + "grad_norm": 0.1444510519504547, + "learning_rate": 8.295968772178851e-05, + "loss": 0.029584896564483643, + "step": 120100 + }, + { + "epoch": 17.048970901348476, + "grad_norm": 0.8876551985740662, + "learning_rate": 8.295826827537261e-05, + "loss": 0.0014938555657863617, + "step": 120110 + }, + { + "epoch": 17.05039034776437, + "grad_norm": 
4.955441951751709, + "learning_rate": 8.295684882895672e-05, + "loss": 0.006155164912343025, + "step": 120120 + }, + { + "epoch": 17.05180979418027, + "grad_norm": 0.16779440641403198, + "learning_rate": 8.295542938254082e-05, + "loss": 0.005987958237528801, + "step": 120130 + }, + { + "epoch": 17.053229240596167, + "grad_norm": 0.022262701764702797, + "learning_rate": 8.295400993612491e-05, + "loss": 0.010092838108539582, + "step": 120140 + }, + { + "epoch": 17.054648687012065, + "grad_norm": 0.5307697057723999, + "learning_rate": 8.295259048970901e-05, + "loss": 0.014554566144943238, + "step": 120150 + }, + { + "epoch": 17.056068133427964, + "grad_norm": 4.913189888000488, + "learning_rate": 8.295117104329312e-05, + "loss": 0.0074235409498214725, + "step": 120160 + }, + { + "epoch": 17.057487579843862, + "grad_norm": 0.4308110475540161, + "learning_rate": 8.294975159687722e-05, + "loss": 0.03521883189678192, + "step": 120170 + }, + { + "epoch": 17.05890702625976, + "grad_norm": 0.12361428141593933, + "learning_rate": 8.294833215046133e-05, + "loss": 0.021901145577430725, + "step": 120180 + }, + { + "epoch": 17.060326472675655, + "grad_norm": 0.020824233070015907, + "learning_rate": 8.294691270404543e-05, + "loss": 0.008531014621257781, + "step": 120190 + }, + { + "epoch": 17.061745919091553, + "grad_norm": 0.01912788301706314, + "learning_rate": 8.294549325762952e-05, + "loss": 0.04999278485774994, + "step": 120200 + }, + { + "epoch": 17.06316536550745, + "grad_norm": 10.640271186828613, + "learning_rate": 8.294407381121364e-05, + "loss": 0.028563928604125977, + "step": 120210 + }, + { + "epoch": 17.06458481192335, + "grad_norm": 0.020733371376991272, + "learning_rate": 8.294265436479773e-05, + "loss": 0.016567130386829377, + "step": 120220 + }, + { + "epoch": 17.066004258339248, + "grad_norm": 0.9342880249023438, + "learning_rate": 8.294123491838184e-05, + "loss": 0.004354240372776985, + "step": 120230 + }, + { + "epoch": 17.067423704755146, + "grad_norm": 
10.978273391723633, + "learning_rate": 8.293981547196594e-05, + "loss": 0.020922911167144776, + "step": 120240 + }, + { + "epoch": 17.068843151171045, + "grad_norm": 0.08470244705677032, + "learning_rate": 8.293839602555004e-05, + "loss": 0.0323804646730423, + "step": 120250 + }, + { + "epoch": 17.07026259758694, + "grad_norm": null, + "learning_rate": 8.293697657913414e-05, + "loss": 0.02929849028587341, + "step": 120260 + }, + { + "epoch": 17.071682044002838, + "grad_norm": 0.2951527237892151, + "learning_rate": 8.293569907735983e-05, + "loss": 0.012332384288311005, + "step": 120270 + }, + { + "epoch": 17.073101490418736, + "grad_norm": 0.23411825299263, + "learning_rate": 8.293427963094393e-05, + "loss": 0.002320580929517746, + "step": 120280 + }, + { + "epoch": 17.074520936834634, + "grad_norm": 0.03805660083889961, + "learning_rate": 8.293286018452804e-05, + "loss": 0.0022540684789419175, + "step": 120290 + }, + { + "epoch": 17.075940383250533, + "grad_norm": 0.1722414642572403, + "learning_rate": 8.293144073811214e-05, + "loss": 0.014226651191711426, + "step": 120300 + }, + { + "epoch": 17.07735982966643, + "grad_norm": 0.022016188129782677, + "learning_rate": 8.293002129169625e-05, + "loss": 0.019705028831958772, + "step": 120310 + }, + { + "epoch": 17.07877927608233, + "grad_norm": 2.4051177501678467, + "learning_rate": 8.292860184528033e-05, + "loss": 0.032577145099639895, + "step": 120320 + }, + { + "epoch": 17.080198722498224, + "grad_norm": 0.014880457893013954, + "learning_rate": 8.292718239886445e-05, + "loss": 0.08195329308509827, + "step": 120330 + }, + { + "epoch": 17.081618168914122, + "grad_norm": 0.017962895333766937, + "learning_rate": 8.292576295244854e-05, + "loss": 0.04217566847801209, + "step": 120340 + }, + { + "epoch": 17.08303761533002, + "grad_norm": 0.20503850281238556, + "learning_rate": 8.292434350603265e-05, + "loss": 0.033675742149353025, + "step": 120350 + }, + { + "epoch": 17.08445706174592, + "grad_norm": 0.3100937604904175, 
+ "learning_rate": 8.292292405961675e-05, + "loss": 0.03976408541202545, + "step": 120360 + }, + { + "epoch": 17.085876508161817, + "grad_norm": 1.0643759965896606, + "learning_rate": 8.292150461320085e-05, + "loss": 0.017562437057495116, + "step": 120370 + }, + { + "epoch": 17.087295954577716, + "grad_norm": 2.6324429512023926, + "learning_rate": 8.292008516678496e-05, + "loss": 0.01645803153514862, + "step": 120380 + }, + { + "epoch": 17.088715400993614, + "grad_norm": 0.010898143984377384, + "learning_rate": 8.291866572036906e-05, + "loss": 0.03926792144775391, + "step": 120390 + }, + { + "epoch": 17.09013484740951, + "grad_norm": 0.06077408045530319, + "learning_rate": 8.291724627395317e-05, + "loss": 0.06492968797683715, + "step": 120400 + }, + { + "epoch": 17.091554293825407, + "grad_norm": 0.03656835854053497, + "learning_rate": 8.291582682753727e-05, + "loss": 0.003984153643250466, + "step": 120410 + }, + { + "epoch": 17.092973740241305, + "grad_norm": 0.011007381603121758, + "learning_rate": 8.291440738112136e-05, + "loss": 0.03613340556621551, + "step": 120420 + }, + { + "epoch": 17.094393186657204, + "grad_norm": 36.203392028808594, + "learning_rate": 8.291298793470546e-05, + "loss": 0.052862972021102905, + "step": 120430 + }, + { + "epoch": 17.095812633073102, + "grad_norm": 2.5039913654327393, + "learning_rate": 8.291156848828957e-05, + "loss": 0.038491514325141904, + "step": 120440 + }, + { + "epoch": 17.097232079489, + "grad_norm": 0.05600656941533089, + "learning_rate": 8.291014904187367e-05, + "loss": 0.04204607605934143, + "step": 120450 + }, + { + "epoch": 17.0986515259049, + "grad_norm": 5.478761672973633, + "learning_rate": 8.290872959545778e-05, + "loss": 0.014320333302021027, + "step": 120460 + }, + { + "epoch": 17.100070972320793, + "grad_norm": 0.1664823591709137, + "learning_rate": 8.290731014904188e-05, + "loss": 0.029626739025115967, + "step": 120470 + }, + { + "epoch": 17.10149041873669, + "grad_norm": 0.4680706560611725, + 
"learning_rate": 8.290589070262597e-05, + "loss": 0.016333407163619994, + "step": 120480 + }, + { + "epoch": 17.10290986515259, + "grad_norm": 15.665885925292969, + "learning_rate": 8.290447125621008e-05, + "loss": 0.018834175169467927, + "step": 120490 + }, + { + "epoch": 17.10432931156849, + "grad_norm": 0.8051186203956604, + "learning_rate": 8.290305180979418e-05, + "loss": 0.006639273464679718, + "step": 120500 + }, + { + "epoch": 17.10432931156849, + "eval_accuracy": 0.9870286767978635, + "eval_loss": 0.045041538774967194, + "eval_runtime": 32.0723, + "eval_samples_per_second": 490.361, + "eval_steps_per_second": 15.34, + "step": 120500 + }, + { + "epoch": 17.105748757984387, + "grad_norm": 2.2258312702178955, + "learning_rate": 8.29016323633783e-05, + "loss": 0.04136236906051636, + "step": 120510 + }, + { + "epoch": 17.107168204400285, + "grad_norm": 0.1064390316605568, + "learning_rate": 8.290021291696238e-05, + "loss": 0.010789933800697326, + "step": 120520 + }, + { + "epoch": 17.108587650816183, + "grad_norm": 3.2055423259735107, + "learning_rate": 8.289879347054649e-05, + "loss": 0.003744557872414589, + "step": 120530 + }, + { + "epoch": 17.110007097232078, + "grad_norm": 0.0287025086581707, + "learning_rate": 8.289737402413059e-05, + "loss": 0.020008505880832674, + "step": 120540 + }, + { + "epoch": 17.111426543647976, + "grad_norm": 0.08441568911075592, + "learning_rate": 8.28959545777147e-05, + "loss": 0.05495128631591797, + "step": 120550 + }, + { + "epoch": 17.112845990063875, + "grad_norm": 0.016766395419836044, + "learning_rate": 8.289453513129881e-05, + "loss": 0.019158127903938293, + "step": 120560 + }, + { + "epoch": 17.114265436479773, + "grad_norm": 3.567664384841919, + "learning_rate": 8.28931156848829e-05, + "loss": 0.021029704809188844, + "step": 120570 + }, + { + "epoch": 17.11568488289567, + "grad_norm": 0.0842832401394844, + "learning_rate": 8.2891696238467e-05, + "loss": 0.012277697026729584, + "step": 120580 + }, + { + "epoch": 
17.11710432931157, + "grad_norm": 4.586905479431152, + "learning_rate": 8.28902767920511e-05, + "loss": 0.011499184370040893, + "step": 120590 + }, + { + "epoch": 17.118523775727468, + "grad_norm": 0.027589887380599976, + "learning_rate": 8.288885734563521e-05, + "loss": 0.025917389988899232, + "step": 120600 + }, + { + "epoch": 17.119943222143363, + "grad_norm": 5.147835731506348, + "learning_rate": 8.288743789921931e-05, + "loss": 0.02600862979888916, + "step": 120610 + }, + { + "epoch": 17.12136266855926, + "grad_norm": 0.028256772086024284, + "learning_rate": 8.288601845280342e-05, + "loss": 0.04589993357658386, + "step": 120620 + }, + { + "epoch": 17.12278211497516, + "grad_norm": 10.49030876159668, + "learning_rate": 8.28845990063875e-05, + "loss": 0.02437007427215576, + "step": 120630 + }, + { + "epoch": 17.124201561391057, + "grad_norm": 0.09230612218379974, + "learning_rate": 8.288317955997161e-05, + "loss": 0.028782013058662414, + "step": 120640 + }, + { + "epoch": 17.125621007806956, + "grad_norm": 2.3693878650665283, + "learning_rate": 8.288176011355572e-05, + "loss": 0.023813261091709136, + "step": 120650 + }, + { + "epoch": 17.127040454222854, + "grad_norm": 0.04619796574115753, + "learning_rate": 8.288034066713982e-05, + "loss": 0.051468032598495486, + "step": 120660 + }, + { + "epoch": 17.128459900638752, + "grad_norm": 0.03357632830739021, + "learning_rate": 8.287892122072393e-05, + "loss": 0.0446009486913681, + "step": 120670 + }, + { + "epoch": 17.129879347054647, + "grad_norm": 0.021320484578609467, + "learning_rate": 8.287750177430802e-05, + "loss": 0.028787761926651, + "step": 120680 + }, + { + "epoch": 17.131298793470545, + "grad_norm": 0.3757644593715668, + "learning_rate": 8.287608232789213e-05, + "loss": 0.009390568733215332, + "step": 120690 + }, + { + "epoch": 17.132718239886444, + "grad_norm": 8.768340110778809, + "learning_rate": 8.287466288147622e-05, + "loss": 0.03199634850025177, + "step": 120700 + }, + { + "epoch": 
17.134137686302342, + "grad_norm": 1.7924593687057495, + "learning_rate": 8.287324343506034e-05, + "loss": 0.014015412330627442, + "step": 120710 + }, + { + "epoch": 17.13555713271824, + "grad_norm": 0.03260492905974388, + "learning_rate": 8.287182398864443e-05, + "loss": 0.04740467071533203, + "step": 120720 + }, + { + "epoch": 17.13697657913414, + "grad_norm": 29.415185928344727, + "learning_rate": 8.287040454222853e-05, + "loss": 0.038969749212265016, + "step": 120730 + }, + { + "epoch": 17.138396025550037, + "grad_norm": 0.05544610321521759, + "learning_rate": 8.286898509581264e-05, + "loss": 0.011639602482318878, + "step": 120740 + }, + { + "epoch": 17.13981547196593, + "grad_norm": 0.8772669434547424, + "learning_rate": 8.286756564939674e-05, + "loss": 0.028922209143638612, + "step": 120750 + }, + { + "epoch": 17.14123491838183, + "grad_norm": 0.09601476788520813, + "learning_rate": 8.286614620298085e-05, + "loss": 0.04748598635196686, + "step": 120760 + }, + { + "epoch": 17.14265436479773, + "grad_norm": 0.09443246573209763, + "learning_rate": 8.286472675656495e-05, + "loss": 0.016865630447864533, + "step": 120770 + }, + { + "epoch": 17.144073811213627, + "grad_norm": 1.598710298538208, + "learning_rate": 8.286330731014904e-05, + "loss": 0.029343438148498536, + "step": 120780 + }, + { + "epoch": 17.145493257629525, + "grad_norm": 0.13849106431007385, + "learning_rate": 8.286188786373314e-05, + "loss": 0.008465579897165298, + "step": 120790 + }, + { + "epoch": 17.146912704045423, + "grad_norm": 1.7292816638946533, + "learning_rate": 8.286046841731725e-05, + "loss": 0.01654750108718872, + "step": 120800 + }, + { + "epoch": 17.14833215046132, + "grad_norm": 3.3607993125915527, + "learning_rate": 8.285904897090135e-05, + "loss": 0.008860719203948975, + "step": 120810 + }, + { + "epoch": 17.149751596877216, + "grad_norm": 0.058206669986248016, + "learning_rate": 8.285762952448546e-05, + "loss": 0.04676066637039185, + "step": 120820 + }, + { + "epoch": 
17.151171043293115, + "grad_norm": 1.7293013334274292, + "learning_rate": 8.285621007806956e-05, + "loss": 0.006193574145436287, + "step": 120830 + }, + { + "epoch": 17.152590489709013, + "grad_norm": 0.01077141985297203, + "learning_rate": 8.285479063165366e-05, + "loss": 0.019160452485084533, + "step": 120840 + }, + { + "epoch": 17.15400993612491, + "grad_norm": 7.551402568817139, + "learning_rate": 8.285337118523777e-05, + "loss": 0.04898516833782196, + "step": 120850 + }, + { + "epoch": 17.15542938254081, + "grad_norm": 2.899712324142456, + "learning_rate": 8.285195173882186e-05, + "loss": 0.017336773872375488, + "step": 120860 + }, + { + "epoch": 17.156848828956708, + "grad_norm": 0.03883068636059761, + "learning_rate": 8.285053229240597e-05, + "loss": 0.0031766846776008608, + "step": 120870 + }, + { + "epoch": 17.158268275372606, + "grad_norm": 0.9664191007614136, + "learning_rate": 8.284911284599007e-05, + "loss": 0.012236092239618301, + "step": 120880 + }, + { + "epoch": 17.1596877217885, + "grad_norm": 13.277713775634766, + "learning_rate": 8.284769339957417e-05, + "loss": 0.01570097804069519, + "step": 120890 + }, + { + "epoch": 17.1611071682044, + "grad_norm": 6.466482639312744, + "learning_rate": 8.284627395315827e-05, + "loss": 0.004724283888936043, + "step": 120900 + }, + { + "epoch": 17.162526614620297, + "grad_norm": 0.30006489157676697, + "learning_rate": 8.284485450674238e-05, + "loss": 0.025148922204971315, + "step": 120910 + }, + { + "epoch": 17.163946061036196, + "grad_norm": 0.01525171473622322, + "learning_rate": 8.284343506032648e-05, + "loss": 0.005151396989822388, + "step": 120920 + }, + { + "epoch": 17.165365507452094, + "grad_norm": 0.9813533425331116, + "learning_rate": 8.284201561391059e-05, + "loss": 0.014321206510066986, + "step": 120930 + }, + { + "epoch": 17.166784953867992, + "grad_norm": 0.272886723279953, + "learning_rate": 8.284059616749468e-05, + "loss": 0.0091228649020195, + "step": 120940 + }, + { + "epoch": 
17.16820440028389, + "grad_norm": 12.312128067016602, + "learning_rate": 8.283917672107878e-05, + "loss": 0.04469579458236694, + "step": 120950 + }, + { + "epoch": 17.169623846699785, + "grad_norm": 4.318283557891846, + "learning_rate": 8.283775727466289e-05, + "loss": 0.0037714622914791105, + "step": 120960 + }, + { + "epoch": 17.171043293115684, + "grad_norm": 1.3079272508621216, + "learning_rate": 8.283633782824699e-05, + "loss": 0.012800368666648864, + "step": 120970 + }, + { + "epoch": 17.172462739531582, + "grad_norm": 0.02630901150405407, + "learning_rate": 8.28349183818311e-05, + "loss": 0.007006227970123291, + "step": 120980 + }, + { + "epoch": 17.17388218594748, + "grad_norm": 0.018336854875087738, + "learning_rate": 8.283349893541518e-05, + "loss": 0.035109061002731326, + "step": 120990 + }, + { + "epoch": 17.17530163236338, + "grad_norm": 0.03967367857694626, + "learning_rate": 8.28320794889993e-05, + "loss": 0.00726487934589386, + "step": 121000 + }, + { + "epoch": 17.17530163236338, + "eval_accuracy": 0.9865199974566033, + "eval_loss": 0.05103665217757225, + "eval_runtime": 32.6428, + "eval_samples_per_second": 481.791, + "eval_steps_per_second": 15.072, + "step": 121000 + }, + { + "epoch": 17.176721078779277, + "grad_norm": 0.012679275125265121, + "learning_rate": 8.283066004258339e-05, + "loss": 0.01789276897907257, + "step": 121010 + }, + { + "epoch": 17.178140525195175, + "grad_norm": 0.02514667622745037, + "learning_rate": 8.28292405961675e-05, + "loss": 0.0161560595035553, + "step": 121020 + }, + { + "epoch": 17.17955997161107, + "grad_norm": 0.04653330147266388, + "learning_rate": 8.28278211497516e-05, + "loss": 0.003413556143641472, + "step": 121030 + }, + { + "epoch": 17.18097941802697, + "grad_norm": 2.3200175762176514, + "learning_rate": 8.28264017033357e-05, + "loss": 0.009373904764652252, + "step": 121040 + }, + { + "epoch": 17.182398864442867, + "grad_norm": 0.04067833721637726, + "learning_rate": 8.282498225691981e-05, + "loss": 
0.016110491752624512, + "step": 121050 + }, + { + "epoch": 17.183818310858765, + "grad_norm": 8.260937690734863, + "learning_rate": 8.28235628105039e-05, + "loss": 0.0567485511302948, + "step": 121060 + }, + { + "epoch": 17.185237757274663, + "grad_norm": 3.1584644317626953, + "learning_rate": 8.282214336408802e-05, + "loss": 0.005105612799525261, + "step": 121070 + }, + { + "epoch": 17.18665720369056, + "grad_norm": 1.6670535802841187, + "learning_rate": 8.282072391767211e-05, + "loss": 0.006072807312011719, + "step": 121080 + }, + { + "epoch": 17.18807665010646, + "grad_norm": 0.24677909910678864, + "learning_rate": 8.281930447125621e-05, + "loss": 0.02320306599140167, + "step": 121090 + }, + { + "epoch": 17.189496096522355, + "grad_norm": 0.1326128989458084, + "learning_rate": 8.281788502484031e-05, + "loss": 0.015490972995758056, + "step": 121100 + }, + { + "epoch": 17.190915542938253, + "grad_norm": 0.8754839301109314, + "learning_rate": 8.281646557842442e-05, + "loss": 0.03079564869403839, + "step": 121110 + }, + { + "epoch": 17.19233498935415, + "grad_norm": 0.04686080291867256, + "learning_rate": 8.281504613200852e-05, + "loss": 0.007701759040355682, + "step": 121120 + }, + { + "epoch": 17.19375443577005, + "grad_norm": 9.40129566192627, + "learning_rate": 8.281362668559263e-05, + "loss": 0.008802379667758941, + "step": 121130 + }, + { + "epoch": 17.195173882185948, + "grad_norm": 0.033822499215602875, + "learning_rate": 8.281220723917673e-05, + "loss": 0.01620979458093643, + "step": 121140 + }, + { + "epoch": 17.196593328601846, + "grad_norm": 0.35262858867645264, + "learning_rate": 8.281078779276082e-05, + "loss": 0.01161227524280548, + "step": 121150 + }, + { + "epoch": 17.198012775017745, + "grad_norm": 0.006954657379537821, + "learning_rate": 8.280936834634493e-05, + "loss": 0.004226695373654365, + "step": 121160 + }, + { + "epoch": 17.19943222143364, + "grad_norm": 0.048306047916412354, + "learning_rate": 8.280794889992903e-05, + "loss": 
0.030858996510505676, + "step": 121170 + }, + { + "epoch": 17.200851667849538, + "grad_norm": 6.007036209106445, + "learning_rate": 8.280652945351314e-05, + "loss": 0.010294271260499954, + "step": 121180 + }, + { + "epoch": 17.202271114265436, + "grad_norm": 12.101383209228516, + "learning_rate": 8.280511000709723e-05, + "loss": 0.03878332376480102, + "step": 121190 + }, + { + "epoch": 17.203690560681334, + "grad_norm": 0.39908748865127563, + "learning_rate": 8.280369056068134e-05, + "loss": 0.022022242844104766, + "step": 121200 + }, + { + "epoch": 17.205110007097232, + "grad_norm": 0.17057831585407257, + "learning_rate": 8.280227111426543e-05, + "loss": 0.004962685331702232, + "step": 121210 + }, + { + "epoch": 17.20652945351313, + "grad_norm": 1.1699227094650269, + "learning_rate": 8.280085166784955e-05, + "loss": 0.005730428919196129, + "step": 121220 + }, + { + "epoch": 17.20794889992903, + "grad_norm": 6.226938724517822, + "learning_rate": 8.279943222143364e-05, + "loss": 0.028130099177360535, + "step": 121230 + }, + { + "epoch": 17.209368346344924, + "grad_norm": 13.366537094116211, + "learning_rate": 8.279801277501775e-05, + "loss": 0.020548251271247864, + "step": 121240 + }, + { + "epoch": 17.210787792760822, + "grad_norm": 0.5270779728889465, + "learning_rate": 8.279659332860185e-05, + "loss": 0.01929387152194977, + "step": 121250 + }, + { + "epoch": 17.21220723917672, + "grad_norm": 0.3697451949119568, + "learning_rate": 8.279517388218595e-05, + "loss": 0.006632042676210403, + "step": 121260 + }, + { + "epoch": 17.21362668559262, + "grad_norm": 0.06791370362043381, + "learning_rate": 8.279375443577006e-05, + "loss": 0.00866928994655609, + "step": 121270 + }, + { + "epoch": 17.215046132008517, + "grad_norm": 0.09903264790773392, + "learning_rate": 8.279233498935416e-05, + "loss": 0.008664032816886902, + "step": 121280 + }, + { + "epoch": 17.216465578424415, + "grad_norm": 7.171376705169678, + "learning_rate": 8.279091554293827e-05, + "loss": 
0.03021889626979828, + "step": 121290 + }, + { + "epoch": 17.217885024840314, + "grad_norm": 0.05718240141868591, + "learning_rate": 8.278949609652235e-05, + "loss": 0.0042637944221496586, + "step": 121300 + }, + { + "epoch": 17.21930447125621, + "grad_norm": 1.197914719581604, + "learning_rate": 8.278807665010646e-05, + "loss": 0.0027816496789455415, + "step": 121310 + }, + { + "epoch": 17.220723917672107, + "grad_norm": 0.26062679290771484, + "learning_rate": 8.278665720369056e-05, + "loss": 0.011219573765993118, + "step": 121320 + }, + { + "epoch": 17.222143364088005, + "grad_norm": 0.691809356212616, + "learning_rate": 8.278523775727467e-05, + "loss": 0.018367362022399903, + "step": 121330 + }, + { + "epoch": 17.223562810503903, + "grad_norm": 0.027361121028661728, + "learning_rate": 8.278381831085877e-05, + "loss": 0.0044341754168272015, + "step": 121340 + }, + { + "epoch": 17.2249822569198, + "grad_norm": 0.09908737242221832, + "learning_rate": 8.278239886444287e-05, + "loss": 0.028595495223999023, + "step": 121350 + }, + { + "epoch": 17.2264017033357, + "grad_norm": 0.009565730579197407, + "learning_rate": 8.278097941802698e-05, + "loss": 0.01315288245677948, + "step": 121360 + }, + { + "epoch": 17.2278211497516, + "grad_norm": 0.06306063383817673, + "learning_rate": 8.277955997161107e-05, + "loss": 0.004615992680191994, + "step": 121370 + }, + { + "epoch": 17.229240596167493, + "grad_norm": 0.03943710774183273, + "learning_rate": 8.277814052519519e-05, + "loss": 0.005523869767785072, + "step": 121380 + }, + { + "epoch": 17.23066004258339, + "grad_norm": 0.16066552698612213, + "learning_rate": 8.277672107877928e-05, + "loss": 0.030437877774238585, + "step": 121390 + }, + { + "epoch": 17.23207948899929, + "grad_norm": 0.5343520641326904, + "learning_rate": 8.277530163236338e-05, + "loss": 0.016141740977764128, + "step": 121400 + }, + { + "epoch": 17.233498935415188, + "grad_norm": 0.10204742848873138, + "learning_rate": 8.277388218594748e-05, + "loss": 
0.04754527807235718, + "step": 121410 + }, + { + "epoch": 17.234918381831086, + "grad_norm": 0.04669100418686867, + "learning_rate": 8.277246273953159e-05, + "loss": 0.00649983286857605, + "step": 121420 + }, + { + "epoch": 17.236337828246985, + "grad_norm": 19.72222900390625, + "learning_rate": 8.277104329311569e-05, + "loss": 0.03361916840076447, + "step": 121430 + }, + { + "epoch": 17.237757274662883, + "grad_norm": 1.6603556871414185, + "learning_rate": 8.27696238466998e-05, + "loss": 0.006782519817352295, + "step": 121440 + }, + { + "epoch": 17.239176721078778, + "grad_norm": 2.3674466609954834, + "learning_rate": 8.27682044002839e-05, + "loss": 0.024116578698158263, + "step": 121450 + }, + { + "epoch": 17.240596167494676, + "grad_norm": 2.121873140335083, + "learning_rate": 8.276678495386799e-05, + "loss": 0.007182253897190094, + "step": 121460 + }, + { + "epoch": 17.242015613910574, + "grad_norm": 9.855586051940918, + "learning_rate": 8.27653655074521e-05, + "loss": 0.028914478421211243, + "step": 121470 + }, + { + "epoch": 17.243435060326473, + "grad_norm": 6.052547454833984, + "learning_rate": 8.27639460610362e-05, + "loss": 0.029678156971931456, + "step": 121480 + }, + { + "epoch": 17.24485450674237, + "grad_norm": 0.012733505107462406, + "learning_rate": 8.276252661462031e-05, + "loss": 0.015345364809036255, + "step": 121490 + }, + { + "epoch": 17.24627395315827, + "grad_norm": 8.1347017288208, + "learning_rate": 8.27611071682044e-05, + "loss": 0.021826182305812836, + "step": 121500 + }, + { + "epoch": 17.24627395315827, + "eval_accuracy": 0.9786990525847269, + "eval_loss": 0.07808168977499008, + "eval_runtime": 32.7353, + "eval_samples_per_second": 480.429, + "eval_steps_per_second": 15.03, + "step": 121500 + }, + { + "epoch": 17.247693399574167, + "grad_norm": 1.4311810731887817, + "learning_rate": 8.27596877217885e-05, + "loss": 0.03259517550468445, + "step": 121510 + }, + { + "epoch": 17.249112845990062, + "grad_norm": 0.18520532548427582, + 
"learning_rate": 8.27582682753726e-05, + "loss": 0.029880058765411378, + "step": 121520 + }, + { + "epoch": 17.25053229240596, + "grad_norm": 3.1363275051116943, + "learning_rate": 8.275684882895671e-05, + "loss": 0.01889028251171112, + "step": 121530 + }, + { + "epoch": 17.25195173882186, + "grad_norm": 7.482678413391113, + "learning_rate": 8.275542938254081e-05, + "loss": 0.04509726464748383, + "step": 121540 + }, + { + "epoch": 17.253371185237757, + "grad_norm": 0.14285561442375183, + "learning_rate": 8.275400993612491e-05, + "loss": 0.0074087709188461305, + "step": 121550 + }, + { + "epoch": 17.254790631653655, + "grad_norm": 5.067549228668213, + "learning_rate": 8.275259048970902e-05, + "loss": 0.06535211801528931, + "step": 121560 + }, + { + "epoch": 17.256210078069554, + "grad_norm": 1.6123889684677124, + "learning_rate": 8.275117104329312e-05, + "loss": 0.005470262467861175, + "step": 121570 + }, + { + "epoch": 17.257629524485452, + "grad_norm": 0.4612501561641693, + "learning_rate": 8.274975159687723e-05, + "loss": 0.02524920701980591, + "step": 121580 + }, + { + "epoch": 17.259048970901347, + "grad_norm": 0.20753082633018494, + "learning_rate": 8.274833215046132e-05, + "loss": 0.007443346083164215, + "step": 121590 + }, + { + "epoch": 17.260468417317245, + "grad_norm": 2.3877856731414795, + "learning_rate": 8.274691270404544e-05, + "loss": 0.029876506328582762, + "step": 121600 + }, + { + "epoch": 17.261887863733143, + "grad_norm": 0.1580190658569336, + "learning_rate": 8.274549325762952e-05, + "loss": 0.010098446160554886, + "step": 121610 + }, + { + "epoch": 17.26330731014904, + "grad_norm": 1.5521838665008545, + "learning_rate": 8.274407381121363e-05, + "loss": 0.01123420000076294, + "step": 121620 + }, + { + "epoch": 17.26472675656494, + "grad_norm": 0.2570251226425171, + "learning_rate": 8.274265436479773e-05, + "loss": 0.03267633020877838, + "step": 121630 + }, + { + "epoch": 17.26614620298084, + "grad_norm": 0.011689351871609688, + "learning_rate": 
8.274123491838184e-05, + "loss": 0.022249601781368256, + "step": 121640 + }, + { + "epoch": 17.267565649396737, + "grad_norm": 0.24122844636440277, + "learning_rate": 8.273981547196594e-05, + "loss": 0.08932775855064393, + "step": 121650 + }, + { + "epoch": 17.26898509581263, + "grad_norm": 1.660841703414917, + "learning_rate": 8.273839602555003e-05, + "loss": 0.030342379212379457, + "step": 121660 + }, + { + "epoch": 17.27040454222853, + "grad_norm": 0.09175290167331696, + "learning_rate": 8.273697657913414e-05, + "loss": 0.042768290638923644, + "step": 121670 + }, + { + "epoch": 17.271823988644428, + "grad_norm": 0.018420705571770668, + "learning_rate": 8.273555713271824e-05, + "loss": 0.03903657495975495, + "step": 121680 + }, + { + "epoch": 17.273243435060326, + "grad_norm": 0.12965130805969238, + "learning_rate": 8.273413768630235e-05, + "loss": 0.004619887471199036, + "step": 121690 + }, + { + "epoch": 17.274662881476225, + "grad_norm": 0.061711788177490234, + "learning_rate": 8.273271823988645e-05, + "loss": 0.0060319904237985614, + "step": 121700 + }, + { + "epoch": 17.276082327892123, + "grad_norm": 2.4323067665100098, + "learning_rate": 8.273129879347055e-05, + "loss": 0.007204280793666839, + "step": 121710 + }, + { + "epoch": 17.27750177430802, + "grad_norm": 0.02041490562260151, + "learning_rate": 8.272987934705464e-05, + "loss": 0.018590529263019562, + "step": 121720 + }, + { + "epoch": 17.278921220723916, + "grad_norm": 0.2832045257091522, + "learning_rate": 8.272845990063876e-05, + "loss": 0.03935945630073547, + "step": 121730 + }, + { + "epoch": 17.280340667139814, + "grad_norm": 0.08537329733371735, + "learning_rate": 8.272704045422285e-05, + "loss": 0.0617376446723938, + "step": 121740 + }, + { + "epoch": 17.281760113555713, + "grad_norm": 10.530901908874512, + "learning_rate": 8.272562100780696e-05, + "loss": 0.013979089260101319, + "step": 121750 + }, + { + "epoch": 17.28317955997161, + "grad_norm": 0.13446474075317383, + "learning_rate": 
8.272420156139106e-05, + "loss": 0.038662633299827574, + "step": 121760 + }, + { + "epoch": 17.28459900638751, + "grad_norm": 0.2811603546142578, + "learning_rate": 8.272278211497516e-05, + "loss": 0.028803160786628722, + "step": 121770 + }, + { + "epoch": 17.286018452803408, + "grad_norm": 0.03432750329375267, + "learning_rate": 8.272136266855927e-05, + "loss": 0.010302607715129853, + "step": 121780 + }, + { + "epoch": 17.287437899219306, + "grad_norm": 0.030038023367524147, + "learning_rate": 8.271994322214337e-05, + "loss": 0.024182312190532684, + "step": 121790 + }, + { + "epoch": 17.2888573456352, + "grad_norm": 1.8961012363433838, + "learning_rate": 8.271852377572748e-05, + "loss": 0.018085433542728423, + "step": 121800 + }, + { + "epoch": 17.2902767920511, + "grad_norm": 4.688851356506348, + "learning_rate": 8.271710432931156e-05, + "loss": 0.009723028540611267, + "step": 121810 + }, + { + "epoch": 17.291696238466997, + "grad_norm": 0.5379230976104736, + "learning_rate": 8.271568488289567e-05, + "loss": 0.015512585639953613, + "step": 121820 + }, + { + "epoch": 17.293115684882896, + "grad_norm": 0.060846734791994095, + "learning_rate": 8.271426543647977e-05, + "loss": 0.0035803850740194322, + "step": 121830 + }, + { + "epoch": 17.294535131298794, + "grad_norm": 0.1275729387998581, + "learning_rate": 8.271284599006388e-05, + "loss": 0.02291179299354553, + "step": 121840 + }, + { + "epoch": 17.295954577714692, + "grad_norm": 0.041570551693439484, + "learning_rate": 8.271142654364798e-05, + "loss": 0.03817935287952423, + "step": 121850 + }, + { + "epoch": 17.29737402413059, + "grad_norm": 9.03583812713623, + "learning_rate": 8.271000709723208e-05, + "loss": 0.030699890851974488, + "step": 121860 + }, + { + "epoch": 17.298793470546485, + "grad_norm": 0.2126571238040924, + "learning_rate": 8.270858765081619e-05, + "loss": 0.005072015523910523, + "step": 121870 + }, + { + "epoch": 17.300212916962384, + "grad_norm": 0.5314030051231384, + "learning_rate": 
8.270716820440028e-05, + "loss": 0.03464093804359436, + "step": 121880 + }, + { + "epoch": 17.301632363378282, + "grad_norm": 0.17660662531852722, + "learning_rate": 8.27057487579844e-05, + "loss": 0.02229333221912384, + "step": 121890 + }, + { + "epoch": 17.30305180979418, + "grad_norm": 0.10263410210609436, + "learning_rate": 8.270432931156849e-05, + "loss": 0.03747016191482544, + "step": 121900 + }, + { + "epoch": 17.30447125621008, + "grad_norm": 0.6229006052017212, + "learning_rate": 8.270290986515259e-05, + "loss": 0.010727620124816895, + "step": 121910 + }, + { + "epoch": 17.305890702625977, + "grad_norm": 3.878967523574829, + "learning_rate": 8.270149041873669e-05, + "loss": 0.06718948483467102, + "step": 121920 + }, + { + "epoch": 17.307310149041875, + "grad_norm": 0.12267734855413437, + "learning_rate": 8.27000709723208e-05, + "loss": 0.04643978476524353, + "step": 121930 + }, + { + "epoch": 17.30872959545777, + "grad_norm": 12.556729316711426, + "learning_rate": 8.26986515259049e-05, + "loss": 0.012549221515655518, + "step": 121940 + }, + { + "epoch": 17.310149041873668, + "grad_norm": 1.200626254081726, + "learning_rate": 8.2697232079489e-05, + "loss": 0.0023645937442779543, + "step": 121950 + }, + { + "epoch": 17.311568488289566, + "grad_norm": 1.4385911226272583, + "learning_rate": 8.269581263307312e-05, + "loss": 0.018995174765586854, + "step": 121960 + }, + { + "epoch": 17.312987934705465, + "grad_norm": 1.4057707786560059, + "learning_rate": 8.26943931866572e-05, + "loss": 0.006676866114139557, + "step": 121970 + }, + { + "epoch": 17.314407381121363, + "grad_norm": 0.4110714793205261, + "learning_rate": 8.269297374024131e-05, + "loss": 0.004142989590764046, + "step": 121980 + }, + { + "epoch": 17.31582682753726, + "grad_norm": 2.908630609512329, + "learning_rate": 8.269155429382541e-05, + "loss": 0.003224276378750801, + "step": 121990 + }, + { + "epoch": 17.31724627395316, + "grad_norm": 0.3079705536365509, + "learning_rate": 8.269013484740952e-05, 
+ "loss": 0.009928861260414123, + "step": 122000 + }, + { + "epoch": 17.31724627395316, + "eval_accuracy": 0.9861384879506581, + "eval_loss": 0.04895373061299324, + "eval_runtime": 33.3139, + "eval_samples_per_second": 472.085, + "eval_steps_per_second": 14.769, + "step": 122000 + }, + { + "epoch": 17.318665720369054, + "grad_norm": 0.014612467028200626, + "learning_rate": 8.268871540099362e-05, + "loss": 0.006916648149490357, + "step": 122010 + }, + { + "epoch": 17.320085166784953, + "grad_norm": 11.766655921936035, + "learning_rate": 8.268729595457772e-05, + "loss": 0.016812363266944887, + "step": 122020 + }, + { + "epoch": 17.32150461320085, + "grad_norm": 8.348554611206055, + "learning_rate": 8.268587650816181e-05, + "loss": 0.005784840509295464, + "step": 122030 + }, + { + "epoch": 17.32292405961675, + "grad_norm": 0.24908818304538727, + "learning_rate": 8.268445706174592e-05, + "loss": 0.006596586108207703, + "step": 122040 + }, + { + "epoch": 17.324343506032648, + "grad_norm": 0.5274747014045715, + "learning_rate": 8.268303761533003e-05, + "loss": 0.01484144628047943, + "step": 122050 + }, + { + "epoch": 17.325762952448546, + "grad_norm": 0.1944778710603714, + "learning_rate": 8.268161816891413e-05, + "loss": 0.02620922923088074, + "step": 122060 + }, + { + "epoch": 17.327182398864444, + "grad_norm": 26.002870559692383, + "learning_rate": 8.268019872249823e-05, + "loss": 0.07551113963127136, + "step": 122070 + }, + { + "epoch": 17.32860184528034, + "grad_norm": 9.135618209838867, + "learning_rate": 8.267877927608233e-05, + "loss": 0.03293728232383728, + "step": 122080 + }, + { + "epoch": 17.330021291696237, + "grad_norm": 0.08039802312850952, + "learning_rate": 8.267735982966644e-05, + "loss": 0.013569638133049011, + "step": 122090 + }, + { + "epoch": 17.331440738112136, + "grad_norm": 0.0037199612706899643, + "learning_rate": 8.267594038325053e-05, + "loss": 0.024605175852775572, + "step": 122100 + }, + { + "epoch": 17.332860184528034, + "grad_norm": 
0.17483936250209808, + "learning_rate": 8.267452093683465e-05, + "loss": 0.049089929461479186, + "step": 122110 + }, + { + "epoch": 17.334279630943932, + "grad_norm": 8.319599151611328, + "learning_rate": 8.267310149041873e-05, + "loss": 0.048873132467269896, + "step": 122120 + }, + { + "epoch": 17.33569907735983, + "grad_norm": 0.996322751045227, + "learning_rate": 8.267168204400284e-05, + "loss": 0.020006181299686433, + "step": 122130 + }, + { + "epoch": 17.33711852377573, + "grad_norm": 12.19799518585205, + "learning_rate": 8.267026259758695e-05, + "loss": 0.02444319725036621, + "step": 122140 + }, + { + "epoch": 17.338537970191624, + "grad_norm": 0.5746012330055237, + "learning_rate": 8.266884315117105e-05, + "loss": 0.014555779099464417, + "step": 122150 + }, + { + "epoch": 17.339957416607522, + "grad_norm": 0.1442962884902954, + "learning_rate": 8.266742370475516e-05, + "loss": 0.0333253413438797, + "step": 122160 + }, + { + "epoch": 17.34137686302342, + "grad_norm": 0.23911288380622864, + "learning_rate": 8.266600425833924e-05, + "loss": 0.02843923568725586, + "step": 122170 + }, + { + "epoch": 17.34279630943932, + "grad_norm": 3.6721339225769043, + "learning_rate": 8.266458481192335e-05, + "loss": 0.015358135104179382, + "step": 122180 + }, + { + "epoch": 17.344215755855217, + "grad_norm": 0.6218733191490173, + "learning_rate": 8.266316536550745e-05, + "loss": 0.02243678867816925, + "step": 122190 + }, + { + "epoch": 17.345635202271115, + "grad_norm": 1.7230888605117798, + "learning_rate": 8.266174591909156e-05, + "loss": 0.02495470941066742, + "step": 122200 + }, + { + "epoch": 17.347054648687013, + "grad_norm": 14.371573448181152, + "learning_rate": 8.266032647267566e-05, + "loss": 0.02764323353767395, + "step": 122210 + }, + { + "epoch": 17.348474095102908, + "grad_norm": 2.212644338607788, + "learning_rate": 8.265890702625976e-05, + "loss": 0.0040929153561592106, + "step": 122220 + }, + { + "epoch": 17.349893541518806, + "grad_norm": 4.548835754394531, 
+ "learning_rate": 8.265748757984387e-05, + "loss": 0.08133320808410645, + "step": 122230 + }, + { + "epoch": 17.351312987934705, + "grad_norm": 4.254519939422607, + "learning_rate": 8.265606813342797e-05, + "loss": 0.006079412624239921, + "step": 122240 + }, + { + "epoch": 17.352732434350603, + "grad_norm": 0.9229611158370972, + "learning_rate": 8.265464868701208e-05, + "loss": 0.012695755064487457, + "step": 122250 + }, + { + "epoch": 17.3541518807665, + "grad_norm": 10.626506805419922, + "learning_rate": 8.265322924059617e-05, + "loss": 0.06295732855796814, + "step": 122260 + }, + { + "epoch": 17.3555713271824, + "grad_norm": 0.015296485275030136, + "learning_rate": 8.265180979418027e-05, + "loss": 0.009174713492393493, + "step": 122270 + }, + { + "epoch": 17.356990773598298, + "grad_norm": 6.6800031661987305, + "learning_rate": 8.265039034776437e-05, + "loss": 0.0033336080610752105, + "step": 122280 + }, + { + "epoch": 17.358410220014193, + "grad_norm": 0.12137117236852646, + "learning_rate": 8.264897090134848e-05, + "loss": 0.003195350244641304, + "step": 122290 + }, + { + "epoch": 17.35982966643009, + "grad_norm": 0.057339224964380264, + "learning_rate": 8.264755145493258e-05, + "loss": 0.011133617162704468, + "step": 122300 + }, + { + "epoch": 17.36124911284599, + "grad_norm": 0.023208029568195343, + "learning_rate": 8.264613200851669e-05, + "loss": 0.012216722965240479, + "step": 122310 + }, + { + "epoch": 17.362668559261888, + "grad_norm": 8.354445457458496, + "learning_rate": 8.264471256210079e-05, + "loss": 0.019915086030960084, + "step": 122320 + }, + { + "epoch": 17.364088005677786, + "grad_norm": 10.264193534851074, + "learning_rate": 8.264329311568488e-05, + "loss": 0.060139000415802, + "step": 122330 + }, + { + "epoch": 17.365507452093684, + "grad_norm": 1.1199427843093872, + "learning_rate": 8.2641873669269e-05, + "loss": 0.03597682416439056, + "step": 122340 + }, + { + "epoch": 17.366926898509583, + "grad_norm": 10.73701286315918, + 
"learning_rate": 8.264045422285309e-05, + "loss": 0.01692398190498352, + "step": 122350 + }, + { + "epoch": 17.368346344925477, + "grad_norm": 0.06042497605085373, + "learning_rate": 8.26390347764372e-05, + "loss": 0.007322944700717926, + "step": 122360 + }, + { + "epoch": 17.369765791341376, + "grad_norm": 1.2759391069412231, + "learning_rate": 8.26376153300213e-05, + "loss": 0.016948218643665313, + "step": 122370 + }, + { + "epoch": 17.371185237757274, + "grad_norm": 1.5327088832855225, + "learning_rate": 8.26361958836054e-05, + "loss": 0.040832871198654176, + "step": 122380 + }, + { + "epoch": 17.372604684173172, + "grad_norm": 2.6886208057403564, + "learning_rate": 8.26347764371895e-05, + "loss": 0.006062294170260429, + "step": 122390 + }, + { + "epoch": 17.37402413058907, + "grad_norm": 1.667949914932251, + "learning_rate": 8.26333569907736e-05, + "loss": 0.009034250676631928, + "step": 122400 + }, + { + "epoch": 17.37544357700497, + "grad_norm": 5.088666915893555, + "learning_rate": 8.26319375443577e-05, + "loss": 0.03124779760837555, + "step": 122410 + }, + { + "epoch": 17.376863023420867, + "grad_norm": 1.2055740356445312, + "learning_rate": 8.263051809794181e-05, + "loss": 0.05579102635383606, + "step": 122420 + }, + { + "epoch": 17.378282469836762, + "grad_norm": 11.72123908996582, + "learning_rate": 8.262909865152591e-05, + "loss": 0.05243555903434753, + "step": 122430 + }, + { + "epoch": 17.37970191625266, + "grad_norm": 0.23794768750667572, + "learning_rate": 8.262767920511001e-05, + "loss": 0.034197428822517396, + "step": 122440 + }, + { + "epoch": 17.38112136266856, + "grad_norm": 0.012232398614287376, + "learning_rate": 8.262625975869412e-05, + "loss": 0.008090271055698395, + "step": 122450 + }, + { + "epoch": 17.382540809084457, + "grad_norm": 0.5846762657165527, + "learning_rate": 8.262484031227822e-05, + "loss": 0.020568540692329405, + "step": 122460 + }, + { + "epoch": 17.383960255500355, + "grad_norm": 7.196073055267334, + "learning_rate": 
8.262342086586233e-05, + "loss": 0.052748876810073855, + "step": 122470 + }, + { + "epoch": 17.385379701916253, + "grad_norm": 6.977900981903076, + "learning_rate": 8.262200141944641e-05, + "loss": 0.07267424464225769, + "step": 122480 + }, + { + "epoch": 17.386799148332152, + "grad_norm": 1.2999193668365479, + "learning_rate": 8.262058197303052e-05, + "loss": 0.008215299993753433, + "step": 122490 + }, + { + "epoch": 17.388218594748047, + "grad_norm": 0.12169066816568375, + "learning_rate": 8.261916252661462e-05, + "loss": 0.03030954897403717, + "step": 122500 + }, + { + "epoch": 17.388218594748047, + "eval_accuracy": 0.9837858459973294, + "eval_loss": 0.059243522584438324, + "eval_runtime": 32.5355, + "eval_samples_per_second": 483.379, + "eval_steps_per_second": 15.122, + "step": 122500 + }, + { + "epoch": 17.389638041163945, + "grad_norm": 0.11882514506578445, + "learning_rate": 8.261774308019873e-05, + "loss": 0.011411736905574798, + "step": 122510 + }, + { + "epoch": 17.391057487579843, + "grad_norm": 0.280032217502594, + "learning_rate": 8.261632363378283e-05, + "loss": 0.0065232284367084505, + "step": 122520 + }, + { + "epoch": 17.39247693399574, + "grad_norm": 10.429749488830566, + "learning_rate": 8.261490418736693e-05, + "loss": 0.017095638811588286, + "step": 122530 + }, + { + "epoch": 17.39389638041164, + "grad_norm": 0.5666552186012268, + "learning_rate": 8.261348474095104e-05, + "loss": 0.01274164617061615, + "step": 122540 + }, + { + "epoch": 17.395315826827538, + "grad_norm": 0.42551353573799133, + "learning_rate": 8.261206529453513e-05, + "loss": 0.007419595122337341, + "step": 122550 + }, + { + "epoch": 17.396735273243436, + "grad_norm": 0.09856267273426056, + "learning_rate": 8.261064584811924e-05, + "loss": 0.0018107056617736816, + "step": 122560 + }, + { + "epoch": 17.39815471965933, + "grad_norm": 0.04244295507669449, + "learning_rate": 8.260922640170334e-05, + "loss": 0.04652504920959473, + "step": 122570 + }, + { + "epoch": 
17.39957416607523, + "grad_norm": 0.38564613461494446, + "learning_rate": 8.260780695528744e-05, + "loss": 0.026515766978263855, + "step": 122580 + }, + { + "epoch": 17.400993612491128, + "grad_norm": 4.476937770843506, + "learning_rate": 8.260638750887154e-05, + "loss": 0.0071770638227462765, + "step": 122590 + }, + { + "epoch": 17.402413058907026, + "grad_norm": 0.33714860677719116, + "learning_rate": 8.260496806245565e-05, + "loss": 0.02237725555896759, + "step": 122600 + }, + { + "epoch": 17.403832505322924, + "grad_norm": 4.282869338989258, + "learning_rate": 8.260354861603974e-05, + "loss": 0.04165985584259033, + "step": 122610 + }, + { + "epoch": 17.405251951738823, + "grad_norm": 6.363341808319092, + "learning_rate": 8.260212916962386e-05, + "loss": 0.049985453486442566, + "step": 122620 + }, + { + "epoch": 17.40667139815472, + "grad_norm": 0.8332139849662781, + "learning_rate": 8.260070972320795e-05, + "loss": 0.00713193416595459, + "step": 122630 + }, + { + "epoch": 17.408090844570616, + "grad_norm": 0.33843275904655457, + "learning_rate": 8.259929027679205e-05, + "loss": 0.03059439957141876, + "step": 122640 + }, + { + "epoch": 17.409510290986514, + "grad_norm": 14.195676803588867, + "learning_rate": 8.259787083037616e-05, + "loss": 0.018163122236728668, + "step": 122650 + }, + { + "epoch": 17.410929737402412, + "grad_norm": 9.23517894744873, + "learning_rate": 8.259645138396026e-05, + "loss": 0.026523211598396303, + "step": 122660 + }, + { + "epoch": 17.41234918381831, + "grad_norm": 0.05418802425265312, + "learning_rate": 8.259503193754437e-05, + "loss": 0.02210947871208191, + "step": 122670 + }, + { + "epoch": 17.41376863023421, + "grad_norm": 1.8904386758804321, + "learning_rate": 8.259361249112847e-05, + "loss": 0.004061230272054672, + "step": 122680 + }, + { + "epoch": 17.415188076650107, + "grad_norm": 0.045158129185438156, + "learning_rate": 8.259219304471256e-05, + "loss": 0.004564892128109932, + "step": 122690 + }, + { + "epoch": 
17.416607523066006, + "grad_norm": 3.936619997024536, + "learning_rate": 8.259077359829666e-05, + "loss": 0.020028875768184663, + "step": 122700 + }, + { + "epoch": 17.4180269694819, + "grad_norm": 12.677026748657227, + "learning_rate": 8.258935415188077e-05, + "loss": 0.022690902650356292, + "step": 122710 + }, + { + "epoch": 17.4194464158978, + "grad_norm": 1.4186701774597168, + "learning_rate": 8.258793470546487e-05, + "loss": 0.0031532850116491318, + "step": 122720 + }, + { + "epoch": 17.420865862313697, + "grad_norm": 1.636500358581543, + "learning_rate": 8.258651525904898e-05, + "loss": 0.005997669324278832, + "step": 122730 + }, + { + "epoch": 17.422285308729595, + "grad_norm": 7.789987087249756, + "learning_rate": 8.258509581263308e-05, + "loss": 0.018677373230457307, + "step": 122740 + }, + { + "epoch": 17.423704755145494, + "grad_norm": 0.18241193890571594, + "learning_rate": 8.258367636621718e-05, + "loss": 0.0023742862045764922, + "step": 122750 + }, + { + "epoch": 17.425124201561392, + "grad_norm": 7.324385643005371, + "learning_rate": 8.258225691980129e-05, + "loss": 0.013009896874427796, + "step": 122760 + }, + { + "epoch": 17.42654364797729, + "grad_norm": 0.04243845120072365, + "learning_rate": 8.258083747338538e-05, + "loss": 0.011265130341053009, + "step": 122770 + }, + { + "epoch": 17.427963094393185, + "grad_norm": 6.735060214996338, + "learning_rate": 8.25794180269695e-05, + "loss": 0.049876469373703006, + "step": 122780 + }, + { + "epoch": 17.429382540809083, + "grad_norm": 7.400155544281006, + "learning_rate": 8.257799858055358e-05, + "loss": 0.0073287680745124815, + "step": 122790 + }, + { + "epoch": 17.43080198722498, + "grad_norm": 0.12465141713619232, + "learning_rate": 8.257657913413769e-05, + "loss": 0.027358931303024293, + "step": 122800 + }, + { + "epoch": 17.43222143364088, + "grad_norm": 0.1785537749528885, + "learning_rate": 8.257515968772179e-05, + "loss": 0.032534864544868466, + "step": 122810 + }, + { + "epoch": 
17.433640880056778, + "grad_norm": 11.284260749816895, + "learning_rate": 8.25737402413059e-05, + "loss": 0.064073246717453, + "step": 122820 + }, + { + "epoch": 17.435060326472676, + "grad_norm": 8.10233211517334, + "learning_rate": 8.257232079489e-05, + "loss": 0.016384488344192503, + "step": 122830 + }, + { + "epoch": 17.436479772888575, + "grad_norm": 0.02594900317490101, + "learning_rate": 8.257090134847409e-05, + "loss": 0.06889531016349792, + "step": 122840 + }, + { + "epoch": 17.43789921930447, + "grad_norm": 7.683279514312744, + "learning_rate": 8.25694819020582e-05, + "loss": 0.007571496069431305, + "step": 122850 + }, + { + "epoch": 17.439318665720368, + "grad_norm": 0.5677283406257629, + "learning_rate": 8.25680624556423e-05, + "loss": 0.024730677902698516, + "step": 122860 + }, + { + "epoch": 17.440738112136266, + "grad_norm": 0.08760765194892883, + "learning_rate": 8.256664300922641e-05, + "loss": 0.015320856869220734, + "step": 122870 + }, + { + "epoch": 17.442157558552164, + "grad_norm": 19.979145050048828, + "learning_rate": 8.256522356281051e-05, + "loss": 0.058557575941085814, + "step": 122880 + }, + { + "epoch": 17.443577004968063, + "grad_norm": 0.24306319653987885, + "learning_rate": 8.256380411639461e-05, + "loss": 0.0089756540954113, + "step": 122890 + }, + { + "epoch": 17.44499645138396, + "grad_norm": 0.09230535477399826, + "learning_rate": 8.25623846699787e-05, + "loss": 0.036602020263671875, + "step": 122900 + }, + { + "epoch": 17.44641589779986, + "grad_norm": 10.769634246826172, + "learning_rate": 8.256096522356282e-05, + "loss": 0.03061630725860596, + "step": 122910 + }, + { + "epoch": 17.447835344215754, + "grad_norm": 1.5239731073379517, + "learning_rate": 8.255954577714691e-05, + "loss": 0.00846146047115326, + "step": 122920 + }, + { + "epoch": 17.449254790631652, + "grad_norm": 0.0684681311249733, + "learning_rate": 8.255812633073102e-05, + "loss": 0.005754857137799263, + "step": 122930 + }, + { + "epoch": 17.45067423704755, + 
"grad_norm": 1.251187801361084, + "learning_rate": 8.255670688431512e-05, + "loss": 0.011119232326745988, + "step": 122940 + }, + { + "epoch": 17.45209368346345, + "grad_norm": 7.6524224281311035, + "learning_rate": 8.255528743789922e-05, + "loss": 0.012783028185367584, + "step": 122950 + }, + { + "epoch": 17.453513129879347, + "grad_norm": 1.5091584920883179, + "learning_rate": 8.255386799148333e-05, + "loss": 0.005531028658151626, + "step": 122960 + }, + { + "epoch": 17.454932576295246, + "grad_norm": 10.661508560180664, + "learning_rate": 8.255244854506743e-05, + "loss": 0.029649490118026735, + "step": 122970 + }, + { + "epoch": 17.456352022711144, + "grad_norm": 0.053090885281562805, + "learning_rate": 8.255102909865154e-05, + "loss": 0.021497386693954467, + "step": 122980 + }, + { + "epoch": 17.45777146912704, + "grad_norm": 0.14183126389980316, + "learning_rate": 8.254960965223562e-05, + "loss": 0.016825027763843536, + "step": 122990 + }, + { + "epoch": 17.459190915542937, + "grad_norm": 0.06370903551578522, + "learning_rate": 8.254819020581973e-05, + "loss": 0.02068781554698944, + "step": 123000 + }, + { + "epoch": 17.459190915542937, + "eval_accuracy": 0.9883639600686717, + "eval_loss": 0.04308421537280083, + "eval_runtime": 32.7742, + "eval_samples_per_second": 479.859, + "eval_steps_per_second": 15.012, + "step": 123000 + }, + { + "epoch": 17.460610361958835, + "grad_norm": 0.2073710411787033, + "learning_rate": 8.254677075940383e-05, + "loss": 0.01421993225812912, + "step": 123010 + }, + { + "epoch": 17.462029808374734, + "grad_norm": 0.29374533891677856, + "learning_rate": 8.254535131298794e-05, + "loss": 0.009316784143447877, + "step": 123020 + }, + { + "epoch": 17.463449254790632, + "grad_norm": 1.4187288284301758, + "learning_rate": 8.254393186657204e-05, + "loss": 0.026600950956344606, + "step": 123030 + }, + { + "epoch": 17.46486870120653, + "grad_norm": 1.2814127206802368, + "learning_rate": 8.254251242015615e-05, + "loss": 0.033495780825614926, + 
"step": 123040 + }, + { + "epoch": 17.46628814762243, + "grad_norm": 0.09562135487794876, + "learning_rate": 8.254109297374025e-05, + "loss": 0.029808908700942993, + "step": 123050 + }, + { + "epoch": 17.467707594038323, + "grad_norm": 14.986631393432617, + "learning_rate": 8.253967352732434e-05, + "loss": 0.02389901578426361, + "step": 123060 + }, + { + "epoch": 17.46912704045422, + "grad_norm": 0.3095096945762634, + "learning_rate": 8.253825408090845e-05, + "loss": 0.01468038409948349, + "step": 123070 + }, + { + "epoch": 17.47054648687012, + "grad_norm": 0.12282241135835648, + "learning_rate": 8.253683463449255e-05, + "loss": 0.011646793782711029, + "step": 123080 + }, + { + "epoch": 17.471965933286018, + "grad_norm": 0.03262794017791748, + "learning_rate": 8.253541518807666e-05, + "loss": 0.002748313918709755, + "step": 123090 + }, + { + "epoch": 17.473385379701917, + "grad_norm": 0.7681041359901428, + "learning_rate": 8.253399574166075e-05, + "loss": 0.03596278131008148, + "step": 123100 + }, + { + "epoch": 17.474804826117815, + "grad_norm": 1.0353686809539795, + "learning_rate": 8.253257629524486e-05, + "loss": 0.004134359210729599, + "step": 123110 + }, + { + "epoch": 17.476224272533713, + "grad_norm": 0.7349243760108948, + "learning_rate": 8.253115684882896e-05, + "loss": 0.006130160763859749, + "step": 123120 + }, + { + "epoch": 17.477643718949608, + "grad_norm": 0.018726080656051636, + "learning_rate": 8.252973740241307e-05, + "loss": 0.004772935435175896, + "step": 123130 + }, + { + "epoch": 17.479063165365506, + "grad_norm": 0.2598189413547516, + "learning_rate": 8.252831795599716e-05, + "loss": 0.011500736325979232, + "step": 123140 + }, + { + "epoch": 17.480482611781405, + "grad_norm": 0.9024463295936584, + "learning_rate": 8.252689850958126e-05, + "loss": 0.020366585254669188, + "step": 123150 + }, + { + "epoch": 17.481902058197303, + "grad_norm": 11.113703727722168, + "learning_rate": 8.252547906316537e-05, + "loss": 0.02417703568935394, + "step": 
123160 + }, + { + "epoch": 17.4833215046132, + "grad_norm": 0.005567350424826145, + "learning_rate": 8.252405961674947e-05, + "loss": 0.01901007890701294, + "step": 123170 + }, + { + "epoch": 17.4847409510291, + "grad_norm": 0.3311940133571625, + "learning_rate": 8.252264017033358e-05, + "loss": 0.012172038853168487, + "step": 123180 + }, + { + "epoch": 17.486160397444998, + "grad_norm": 0.018239812925457954, + "learning_rate": 8.252122072391768e-05, + "loss": 0.0036922723054885866, + "step": 123190 + }, + { + "epoch": 17.487579843860892, + "grad_norm": 0.007090285886079073, + "learning_rate": 8.251980127750177e-05, + "loss": 0.0167811781167984, + "step": 123200 + }, + { + "epoch": 17.48899929027679, + "grad_norm": 4.8959269523620605, + "learning_rate": 8.251838183108587e-05, + "loss": 0.06325067281723022, + "step": 123210 + }, + { + "epoch": 17.49041873669269, + "grad_norm": 0.44174933433532715, + "learning_rate": 8.251696238466998e-05, + "loss": 0.008886340260505676, + "step": 123220 + }, + { + "epoch": 17.491838183108587, + "grad_norm": 12.290687561035156, + "learning_rate": 8.251554293825408e-05, + "loss": 0.031110918521881102, + "step": 123230 + }, + { + "epoch": 17.493257629524486, + "grad_norm": 2.3835344314575195, + "learning_rate": 8.251412349183819e-05, + "loss": 0.05299111008644104, + "step": 123240 + }, + { + "epoch": 17.494677075940384, + "grad_norm": 0.015629790723323822, + "learning_rate": 8.251270404542229e-05, + "loss": 0.04958618879318237, + "step": 123250 + }, + { + "epoch": 17.496096522356282, + "grad_norm": 0.18703432381153107, + "learning_rate": 8.251128459900639e-05, + "loss": 0.013501530885696411, + "step": 123260 + }, + { + "epoch": 17.497515968772177, + "grad_norm": 7.4073991775512695, + "learning_rate": 8.25098651525905e-05, + "loss": 0.042862167954444884, + "step": 123270 + }, + { + "epoch": 17.498935415188075, + "grad_norm": 3.2967214584350586, + "learning_rate": 8.25084457061746e-05, + "loss": 0.004912526533007622, + "step": 123280 + 
}, + { + "epoch": 17.500354861603974, + "grad_norm": 2.1429383754730225, + "learning_rate": 8.25070262597587e-05, + "loss": 0.05954912304878235, + "step": 123290 + }, + { + "epoch": 17.501774308019872, + "grad_norm": 0.5956802368164062, + "learning_rate": 8.250560681334279e-05, + "loss": 0.031099620461463928, + "step": 123300 + }, + { + "epoch": 17.50319375443577, + "grad_norm": 0.24786630272865295, + "learning_rate": 8.25041873669269e-05, + "loss": 0.018831300735473632, + "step": 123310 + }, + { + "epoch": 17.50461320085167, + "grad_norm": 0.10456979274749756, + "learning_rate": 8.2502767920511e-05, + "loss": 0.011707174777984618, + "step": 123320 + }, + { + "epoch": 17.506032647267567, + "grad_norm": 0.050170306116342545, + "learning_rate": 8.250134847409511e-05, + "loss": 0.039228835701942445, + "step": 123330 + }, + { + "epoch": 17.50745209368346, + "grad_norm": 2.9133970737457275, + "learning_rate": 8.24999290276792e-05, + "loss": 0.003978077322244644, + "step": 123340 + }, + { + "epoch": 17.50887154009936, + "grad_norm": 2.6007888317108154, + "learning_rate": 8.24985095812633e-05, + "loss": 0.05585094690322876, + "step": 123350 + }, + { + "epoch": 17.51029098651526, + "grad_norm": 0.1047113761305809, + "learning_rate": 8.249709013484741e-05, + "loss": 0.012924128770828247, + "step": 123360 + }, + { + "epoch": 17.511710432931157, + "grad_norm": 0.23884299397468567, + "learning_rate": 8.249567068843151e-05, + "loss": 0.022875207662582397, + "step": 123370 + }, + { + "epoch": 17.513129879347055, + "grad_norm": 0.16158747673034668, + "learning_rate": 8.249425124201562e-05, + "loss": 0.02361272871494293, + "step": 123380 + }, + { + "epoch": 17.514549325762953, + "grad_norm": 3.2364308834075928, + "learning_rate": 8.249283179559972e-05, + "loss": 0.03016907870769501, + "step": 123390 + }, + { + "epoch": 17.51596877217885, + "grad_norm": 0.15410637855529785, + "learning_rate": 8.249141234918383e-05, + "loss": 0.015434008836746217, + "step": 123400 + }, + { + 
"epoch": 17.517388218594746, + "grad_norm": 0.05540246143937111, + "learning_rate": 8.248999290276791e-05, + "loss": 0.009972456097602844, + "step": 123410 + }, + { + "epoch": 17.518807665010645, + "grad_norm": 6.280983924865723, + "learning_rate": 8.248857345635203e-05, + "loss": 0.016720139980316163, + "step": 123420 + }, + { + "epoch": 17.520227111426543, + "grad_norm": 0.0030701293144375086, + "learning_rate": 8.248715400993612e-05, + "loss": 0.04440495073795318, + "step": 123430 + }, + { + "epoch": 17.52164655784244, + "grad_norm": 1.1619906425476074, + "learning_rate": 8.248573456352023e-05, + "loss": 0.0237629234790802, + "step": 123440 + }, + { + "epoch": 17.52306600425834, + "grad_norm": 0.05782632157206535, + "learning_rate": 8.248431511710434e-05, + "loss": 0.025528308749198914, + "step": 123450 + }, + { + "epoch": 17.524485450674238, + "grad_norm": 4.620859622955322, + "learning_rate": 8.248289567068843e-05, + "loss": 0.022039780020713808, + "step": 123460 + }, + { + "epoch": 17.525904897090136, + "grad_norm": 0.013535212725400925, + "learning_rate": 8.248147622427254e-05, + "loss": 0.013678130507469178, + "step": 123470 + }, + { + "epoch": 17.52732434350603, + "grad_norm": 1.571839690208435, + "learning_rate": 8.248005677785664e-05, + "loss": 0.015347103774547576, + "step": 123480 + }, + { + "epoch": 17.52874378992193, + "grad_norm": 12.499358177185059, + "learning_rate": 8.247863733144075e-05, + "loss": 0.09705874919891358, + "step": 123490 + }, + { + "epoch": 17.530163236337827, + "grad_norm": 0.08306962251663208, + "learning_rate": 8.247721788502485e-05, + "loss": 0.007053288817405701, + "step": 123500 + }, + { + "epoch": 17.530163236337827, + "eval_accuracy": 0.9822598079735487, + "eval_loss": 0.07242994010448456, + "eval_runtime": 32.3528, + "eval_samples_per_second": 486.109, + "eval_steps_per_second": 15.207, + "step": 123500 + }, + { + "epoch": 17.531582682753726, + "grad_norm": 0.28261151909828186, + "learning_rate": 8.247579843860894e-05, + 
"loss": 0.020450976490974427, + "step": 123510 + }, + { + "epoch": 17.533002129169624, + "grad_norm": 7.134189128875732, + "learning_rate": 8.247437899219304e-05, + "loss": 0.06149494647979736, + "step": 123520 + }, + { + "epoch": 17.534421575585522, + "grad_norm": 5.049461841583252, + "learning_rate": 8.247295954577715e-05, + "loss": 0.02939991354942322, + "step": 123530 + }, + { + "epoch": 17.53584102200142, + "grad_norm": 1.4308009147644043, + "learning_rate": 8.247154009936126e-05, + "loss": 0.0470628172159195, + "step": 123540 + }, + { + "epoch": 17.537260468417315, + "grad_norm": 0.2950064539909363, + "learning_rate": 8.247012065294536e-05, + "loss": 0.031284031271934507, + "step": 123550 + }, + { + "epoch": 17.538679914833214, + "grad_norm": 6.529641628265381, + "learning_rate": 8.246870120652946e-05, + "loss": 0.025937312841415407, + "step": 123560 + }, + { + "epoch": 17.540099361249112, + "grad_norm": 0.01473158597946167, + "learning_rate": 8.246728176011355e-05, + "loss": 0.06190433502197266, + "step": 123570 + }, + { + "epoch": 17.54151880766501, + "grad_norm": 0.28922995924949646, + "learning_rate": 8.246586231369766e-05, + "loss": 0.008700785040855408, + "step": 123580 + }, + { + "epoch": 17.54293825408091, + "grad_norm": 12.637017250061035, + "learning_rate": 8.246444286728176e-05, + "loss": 0.03645790219306946, + "step": 123590 + }, + { + "epoch": 17.544357700496807, + "grad_norm": 0.027270788326859474, + "learning_rate": 8.246302342086587e-05, + "loss": 0.015550993382930756, + "step": 123600 + }, + { + "epoch": 17.545777146912705, + "grad_norm": 8.379532814025879, + "learning_rate": 8.246160397444996e-05, + "loss": 0.021490223705768585, + "step": 123610 + }, + { + "epoch": 17.5471965933286, + "grad_norm": 7.282254219055176, + "learning_rate": 8.246018452803407e-05, + "loss": 0.01094207763671875, + "step": 123620 + }, + { + "epoch": 17.5486160397445, + "grad_norm": 0.03394869714975357, + "learning_rate": 8.245876508161818e-05, + "loss": 
0.004773364588618279, + "step": 123630 + }, + { + "epoch": 17.550035486160397, + "grad_norm": 0.06286856532096863, + "learning_rate": 8.245734563520228e-05, + "loss": 0.0032698489725589753, + "step": 123640 + }, + { + "epoch": 17.551454932576295, + "grad_norm": 1.2578529119491577, + "learning_rate": 8.245592618878639e-05, + "loss": 0.02867700159549713, + "step": 123650 + }, + { + "epoch": 17.552874378992193, + "grad_norm": 0.13294143974781036, + "learning_rate": 8.245450674237047e-05, + "loss": 0.03565240502357483, + "step": 123660 + }, + { + "epoch": 17.55429382540809, + "grad_norm": 0.19572824239730835, + "learning_rate": 8.245308729595458e-05, + "loss": 0.017168080806732176, + "step": 123670 + }, + { + "epoch": 17.55571327182399, + "grad_norm": 0.611763060092926, + "learning_rate": 8.245166784953868e-05, + "loss": 0.01786961555480957, + "step": 123680 + }, + { + "epoch": 17.557132718239885, + "grad_norm": 0.8012092113494873, + "learning_rate": 8.245024840312279e-05, + "loss": 0.011101428419351578, + "step": 123690 + }, + { + "epoch": 17.558552164655783, + "grad_norm": 0.12960731983184814, + "learning_rate": 8.244882895670689e-05, + "loss": 0.040901082754135135, + "step": 123700 + }, + { + "epoch": 17.55997161107168, + "grad_norm": 1.262016773223877, + "learning_rate": 8.244740951029098e-05, + "loss": 0.028654444217681884, + "step": 123710 + }, + { + "epoch": 17.56139105748758, + "grad_norm": 0.09878029674291611, + "learning_rate": 8.24459900638751e-05, + "loss": 0.005360887199640274, + "step": 123720 + }, + { + "epoch": 17.562810503903478, + "grad_norm": 0.01878870464861393, + "learning_rate": 8.244457061745919e-05, + "loss": 0.06400970816612243, + "step": 123730 + }, + { + "epoch": 17.564229950319376, + "grad_norm": 8.757014274597168, + "learning_rate": 8.24431511710433e-05, + "loss": 0.026718950271606444, + "step": 123740 + }, + { + "epoch": 17.565649396735274, + "grad_norm": 1.1390107870101929, + "learning_rate": 8.24417317246274e-05, + "loss": 
0.013444627821445464, + "step": 123750 + }, + { + "epoch": 17.56706884315117, + "grad_norm": 0.5454521775245667, + "learning_rate": 8.244031227821151e-05, + "loss": 0.023906174302101135, + "step": 123760 + }, + { + "epoch": 17.568488289567068, + "grad_norm": 0.45480877161026, + "learning_rate": 8.24388928317956e-05, + "loss": 0.05484031438827515, + "step": 123770 + }, + { + "epoch": 17.569907735982966, + "grad_norm": 4.2534589767456055, + "learning_rate": 8.243747338537971e-05, + "loss": 0.020327845215797426, + "step": 123780 + }, + { + "epoch": 17.571327182398864, + "grad_norm": 2.1090826988220215, + "learning_rate": 8.24360539389638e-05, + "loss": 0.01915072351694107, + "step": 123790 + }, + { + "epoch": 17.572746628814762, + "grad_norm": 0.3815958499908447, + "learning_rate": 8.243463449254792e-05, + "loss": 0.012741312384605408, + "step": 123800 + }, + { + "epoch": 17.57416607523066, + "grad_norm": 3.8844399452209473, + "learning_rate": 8.243321504613201e-05, + "loss": 0.014030416309833527, + "step": 123810 + }, + { + "epoch": 17.57558552164656, + "grad_norm": 3.2737066745758057, + "learning_rate": 8.243179559971611e-05, + "loss": 0.022963154315948486, + "step": 123820 + }, + { + "epoch": 17.577004968062454, + "grad_norm": 9.672563552856445, + "learning_rate": 8.243037615330022e-05, + "loss": 0.02373957335948944, + "step": 123830 + }, + { + "epoch": 17.578424414478352, + "grad_norm": 0.26230794191360474, + "learning_rate": 8.242895670688432e-05, + "loss": 0.020463331043720244, + "step": 123840 + }, + { + "epoch": 17.57984386089425, + "grad_norm": 1.9679028987884521, + "learning_rate": 8.242753726046843e-05, + "loss": 0.007435081154108047, + "step": 123850 + }, + { + "epoch": 17.58126330731015, + "grad_norm": 0.18290266394615173, + "learning_rate": 8.242611781405253e-05, + "loss": 0.028893864154815672, + "step": 123860 + }, + { + "epoch": 17.582682753726047, + "grad_norm": 5.281581401824951, + "learning_rate": 8.242469836763662e-05, + "loss": 
0.025062206387519836, + "step": 123870 + }, + { + "epoch": 17.584102200141945, + "grad_norm": 0.441976934671402, + "learning_rate": 8.242327892122072e-05, + "loss": 0.009923070669174194, + "step": 123880 + }, + { + "epoch": 17.585521646557844, + "grad_norm": 0.3485376536846161, + "learning_rate": 8.242185947480483e-05, + "loss": 0.023020398616790772, + "step": 123890 + }, + { + "epoch": 17.58694109297374, + "grad_norm": 0.02939898520708084, + "learning_rate": 8.242044002838893e-05, + "loss": 0.015514726936817168, + "step": 123900 + }, + { + "epoch": 17.588360539389637, + "grad_norm": 0.2501057982444763, + "learning_rate": 8.241902058197304e-05, + "loss": 0.021653544902801514, + "step": 123910 + }, + { + "epoch": 17.589779985805535, + "grad_norm": 0.25378942489624023, + "learning_rate": 8.241760113555714e-05, + "loss": 0.029783162474632262, + "step": 123920 + }, + { + "epoch": 17.591199432221433, + "grad_norm": 0.5727351903915405, + "learning_rate": 8.241618168914124e-05, + "loss": 0.01356571912765503, + "step": 123930 + }, + { + "epoch": 17.59261887863733, + "grad_norm": 0.010458029806613922, + "learning_rate": 8.241476224272535e-05, + "loss": 0.039194774627685544, + "step": 123940 + }, + { + "epoch": 17.59403832505323, + "grad_norm": 0.6755303144454956, + "learning_rate": 8.241334279630944e-05, + "loss": 0.013087576627731324, + "step": 123950 + }, + { + "epoch": 17.59545777146913, + "grad_norm": 0.9623459577560425, + "learning_rate": 8.241192334989355e-05, + "loss": 0.0032142918556928636, + "step": 123960 + }, + { + "epoch": 17.596877217885023, + "grad_norm": 1.4890637397766113, + "learning_rate": 8.241050390347764e-05, + "loss": 0.016361470520496368, + "step": 123970 + }, + { + "epoch": 17.59829666430092, + "grad_norm": 1.138136625289917, + "learning_rate": 8.240908445706175e-05, + "loss": 0.01091969683766365, + "step": 123980 + }, + { + "epoch": 17.59971611071682, + "grad_norm": 0.7977745532989502, + "learning_rate": 8.240766501064585e-05, + "loss": 
0.02227955460548401, + "step": 123990 + }, + { + "epoch": 17.601135557132718, + "grad_norm": 11.51099681854248, + "learning_rate": 8.240624556422996e-05, + "loss": 0.06129167079925537, + "step": 124000 + }, + { + "epoch": 17.601135557132718, + "eval_accuracy": 0.9866471672919184, + "eval_loss": 0.04729590564966202, + "eval_runtime": 33.4178, + "eval_samples_per_second": 470.617, + "eval_steps_per_second": 14.723, + "step": 124000 + }, + { + "epoch": 17.602555003548616, + "grad_norm": 1.4898444414138794, + "learning_rate": 8.240482611781406e-05, + "loss": 0.024441052973270417, + "step": 124010 + }, + { + "epoch": 17.603974449964515, + "grad_norm": 0.052741412073373795, + "learning_rate": 8.240340667139815e-05, + "loss": 0.04893164336681366, + "step": 124020 + }, + { + "epoch": 17.605393896380413, + "grad_norm": 0.6417841911315918, + "learning_rate": 8.240198722498226e-05, + "loss": 0.011471222341060638, + "step": 124030 + }, + { + "epoch": 17.606813342796308, + "grad_norm": 0.02762775495648384, + "learning_rate": 8.240056777856636e-05, + "loss": 0.028142285346984864, + "step": 124040 + }, + { + "epoch": 17.608232789212206, + "grad_norm": 1.8497322797775269, + "learning_rate": 8.239914833215047e-05, + "loss": 0.03893795311450958, + "step": 124050 + }, + { + "epoch": 17.609652235628104, + "grad_norm": 0.47043776512145996, + "learning_rate": 8.239772888573457e-05, + "loss": 0.014402249455451965, + "step": 124060 + }, + { + "epoch": 17.611071682044003, + "grad_norm": 1.0286686420440674, + "learning_rate": 8.239630943931868e-05, + "loss": 0.011852450668811798, + "step": 124070 + }, + { + "epoch": 17.6124911284599, + "grad_norm": 0.024061763659119606, + "learning_rate": 8.239488999290276e-05, + "loss": 0.014504016935825348, + "step": 124080 + }, + { + "epoch": 17.6139105748758, + "grad_norm": 1.218518614768982, + "learning_rate": 8.239347054648687e-05, + "loss": 0.023448963463306428, + "step": 124090 + }, + { + "epoch": 17.615330021291697, + "grad_norm": 
5.819756507873535, + "learning_rate": 8.239205110007097e-05, + "loss": 0.017736424505710603, + "step": 124100 + }, + { + "epoch": 17.616749467707596, + "grad_norm": 0.11186391115188599, + "learning_rate": 8.239063165365508e-05, + "loss": 0.012619951367378235, + "step": 124110 + }, + { + "epoch": 17.61816891412349, + "grad_norm": 12.263643264770508, + "learning_rate": 8.238921220723918e-05, + "loss": 0.04501128494739533, + "step": 124120 + }, + { + "epoch": 17.61958836053939, + "grad_norm": 0.25323036313056946, + "learning_rate": 8.238779276082328e-05, + "loss": 0.027339151501655577, + "step": 124130 + }, + { + "epoch": 17.621007806955287, + "grad_norm": 1.4857368469238281, + "learning_rate": 8.238637331440739e-05, + "loss": 0.01260230541229248, + "step": 124140 + }, + { + "epoch": 17.622427253371185, + "grad_norm": 1.9226738214492798, + "learning_rate": 8.238495386799149e-05, + "loss": 0.005465056374669075, + "step": 124150 + }, + { + "epoch": 17.623846699787084, + "grad_norm": 0.01063599530607462, + "learning_rate": 8.23835344215756e-05, + "loss": 0.02559836208820343, + "step": 124160 + }, + { + "epoch": 17.625266146202982, + "grad_norm": 0.022318635135889053, + "learning_rate": 8.23821149751597e-05, + "loss": 0.0031830746680498122, + "step": 124170 + }, + { + "epoch": 17.62668559261888, + "grad_norm": 7.479580879211426, + "learning_rate": 8.238069552874379e-05, + "loss": 0.016616930067539216, + "step": 124180 + }, + { + "epoch": 17.628105039034775, + "grad_norm": 6.861004829406738, + "learning_rate": 8.237927608232789e-05, + "loss": 0.02781272232532501, + "step": 124190 + }, + { + "epoch": 17.629524485450673, + "grad_norm": 0.3056953549385071, + "learning_rate": 8.2377856635912e-05, + "loss": 0.011820276081562043, + "step": 124200 + }, + { + "epoch": 17.63094393186657, + "grad_norm": 0.2387654185295105, + "learning_rate": 8.23764371894961e-05, + "loss": 0.03424981236457825, + "step": 124210 + }, + { + "epoch": 17.63236337828247, + "grad_norm": 13.709823608398438, 
+ "learning_rate": 8.237501774308021e-05, + "loss": 0.03825869858264923, + "step": 124220 + }, + { + "epoch": 17.63378282469837, + "grad_norm": 0.09609783440828323, + "learning_rate": 8.23735982966643e-05, + "loss": 0.034221959114074704, + "step": 124230 + }, + { + "epoch": 17.635202271114267, + "grad_norm": 0.06290843337774277, + "learning_rate": 8.23721788502484e-05, + "loss": 0.019939064979553223, + "step": 124240 + }, + { + "epoch": 17.636621717530165, + "grad_norm": 3.8382863998413086, + "learning_rate": 8.237075940383251e-05, + "loss": 0.021433869004249574, + "step": 124250 + }, + { + "epoch": 17.63804116394606, + "grad_norm": 1.720458984375, + "learning_rate": 8.236933995741661e-05, + "loss": 0.0241766482591629, + "step": 124260 + }, + { + "epoch": 17.639460610361958, + "grad_norm": 8.562639236450195, + "learning_rate": 8.236792051100072e-05, + "loss": 0.02787082493305206, + "step": 124270 + }, + { + "epoch": 17.640880056777856, + "grad_norm": 0.12311892211437225, + "learning_rate": 8.236664300922641e-05, + "loss": 0.0271776020526886, + "step": 124280 + }, + { + "epoch": 17.642299503193755, + "grad_norm": 0.6175635457038879, + "learning_rate": 8.236522356281052e-05, + "loss": 0.03380132913589477, + "step": 124290 + }, + { + "epoch": 17.643718949609653, + "grad_norm": 0.050154250115156174, + "learning_rate": 8.23638041163946e-05, + "loss": 0.023596420884132385, + "step": 124300 + }, + { + "epoch": 17.64513839602555, + "grad_norm": 0.5078491568565369, + "learning_rate": 8.236238466997871e-05, + "loss": 0.012544466555118561, + "step": 124310 + }, + { + "epoch": 17.64655784244145, + "grad_norm": 8.803682327270508, + "learning_rate": 8.236096522356281e-05, + "loss": 0.012344937026500701, + "step": 124320 + }, + { + "epoch": 17.647977288857344, + "grad_norm": 9.935019493103027, + "learning_rate": 8.235954577714692e-05, + "loss": 0.01960514485836029, + "step": 124330 + }, + { + "epoch": 17.649396735273243, + "grad_norm": 9.821824073791504, + "learning_rate": 
8.235812633073102e-05, + "loss": 0.015455111861228943, + "step": 124340 + }, + { + "epoch": 17.65081618168914, + "grad_norm": 0.24194194376468658, + "learning_rate": 8.235684882895672e-05, + "loss": 0.019938093423843384, + "step": 124350 + }, + { + "epoch": 17.65223562810504, + "grad_norm": 7.058005332946777, + "learning_rate": 8.235542938254081e-05, + "loss": 0.018306195735931396, + "step": 124360 + }, + { + "epoch": 17.653655074520938, + "grad_norm": 3.6557610034942627, + "learning_rate": 8.235400993612493e-05, + "loss": 0.008752855658531188, + "step": 124370 + }, + { + "epoch": 17.655074520936836, + "grad_norm": 0.06243341043591499, + "learning_rate": 8.235259048970901e-05, + "loss": 0.012735649943351746, + "step": 124380 + }, + { + "epoch": 17.656493967352734, + "grad_norm": 0.3967527151107788, + "learning_rate": 8.235117104329312e-05, + "loss": 0.03084678053855896, + "step": 124390 + }, + { + "epoch": 17.65791341376863, + "grad_norm": 1.7461168766021729, + "learning_rate": 8.234975159687722e-05, + "loss": 0.013518714904785156, + "step": 124400 + }, + { + "epoch": 17.659332860184527, + "grad_norm": 8.566067695617676, + "learning_rate": 8.234833215046133e-05, + "loss": 0.044089671969413755, + "step": 124410 + }, + { + "epoch": 17.660752306600425, + "grad_norm": 14.566707611083984, + "learning_rate": 8.234691270404543e-05, + "loss": 0.010162675380706787, + "step": 124420 + }, + { + "epoch": 17.662171753016324, + "grad_norm": 1.658950924873352, + "learning_rate": 8.234549325762952e-05, + "loss": 0.005507271736860275, + "step": 124430 + }, + { + "epoch": 17.663591199432222, + "grad_norm": 7.552545070648193, + "learning_rate": 8.234407381121363e-05, + "loss": 0.01709498018026352, + "step": 124440 + }, + { + "epoch": 17.66501064584812, + "grad_norm": 10.414005279541016, + "learning_rate": 8.234265436479773e-05, + "loss": 0.0319903165102005, + "step": 124450 + }, + { + "epoch": 17.66643009226402, + "grad_norm": 0.043101776391267776, + "learning_rate": 
8.234123491838184e-05, + "loss": 0.03780293762683869, + "step": 124460 + }, + { + "epoch": 17.667849538679913, + "grad_norm": 0.3531849682331085, + "learning_rate": 8.233981547196594e-05, + "loss": 0.011514118313789368, + "step": 124470 + }, + { + "epoch": 17.669268985095812, + "grad_norm": 6.060878276824951, + "learning_rate": 8.233839602555004e-05, + "loss": 0.03236663341522217, + "step": 124480 + }, + { + "epoch": 17.67068843151171, + "grad_norm": 0.04426760599017143, + "learning_rate": 8.233697657913413e-05, + "loss": 0.0357169508934021, + "step": 124490 + }, + { + "epoch": 17.67210787792761, + "grad_norm": 4.216787815093994, + "learning_rate": 8.233555713271825e-05, + "loss": 0.027725604176521302, + "step": 124500 + }, + { + "epoch": 17.67210787792761, + "eval_accuracy": 0.9877281108920964, + "eval_loss": 0.04588211327791214, + "eval_runtime": 33.295, + "eval_samples_per_second": 472.353, + "eval_steps_per_second": 14.777, + "step": 124500 + }, + { + "epoch": 17.673527324343507, + "grad_norm": 7.310577869415283, + "learning_rate": 8.233413768630234e-05, + "loss": 0.08030985593795777, + "step": 124510 + }, + { + "epoch": 17.674946770759405, + "grad_norm": 1.1622400283813477, + "learning_rate": 8.233271823988645e-05, + "loss": 0.0325200617313385, + "step": 124520 + }, + { + "epoch": 17.676366217175303, + "grad_norm": 0.07404623925685883, + "learning_rate": 8.233129879347055e-05, + "loss": 0.02496753931045532, + "step": 124530 + }, + { + "epoch": 17.677785663591198, + "grad_norm": 2.0904412269592285, + "learning_rate": 8.232987934705465e-05, + "loss": 0.01579650193452835, + "step": 124540 + }, + { + "epoch": 17.679205110007096, + "grad_norm": 1.241287350654602, + "learning_rate": 8.232845990063876e-05, + "loss": 0.009946402907371522, + "step": 124550 + }, + { + "epoch": 17.680624556422995, + "grad_norm": 0.16404542326927185, + "learning_rate": 8.232704045422286e-05, + "loss": 0.01308448165655136, + "step": 124560 + }, + { + "epoch": 17.682044002838893, + 
"grad_norm": 0.14319021999835968, + "learning_rate": 8.232562100780697e-05, + "loss": 0.009645262360572815, + "step": 124570 + }, + { + "epoch": 17.68346344925479, + "grad_norm": 1.4152799844741821, + "learning_rate": 8.232420156139105e-05, + "loss": 0.010371728241443634, + "step": 124580 + }, + { + "epoch": 17.68488289567069, + "grad_norm": 0.107726089656353, + "learning_rate": 8.232278211497516e-05, + "loss": 0.07219036817550659, + "step": 124590 + }, + { + "epoch": 17.686302342086588, + "grad_norm": 10.603217124938965, + "learning_rate": 8.232136266855926e-05, + "loss": 0.0411306768655777, + "step": 124600 + }, + { + "epoch": 17.687721788502483, + "grad_norm": 0.11992136389017105, + "learning_rate": 8.231994322214337e-05, + "loss": 0.033462563157081605, + "step": 124610 + }, + { + "epoch": 17.68914123491838, + "grad_norm": 0.36357948184013367, + "learning_rate": 8.231852377572747e-05, + "loss": 0.02809482216835022, + "step": 124620 + }, + { + "epoch": 17.69056068133428, + "grad_norm": 0.8509595394134521, + "learning_rate": 8.231710432931157e-05, + "loss": 0.029175931215286256, + "step": 124630 + }, + { + "epoch": 17.691980127750178, + "grad_norm": 3.2803003787994385, + "learning_rate": 8.231568488289568e-05, + "loss": 0.07031551599502564, + "step": 124640 + }, + { + "epoch": 17.693399574166076, + "grad_norm": 12.018957138061523, + "learning_rate": 8.231426543647977e-05, + "loss": 0.04732522368431091, + "step": 124650 + }, + { + "epoch": 17.694819020581974, + "grad_norm": 0.03543302044272423, + "learning_rate": 8.231284599006388e-05, + "loss": 0.02590615451335907, + "step": 124660 + }, + { + "epoch": 17.696238466997873, + "grad_norm": 0.1485099345445633, + "learning_rate": 8.231142654364798e-05, + "loss": 0.037968295812606814, + "step": 124670 + }, + { + "epoch": 17.697657913413767, + "grad_norm": 0.10532703250646591, + "learning_rate": 8.231000709723208e-05, + "loss": 0.02712967395782471, + "step": 124680 + }, + { + "epoch": 17.699077359829666, + "grad_norm": 
0.5705673098564148, + "learning_rate": 8.230858765081618e-05, + "loss": 0.03225035667419433, + "step": 124690 + }, + { + "epoch": 17.700496806245564, + "grad_norm": 0.003143586916849017, + "learning_rate": 8.230716820440029e-05, + "loss": 0.004828961193561554, + "step": 124700 + }, + { + "epoch": 17.701916252661462, + "grad_norm": 0.09341616183519363, + "learning_rate": 8.230574875798439e-05, + "loss": 0.01144537478685379, + "step": 124710 + }, + { + "epoch": 17.70333569907736, + "grad_norm": 0.6089215874671936, + "learning_rate": 8.23043293115685e-05, + "loss": 0.016727571189403535, + "step": 124720 + }, + { + "epoch": 17.70475514549326, + "grad_norm": 0.12068206071853638, + "learning_rate": 8.23029098651526e-05, + "loss": 0.029977282881736754, + "step": 124730 + }, + { + "epoch": 17.706174591909157, + "grad_norm": 0.4399046003818512, + "learning_rate": 8.230149041873669e-05, + "loss": 0.004052478447556495, + "step": 124740 + }, + { + "epoch": 17.707594038325052, + "grad_norm": 7.016625881195068, + "learning_rate": 8.23000709723208e-05, + "loss": 0.01683313101530075, + "step": 124750 + }, + { + "epoch": 17.70901348474095, + "grad_norm": 0.6942397356033325, + "learning_rate": 8.22986515259049e-05, + "loss": 0.0047518197447061535, + "step": 124760 + }, + { + "epoch": 17.71043293115685, + "grad_norm": 0.5672115683555603, + "learning_rate": 8.229723207948901e-05, + "loss": 0.030978906154632568, + "step": 124770 + }, + { + "epoch": 17.711852377572747, + "grad_norm": 0.7284126877784729, + "learning_rate": 8.229581263307311e-05, + "loss": 0.008985263854265213, + "step": 124780 + }, + { + "epoch": 17.713271823988645, + "grad_norm": 0.7015783190727234, + "learning_rate": 8.22943931866572e-05, + "loss": 0.04537283778190613, + "step": 124790 + }, + { + "epoch": 17.714691270404543, + "grad_norm": 0.14790210127830505, + "learning_rate": 8.22929737402413e-05, + "loss": 0.004269610345363617, + "step": 124800 + }, + { + "epoch": 17.71611071682044, + "grad_norm": 
7.4049224853515625, + "learning_rate": 8.229155429382541e-05, + "loss": 0.04315000772476196, + "step": 124810 + }, + { + "epoch": 17.717530163236336, + "grad_norm": 0.7216931581497192, + "learning_rate": 8.229013484740951e-05, + "loss": 0.06089695692062378, + "step": 124820 + }, + { + "epoch": 17.718949609652235, + "grad_norm": 0.060295574367046356, + "learning_rate": 8.228871540099362e-05, + "loss": 0.028363001346588135, + "step": 124830 + }, + { + "epoch": 17.720369056068133, + "grad_norm": 3.4502999782562256, + "learning_rate": 8.228729595457772e-05, + "loss": 0.021394972503185273, + "step": 124840 + }, + { + "epoch": 17.72178850248403, + "grad_norm": 0.5024992823600769, + "learning_rate": 8.228587650816182e-05, + "loss": 0.009638495743274689, + "step": 124850 + }, + { + "epoch": 17.72320794889993, + "grad_norm": 7.753618240356445, + "learning_rate": 8.228445706174593e-05, + "loss": 0.046898412704467776, + "step": 124860 + }, + { + "epoch": 17.724627395315828, + "grad_norm": 0.00778708653524518, + "learning_rate": 8.228303761533002e-05, + "loss": 0.01838609725236893, + "step": 124870 + }, + { + "epoch": 17.726046841731726, + "grad_norm": 0.9254558086395264, + "learning_rate": 8.228161816891414e-05, + "loss": 0.024858033657073973, + "step": 124880 + }, + { + "epoch": 17.72746628814762, + "grad_norm": 0.058187440037727356, + "learning_rate": 8.228019872249822e-05, + "loss": 0.010150325298309327, + "step": 124890 + }, + { + "epoch": 17.72888573456352, + "grad_norm": 5.154980659484863, + "learning_rate": 8.227877927608233e-05, + "loss": 0.06288841962814332, + "step": 124900 + }, + { + "epoch": 17.730305180979418, + "grad_norm": 0.2450360506772995, + "learning_rate": 8.227735982966643e-05, + "loss": 0.025718489289283754, + "step": 124910 + }, + { + "epoch": 17.731724627395316, + "grad_norm": 4.560354709625244, + "learning_rate": 8.227594038325054e-05, + "loss": 0.006730784475803375, + "step": 124920 + }, + { + "epoch": 17.733144073811214, + "grad_norm": 
9.024094581604004, + "learning_rate": 8.227452093683464e-05, + "loss": 0.014889387786388398, + "step": 124930 + }, + { + "epoch": 17.734563520227113, + "grad_norm": 0.312635213136673, + "learning_rate": 8.227310149041873e-05, + "loss": 0.0237678125500679, + "step": 124940 + }, + { + "epoch": 17.73598296664301, + "grad_norm": 0.08515344560146332, + "learning_rate": 8.227168204400284e-05, + "loss": 0.02553623020648956, + "step": 124950 + }, + { + "epoch": 17.737402413058906, + "grad_norm": 1.9939868450164795, + "learning_rate": 8.227026259758694e-05, + "loss": 0.012267166376113891, + "step": 124960 + }, + { + "epoch": 17.738821859474804, + "grad_norm": 0.26859158277511597, + "learning_rate": 8.226884315117105e-05, + "loss": 0.007619164884090424, + "step": 124970 + }, + { + "epoch": 17.740241305890702, + "grad_norm": 0.0797005370259285, + "learning_rate": 8.226742370475515e-05, + "loss": 0.08029309511184693, + "step": 124980 + }, + { + "epoch": 17.7416607523066, + "grad_norm": 15.179434776306152, + "learning_rate": 8.226600425833925e-05, + "loss": 0.024601057171821594, + "step": 124990 + }, + { + "epoch": 17.7430801987225, + "grad_norm": 0.025385675951838493, + "learning_rate": 8.226458481192334e-05, + "loss": 0.005561524629592895, + "step": 125000 + }, + { + "epoch": 17.7430801987225, + "eval_accuracy": 0.9884275449863292, + "eval_loss": 0.043167341500520706, + "eval_runtime": 32.2048, + "eval_samples_per_second": 488.344, + "eval_steps_per_second": 15.277, + "step": 125000 + }, + { + "epoch": 17.744499645138397, + "grad_norm": 0.022722860798239708, + "learning_rate": 8.226316536550746e-05, + "loss": 0.022749267518520355, + "step": 125010 + }, + { + "epoch": 17.745919091554295, + "grad_norm": 9.28839111328125, + "learning_rate": 8.226174591909155e-05, + "loss": 0.043350374698638915, + "step": 125020 + }, + { + "epoch": 17.74733853797019, + "grad_norm": 4.699091911315918, + "learning_rate": 8.226032647267566e-05, + "loss": 0.06598352193832398, + "step": 125030 + }, + 
{ + "epoch": 17.74875798438609, + "grad_norm": 1.6834558248519897, + "learning_rate": 8.225890702625976e-05, + "loss": 0.02005355656147003, + "step": 125040 + }, + { + "epoch": 17.750177430801987, + "grad_norm": 0.021661954000592232, + "learning_rate": 8.225748757984386e-05, + "loss": 0.05298972725868225, + "step": 125050 + }, + { + "epoch": 17.751596877217885, + "grad_norm": 0.35899174213409424, + "learning_rate": 8.225606813342797e-05, + "loss": 0.006761927902698517, + "step": 125060 + }, + { + "epoch": 17.753016323633783, + "grad_norm": 0.041905470192432404, + "learning_rate": 8.225464868701207e-05, + "loss": 0.020069000124931336, + "step": 125070 + }, + { + "epoch": 17.75443577004968, + "grad_norm": 0.7074944376945496, + "learning_rate": 8.225322924059618e-05, + "loss": 0.02227655053138733, + "step": 125080 + }, + { + "epoch": 17.75585521646558, + "grad_norm": 0.05130421370267868, + "learning_rate": 8.225180979418028e-05, + "loss": 0.01859382688999176, + "step": 125090 + }, + { + "epoch": 17.757274662881475, + "grad_norm": 5.374364376068115, + "learning_rate": 8.225039034776437e-05, + "loss": 0.010960782319307328, + "step": 125100 + }, + { + "epoch": 17.758694109297373, + "grad_norm": 0.010262245312333107, + "learning_rate": 8.224897090134847e-05, + "loss": 0.00434894859790802, + "step": 125110 + }, + { + "epoch": 17.76011355571327, + "grad_norm": 1.0669256448745728, + "learning_rate": 8.224755145493258e-05, + "loss": 0.01742391139268875, + "step": 125120 + }, + { + "epoch": 17.76153300212917, + "grad_norm": 0.36459091305732727, + "learning_rate": 8.224613200851669e-05, + "loss": 0.013276790082454682, + "step": 125130 + }, + { + "epoch": 17.762952448545068, + "grad_norm": 0.09000733494758606, + "learning_rate": 8.224471256210079e-05, + "loss": 0.001743781939148903, + "step": 125140 + }, + { + "epoch": 17.764371894960966, + "grad_norm": 0.1272316426038742, + "learning_rate": 8.224329311568489e-05, + "loss": 0.01023392528295517, + "step": 125150 + }, + { + 
"epoch": 17.765791341376865, + "grad_norm": 1.799155354499817, + "learning_rate": 8.224187366926898e-05, + "loss": 0.0067228637635707855, + "step": 125160 + }, + { + "epoch": 17.76721078779276, + "grad_norm": 0.9365578889846802, + "learning_rate": 8.22404542228531e-05, + "loss": 0.008356766402721405, + "step": 125170 + }, + { + "epoch": 17.768630234208658, + "grad_norm": 0.6004683375358582, + "learning_rate": 8.223903477643719e-05, + "loss": 0.014860156178474426, + "step": 125180 + }, + { + "epoch": 17.770049680624556, + "grad_norm": 0.9573814868927002, + "learning_rate": 8.22376153300213e-05, + "loss": 0.01890397071838379, + "step": 125190 + }, + { + "epoch": 17.771469127040454, + "grad_norm": 0.2591208815574646, + "learning_rate": 8.223619588360539e-05, + "loss": 0.007004987448453903, + "step": 125200 + }, + { + "epoch": 17.772888573456353, + "grad_norm": 0.821864664554596, + "learning_rate": 8.22347764371895e-05, + "loss": 0.012261110544204711, + "step": 125210 + }, + { + "epoch": 17.77430801987225, + "grad_norm": 0.10942957550287247, + "learning_rate": 8.223335699077361e-05, + "loss": 0.05162022709846496, + "step": 125220 + }, + { + "epoch": 17.77572746628815, + "grad_norm": 0.10782337933778763, + "learning_rate": 8.22319375443577e-05, + "loss": 0.018732479214668273, + "step": 125230 + }, + { + "epoch": 17.777146912704044, + "grad_norm": 0.04795246571302414, + "learning_rate": 8.223051809794182e-05, + "loss": 0.0030033662915229797, + "step": 125240 + }, + { + "epoch": 17.778566359119942, + "grad_norm": 0.05053180456161499, + "learning_rate": 8.22290986515259e-05, + "loss": 0.04560690820217132, + "step": 125250 + }, + { + "epoch": 17.77998580553584, + "grad_norm": 0.10530559718608856, + "learning_rate": 8.222767920511001e-05, + "loss": 0.005130567401647568, + "step": 125260 + }, + { + "epoch": 17.78140525195174, + "grad_norm": 0.23054805397987366, + "learning_rate": 8.222625975869411e-05, + "loss": 0.012102346122264861, + "step": 125270 + }, + { + "epoch": 
17.782824698367637, + "grad_norm": 0.09214671701192856, + "learning_rate": 8.222484031227822e-05, + "loss": 0.03815768957138062, + "step": 125280 + }, + { + "epoch": 17.784244144783536, + "grad_norm": 0.05258322134613991, + "learning_rate": 8.222342086586232e-05, + "loss": 0.014068126678466797, + "step": 125290 + }, + { + "epoch": 17.785663591199434, + "grad_norm": 7.656113624572754, + "learning_rate": 8.222200141944642e-05, + "loss": 0.019538348913192748, + "step": 125300 + }, + { + "epoch": 17.78708303761533, + "grad_norm": 0.06774721294641495, + "learning_rate": 8.222058197303053e-05, + "loss": 0.027745184302330018, + "step": 125310 + }, + { + "epoch": 17.788502484031227, + "grad_norm": 1.0145679712295532, + "learning_rate": 8.221916252661462e-05, + "loss": 0.030012327432632446, + "step": 125320 + }, + { + "epoch": 17.789921930447125, + "grad_norm": 0.03959951922297478, + "learning_rate": 8.221774308019873e-05, + "loss": 0.01319495439529419, + "step": 125330 + }, + { + "epoch": 17.791341376863024, + "grad_norm": 0.01071132905781269, + "learning_rate": 8.221632363378283e-05, + "loss": 0.02355894446372986, + "step": 125340 + }, + { + "epoch": 17.792760823278922, + "grad_norm": 4.440755844116211, + "learning_rate": 8.221490418736693e-05, + "loss": 0.008524559438228607, + "step": 125350 + }, + { + "epoch": 17.79418026969482, + "grad_norm": 0.028077952563762665, + "learning_rate": 8.221348474095103e-05, + "loss": 0.051354610919952394, + "step": 125360 + }, + { + "epoch": 17.79559971611072, + "grad_norm": 0.19310365617275238, + "learning_rate": 8.221206529453514e-05, + "loss": 0.016239266097545623, + "step": 125370 + }, + { + "epoch": 17.797019162526613, + "grad_norm": 0.8175052404403687, + "learning_rate": 8.221064584811923e-05, + "loss": 0.004264084994792939, + "step": 125380 + }, + { + "epoch": 17.79843860894251, + "grad_norm": 6.119190692901611, + "learning_rate": 8.220922640170335e-05, + "loss": 0.04939507246017456, + "step": 125390 + }, + { + "epoch": 
17.79985805535841, + "grad_norm": 3.670011043548584, + "learning_rate": 8.220780695528744e-05, + "loss": 0.016700688004493713, + "step": 125400 + }, + { + "epoch": 17.801277501774308, + "grad_norm": 10.37806224822998, + "learning_rate": 8.220638750887154e-05, + "loss": 0.06917227506637573, + "step": 125410 + }, + { + "epoch": 17.802696948190206, + "grad_norm": 0.466407835483551, + "learning_rate": 8.220496806245565e-05, + "loss": 0.0451697438955307, + "step": 125420 + }, + { + "epoch": 17.804116394606105, + "grad_norm": 0.18278907239437103, + "learning_rate": 8.220354861603975e-05, + "loss": 0.0026770364493131638, + "step": 125430 + }, + { + "epoch": 17.805535841022003, + "grad_norm": 0.26450440287590027, + "learning_rate": 8.220212916962386e-05, + "loss": 0.02100861966609955, + "step": 125440 + }, + { + "epoch": 17.806955287437898, + "grad_norm": 0.026414619758725166, + "learning_rate": 8.220070972320796e-05, + "loss": 0.018268810212612153, + "step": 125450 + }, + { + "epoch": 17.808374733853796, + "grad_norm": 0.05897356569766998, + "learning_rate": 8.219929027679205e-05, + "loss": 0.003758923336863518, + "step": 125460 + }, + { + "epoch": 17.809794180269694, + "grad_norm": 0.001893079956062138, + "learning_rate": 8.219787083037615e-05, + "loss": 0.005195864289999008, + "step": 125470 + }, + { + "epoch": 17.811213626685593, + "grad_norm": 10.834534645080566, + "learning_rate": 8.219645138396026e-05, + "loss": 0.012825360894203186, + "step": 125480 + }, + { + "epoch": 17.81263307310149, + "grad_norm": 0.10624898970127106, + "learning_rate": 8.219503193754436e-05, + "loss": 0.0045176450163125995, + "step": 125490 + }, + { + "epoch": 17.81405251951739, + "grad_norm": 0.6353622674942017, + "learning_rate": 8.219361249112847e-05, + "loss": 0.04030350148677826, + "step": 125500 + }, + { + "epoch": 17.81405251951739, + "eval_accuracy": 0.9826413174794939, + "eval_loss": 0.06225878372788429, + "eval_runtime": 32.1976, + "eval_samples_per_second": 488.453, + 
"eval_steps_per_second": 15.281, + "step": 125500 + }, + { + "epoch": 17.815471965933288, + "grad_norm": 9.062827110290527, + "learning_rate": 8.219219304471257e-05, + "loss": 0.05064420104026794, + "step": 125510 + }, + { + "epoch": 17.816891412349182, + "grad_norm": 0.1245955228805542, + "learning_rate": 8.219077359829667e-05, + "loss": 0.04497800469398498, + "step": 125520 + }, + { + "epoch": 17.81831085876508, + "grad_norm": 0.023755589500069618, + "learning_rate": 8.218935415188078e-05, + "loss": 0.011510214954614639, + "step": 125530 + }, + { + "epoch": 17.81973030518098, + "grad_norm": 0.008768800646066666, + "learning_rate": 8.218793470546487e-05, + "loss": 0.007446672022342682, + "step": 125540 + }, + { + "epoch": 17.821149751596877, + "grad_norm": 12.563655853271484, + "learning_rate": 8.218651525904899e-05, + "loss": 0.01588403284549713, + "step": 125550 + }, + { + "epoch": 17.822569198012776, + "grad_norm": 1.1750997304916382, + "learning_rate": 8.218509581263307e-05, + "loss": 0.004291301220655441, + "step": 125560 + }, + { + "epoch": 17.823988644428674, + "grad_norm": 0.2507944405078888, + "learning_rate": 8.218367636621718e-05, + "loss": 0.01853466182947159, + "step": 125570 + }, + { + "epoch": 17.825408090844572, + "grad_norm": 1.2327880859375, + "learning_rate": 8.218225691980128e-05, + "loss": 0.05478519797325134, + "step": 125580 + }, + { + "epoch": 17.826827537260467, + "grad_norm": 8.883460998535156, + "learning_rate": 8.218083747338539e-05, + "loss": 0.0659081757068634, + "step": 125590 + }, + { + "epoch": 17.828246983676365, + "grad_norm": 2.973017930984497, + "learning_rate": 8.217941802696949e-05, + "loss": 0.02106604874134064, + "step": 125600 + }, + { + "epoch": 17.829666430092264, + "grad_norm": 0.6707797050476074, + "learning_rate": 8.217799858055358e-05, + "loss": 0.0067960724234580995, + "step": 125610 + }, + { + "epoch": 17.831085876508162, + "grad_norm": 10.60904598236084, + "learning_rate": 8.21765791341377e-05, + "loss": 
0.03403809368610382, + "step": 125620 + }, + { + "epoch": 17.83250532292406, + "grad_norm": 0.2786976993083954, + "learning_rate": 8.217515968772179e-05, + "loss": 0.022289738059043884, + "step": 125630 + }, + { + "epoch": 17.83392476933996, + "grad_norm": 5.2510600090026855, + "learning_rate": 8.21737402413059e-05, + "loss": 0.05297409296035767, + "step": 125640 + }, + { + "epoch": 17.835344215755857, + "grad_norm": 0.7457426190376282, + "learning_rate": 8.217232079489e-05, + "loss": 0.03707561790943146, + "step": 125650 + }, + { + "epoch": 17.83676366217175, + "grad_norm": 0.5260939598083496, + "learning_rate": 8.21709013484741e-05, + "loss": 0.0180939644575119, + "step": 125660 + }, + { + "epoch": 17.83818310858765, + "grad_norm": 2.1039488315582275, + "learning_rate": 8.21694819020582e-05, + "loss": 0.012895692884922028, + "step": 125670 + }, + { + "epoch": 17.839602555003548, + "grad_norm": 0.09327096492052078, + "learning_rate": 8.21680624556423e-05, + "loss": 0.02153548151254654, + "step": 125680 + }, + { + "epoch": 17.841022001419446, + "grad_norm": 0.023013439029455185, + "learning_rate": 8.21666430092264e-05, + "loss": 0.039509478211402896, + "step": 125690 + }, + { + "epoch": 17.842441447835345, + "grad_norm": 0.023209473118185997, + "learning_rate": 8.216522356281051e-05, + "loss": 0.02362530380487442, + "step": 125700 + }, + { + "epoch": 17.843860894251243, + "grad_norm": 17.44260025024414, + "learning_rate": 8.216380411639461e-05, + "loss": 0.05340635180473328, + "step": 125710 + }, + { + "epoch": 17.84528034066714, + "grad_norm": 0.31442582607269287, + "learning_rate": 8.216238466997871e-05, + "loss": 0.007146529853343964, + "step": 125720 + }, + { + "epoch": 17.846699787083036, + "grad_norm": 6.465976715087891, + "learning_rate": 8.216096522356282e-05, + "loss": 0.023551468551158906, + "step": 125730 + }, + { + "epoch": 17.848119233498934, + "grad_norm": 1.7314174175262451, + "learning_rate": 8.215954577714692e-05, + "loss": 0.021711570024490357, + 
"step": 125740 + }, + { + "epoch": 17.849538679914833, + "grad_norm": 0.35447877645492554, + "learning_rate": 8.215812633073103e-05, + "loss": 0.04752359390258789, + "step": 125750 + }, + { + "epoch": 17.85095812633073, + "grad_norm": 0.16713052988052368, + "learning_rate": 8.215670688431511e-05, + "loss": 0.0385821133852005, + "step": 125760 + }, + { + "epoch": 17.85237757274663, + "grad_norm": 2.4941494464874268, + "learning_rate": 8.215528743789922e-05, + "loss": 0.061608928442001346, + "step": 125770 + }, + { + "epoch": 17.853797019162528, + "grad_norm": 3.051335096359253, + "learning_rate": 8.215386799148332e-05, + "loss": 0.04033620953559876, + "step": 125780 + }, + { + "epoch": 17.855216465578426, + "grad_norm": 4.2297210693359375, + "learning_rate": 8.215244854506743e-05, + "loss": 0.02570372223854065, + "step": 125790 + }, + { + "epoch": 17.85663591199432, + "grad_norm": 0.06211649626493454, + "learning_rate": 8.215102909865153e-05, + "loss": 0.022726473212242127, + "step": 125800 + }, + { + "epoch": 17.85805535841022, + "grad_norm": 5.127074241638184, + "learning_rate": 8.214960965223564e-05, + "loss": 0.012352780997753143, + "step": 125810 + }, + { + "epoch": 17.859474804826117, + "grad_norm": 6.0037713050842285, + "learning_rate": 8.214819020581974e-05, + "loss": 0.027094042301177977, + "step": 125820 + }, + { + "epoch": 17.860894251242016, + "grad_norm": 0.09564322978258133, + "learning_rate": 8.214677075940383e-05, + "loss": 0.014927592873573304, + "step": 125830 + }, + { + "epoch": 17.862313697657914, + "grad_norm": 3.6967010498046875, + "learning_rate": 8.214535131298794e-05, + "loss": 0.007258567214012146, + "step": 125840 + }, + { + "epoch": 17.863733144073812, + "grad_norm": 8.384481430053711, + "learning_rate": 8.214393186657204e-05, + "loss": 0.020434218645095825, + "step": 125850 + }, + { + "epoch": 17.86515259048971, + "grad_norm": 0.2185468077659607, + "learning_rate": 8.214251242015615e-05, + "loss": 0.00585351549088955, + "step": 125860 + 
}, + { + "epoch": 17.866572036905605, + "grad_norm": 0.16802102327346802, + "learning_rate": 8.214109297374024e-05, + "loss": 0.04269077479839325, + "step": 125870 + }, + { + "epoch": 17.867991483321504, + "grad_norm": 0.6906107068061829, + "learning_rate": 8.213967352732435e-05, + "loss": 0.010298679769039153, + "step": 125880 + }, + { + "epoch": 17.869410929737402, + "grad_norm": 2.713566541671753, + "learning_rate": 8.213825408090844e-05, + "loss": 0.026750019192695616, + "step": 125890 + }, + { + "epoch": 17.8708303761533, + "grad_norm": 9.485734939575195, + "learning_rate": 8.213683463449256e-05, + "loss": 0.006816279888153076, + "step": 125900 + }, + { + "epoch": 17.8722498225692, + "grad_norm": 0.0062073878943920135, + "learning_rate": 8.213541518807665e-05, + "loss": 0.0393365740776062, + "step": 125910 + }, + { + "epoch": 17.873669268985097, + "grad_norm": 0.17222857475280762, + "learning_rate": 8.213399574166075e-05, + "loss": 0.02054024189710617, + "step": 125920 + }, + { + "epoch": 17.875088715400995, + "grad_norm": 0.05197976902127266, + "learning_rate": 8.213257629524486e-05, + "loss": 0.04310756027698517, + "step": 125930 + }, + { + "epoch": 17.87650816181689, + "grad_norm": 0.09764538705348969, + "learning_rate": 8.213115684882896e-05, + "loss": 0.054064315557479856, + "step": 125940 + }, + { + "epoch": 17.87792760823279, + "grad_norm": 0.07554052025079727, + "learning_rate": 8.212973740241307e-05, + "loss": 0.025239002704620362, + "step": 125950 + }, + { + "epoch": 17.879347054648687, + "grad_norm": 0.349587082862854, + "learning_rate": 8.212831795599717e-05, + "loss": 0.007651855796575546, + "step": 125960 + }, + { + "epoch": 17.880766501064585, + "grad_norm": 13.251055717468262, + "learning_rate": 8.212689850958126e-05, + "loss": 0.06340181827545166, + "step": 125970 + }, + { + "epoch": 17.882185947480483, + "grad_norm": 0.27692946791648865, + "learning_rate": 8.212547906316536e-05, + "loss": 0.00708906427025795, + "step": 125980 + }, + { + 
"epoch": 17.88360539389638, + "grad_norm": 0.04449354112148285, + "learning_rate": 8.212405961674947e-05, + "loss": 0.01091306209564209, + "step": 125990 + }, + { + "epoch": 17.88502484031228, + "grad_norm": 0.03929013013839722, + "learning_rate": 8.212264017033357e-05, + "loss": 0.01091454029083252, + "step": 126000 + }, + { + "epoch": 17.88502484031228, + "eval_accuracy": 0.9870922617155211, + "eval_loss": 0.05081784725189209, + "eval_runtime": 31.9763, + "eval_samples_per_second": 491.833, + "eval_steps_per_second": 15.386, + "step": 126000 + }, + { + "epoch": 17.886444286728175, + "grad_norm": 0.03523946925997734, + "learning_rate": 8.212122072391768e-05, + "loss": 0.01761469542980194, + "step": 126010 + }, + { + "epoch": 17.887863733144073, + "grad_norm": 0.3471955955028534, + "learning_rate": 8.211980127750178e-05, + "loss": 0.02797209918498993, + "step": 126020 + }, + { + "epoch": 17.88928317955997, + "grad_norm": 0.18042099475860596, + "learning_rate": 8.211838183108588e-05, + "loss": 0.024269339442253113, + "step": 126030 + }, + { + "epoch": 17.89070262597587, + "grad_norm": 3.176307201385498, + "learning_rate": 8.211696238466999e-05, + "loss": 0.018799322843551635, + "step": 126040 + }, + { + "epoch": 17.892122072391768, + "grad_norm": 0.6055776476860046, + "learning_rate": 8.211554293825408e-05, + "loss": 0.048582276701927184, + "step": 126050 + }, + { + "epoch": 17.893541518807666, + "grad_norm": 8.118757247924805, + "learning_rate": 8.21141234918382e-05, + "loss": 0.045041635632514954, + "step": 126060 + }, + { + "epoch": 17.894960965223564, + "grad_norm": 0.2706262767314911, + "learning_rate": 8.211270404542228e-05, + "loss": 0.011531689763069152, + "step": 126070 + }, + { + "epoch": 17.89638041163946, + "grad_norm": 0.02398722618818283, + "learning_rate": 8.211128459900639e-05, + "loss": 0.011123070120811462, + "step": 126080 + }, + { + "epoch": 17.897799858055357, + "grad_norm": 0.4285498261451721, + "learning_rate": 8.210986515259049e-05, + "loss": 
0.011589570343494416, + "step": 126090 + }, + { + "epoch": 17.899219304471256, + "grad_norm": 0.03318289667367935, + "learning_rate": 8.21084457061746e-05, + "loss": 0.02185286730527878, + "step": 126100 + }, + { + "epoch": 17.900638750887154, + "grad_norm": 2.2753231525421143, + "learning_rate": 8.21070262597587e-05, + "loss": 0.04003996551036835, + "step": 126110 + }, + { + "epoch": 17.902058197303052, + "grad_norm": 1.4221254587173462, + "learning_rate": 8.210560681334279e-05, + "loss": 0.04174538254737854, + "step": 126120 + }, + { + "epoch": 17.90347764371895, + "grad_norm": 0.8516020178794861, + "learning_rate": 8.21041873669269e-05, + "loss": 0.03194279670715332, + "step": 126130 + }, + { + "epoch": 17.90489709013485, + "grad_norm": 0.3246018588542938, + "learning_rate": 8.2102767920511e-05, + "loss": 0.05795959234237671, + "step": 126140 + }, + { + "epoch": 17.906316536550744, + "grad_norm": 1.9304618835449219, + "learning_rate": 8.210134847409511e-05, + "loss": 0.06401010155677796, + "step": 126150 + }, + { + "epoch": 17.907735982966642, + "grad_norm": 0.311942994594574, + "learning_rate": 8.209992902767921e-05, + "loss": 0.005649371817708015, + "step": 126160 + }, + { + "epoch": 17.90915542938254, + "grad_norm": 0.02588566765189171, + "learning_rate": 8.209850958126332e-05, + "loss": 0.0437458336353302, + "step": 126170 + }, + { + "epoch": 17.91057487579844, + "grad_norm": 0.28646552562713623, + "learning_rate": 8.20970901348474e-05, + "loss": 0.003959463909268379, + "step": 126180 + }, + { + "epoch": 17.911994322214337, + "grad_norm": 0.2520774304866791, + "learning_rate": 8.209567068843152e-05, + "loss": 0.00396527536213398, + "step": 126190 + }, + { + "epoch": 17.913413768630235, + "grad_norm": 0.05746285244822502, + "learning_rate": 8.209425124201561e-05, + "loss": 0.020936407148838043, + "step": 126200 + }, + { + "epoch": 17.914833215046134, + "grad_norm": 1.556191325187683, + "learning_rate": 8.209283179559972e-05, + "loss": 0.0039587758481502535, + 
"step": 126210 + }, + { + "epoch": 17.91625266146203, + "grad_norm": 11.09601879119873, + "learning_rate": 8.209141234918382e-05, + "loss": 0.0388120174407959, + "step": 126220 + }, + { + "epoch": 17.917672107877927, + "grad_norm": 12.754619598388672, + "learning_rate": 8.208999290276792e-05, + "loss": 0.031015089154243468, + "step": 126230 + }, + { + "epoch": 17.919091554293825, + "grad_norm": 0.07646643370389938, + "learning_rate": 8.208857345635203e-05, + "loss": 0.029643809795379637, + "step": 126240 + }, + { + "epoch": 17.920511000709723, + "grad_norm": 0.5458995699882507, + "learning_rate": 8.208715400993613e-05, + "loss": 0.01911151260137558, + "step": 126250 + }, + { + "epoch": 17.92193044712562, + "grad_norm": 0.9907100796699524, + "learning_rate": 8.208573456352024e-05, + "loss": 0.006705816090106964, + "step": 126260 + }, + { + "epoch": 17.92334989354152, + "grad_norm": 0.5216385722160339, + "learning_rate": 8.208431511710433e-05, + "loss": 0.013103863596916199, + "step": 126270 + }, + { + "epoch": 17.924769339957418, + "grad_norm": 2.8902833461761475, + "learning_rate": 8.208289567068843e-05, + "loss": 0.03189074695110321, + "step": 126280 + }, + { + "epoch": 17.926188786373313, + "grad_norm": 12.639440536499023, + "learning_rate": 8.208147622427253e-05, + "loss": 0.02160928547382355, + "step": 126290 + }, + { + "epoch": 17.92760823278921, + "grad_norm": 0.10358481109142303, + "learning_rate": 8.208005677785664e-05, + "loss": 0.015329189598560333, + "step": 126300 + }, + { + "epoch": 17.92902767920511, + "grad_norm": 1.279746413230896, + "learning_rate": 8.207863733144074e-05, + "loss": 0.030715879797935487, + "step": 126310 + }, + { + "epoch": 17.930447125621008, + "grad_norm": 0.05002136528491974, + "learning_rate": 8.207721788502485e-05, + "loss": 0.051266276836395265, + "step": 126320 + }, + { + "epoch": 17.931866572036906, + "grad_norm": 6.9989776611328125, + "learning_rate": 8.207579843860895e-05, + "loss": 0.050137877464294434, + "step": 126330 + 
}, + { + "epoch": 17.933286018452804, + "grad_norm": 0.1464066207408905, + "learning_rate": 8.207437899219304e-05, + "loss": 0.03081599771976471, + "step": 126340 + }, + { + "epoch": 17.934705464868703, + "grad_norm": 0.022758029401302338, + "learning_rate": 8.207295954577715e-05, + "loss": 0.0385101467370987, + "step": 126350 + }, + { + "epoch": 17.936124911284598, + "grad_norm": 2.994476079940796, + "learning_rate": 8.207154009936125e-05, + "loss": 0.04679540097713471, + "step": 126360 + }, + { + "epoch": 17.937544357700496, + "grad_norm": 0.03078029491007328, + "learning_rate": 8.207012065294536e-05, + "loss": 0.01109546199440956, + "step": 126370 + }, + { + "epoch": 17.938963804116394, + "grad_norm": 3.0478312969207764, + "learning_rate": 8.206870120652945e-05, + "loss": 0.019247525930404664, + "step": 126380 + }, + { + "epoch": 17.940383250532292, + "grad_norm": 0.9252429604530334, + "learning_rate": 8.206728176011356e-05, + "loss": 0.08226449489593506, + "step": 126390 + }, + { + "epoch": 17.94180269694819, + "grad_norm": 0.025419319048523903, + "learning_rate": 8.206586231369766e-05, + "loss": 0.007851970195770264, + "step": 126400 + }, + { + "epoch": 17.94322214336409, + "grad_norm": 7.943525314331055, + "learning_rate": 8.206444286728177e-05, + "loss": 0.037611573934555054, + "step": 126410 + }, + { + "epoch": 17.944641589779987, + "grad_norm": 0.060102060437202454, + "learning_rate": 8.206302342086586e-05, + "loss": 0.027545714378356935, + "step": 126420 + }, + { + "epoch": 17.946061036195882, + "grad_norm": 12.149624824523926, + "learning_rate": 8.206160397444996e-05, + "loss": 0.029793751239776612, + "step": 126430 + }, + { + "epoch": 17.94748048261178, + "grad_norm": 0.3006502687931061, + "learning_rate": 8.206018452803407e-05, + "loss": 0.012186054885387421, + "step": 126440 + }, + { + "epoch": 17.94889992902768, + "grad_norm": 0.08010541647672653, + "learning_rate": 8.205876508161817e-05, + "loss": 0.002343623712658882, + "step": 126450 + }, + { + 
"epoch": 17.950319375443577, + "grad_norm": 0.19521616399288177, + "learning_rate": 8.205734563520228e-05, + "loss": 0.011480587720870971, + "step": 126460 + }, + { + "epoch": 17.951738821859475, + "grad_norm": 0.05199567973613739, + "learning_rate": 8.205592618878638e-05, + "loss": 0.01906854808330536, + "step": 126470 + }, + { + "epoch": 17.953158268275374, + "grad_norm": 0.6597695350646973, + "learning_rate": 8.205450674237049e-05, + "loss": 0.0264710009098053, + "step": 126480 + }, + { + "epoch": 17.954577714691272, + "grad_norm": 0.0455927774310112, + "learning_rate": 8.205308729595457e-05, + "loss": 0.006426760554313659, + "step": 126490 + }, + { + "epoch": 17.955997161107167, + "grad_norm": 2.061488389968872, + "learning_rate": 8.205166784953868e-05, + "loss": 0.02843399941921234, + "step": 126500 + }, + { + "epoch": 17.955997161107167, + "eval_accuracy": 0.9857569784447129, + "eval_loss": 0.05199264734983444, + "eval_runtime": 32.8178, + "eval_samples_per_second": 479.221, + "eval_steps_per_second": 14.992, + "step": 126500 + }, + { + "epoch": 17.957416607523065, + "grad_norm": 0.0125628262758255, + "learning_rate": 8.205024840312278e-05, + "loss": 0.012261651456356049, + "step": 126510 + }, + { + "epoch": 17.958836053938963, + "grad_norm": 19.391925811767578, + "learning_rate": 8.204882895670689e-05, + "loss": 0.014076711237430572, + "step": 126520 + }, + { + "epoch": 17.96025550035486, + "grad_norm": 0.007833714596927166, + "learning_rate": 8.2047409510291e-05, + "loss": 0.039005106687545775, + "step": 126530 + }, + { + "epoch": 17.96167494677076, + "grad_norm": 14.843253135681152, + "learning_rate": 8.204599006387509e-05, + "loss": 0.03159629106521607, + "step": 126540 + }, + { + "epoch": 17.96309439318666, + "grad_norm": 0.6994040012359619, + "learning_rate": 8.20445706174592e-05, + "loss": 0.034837445616722106, + "step": 126550 + }, + { + "epoch": 17.964513839602557, + "grad_norm": 2.879476308822632, + "learning_rate": 8.20431511710433e-05, + "loss": 
0.034412276744842527, + "step": 126560 + }, + { + "epoch": 17.96593328601845, + "grad_norm": 10.790650367736816, + "learning_rate": 8.20417317246274e-05, + "loss": 0.02943170666694641, + "step": 126570 + }, + { + "epoch": 17.96735273243435, + "grad_norm": 0.146264910697937, + "learning_rate": 8.20403122782115e-05, + "loss": 0.02726702392101288, + "step": 126580 + }, + { + "epoch": 17.968772178850248, + "grad_norm": 0.09430106729269028, + "learning_rate": 8.20388928317956e-05, + "loss": 0.07335036993026733, + "step": 126590 + }, + { + "epoch": 17.970191625266146, + "grad_norm": 3.3713955879211426, + "learning_rate": 8.20374733853797e-05, + "loss": 0.013988673686981201, + "step": 126600 + }, + { + "epoch": 17.971611071682045, + "grad_norm": 0.6408278346061707, + "learning_rate": 8.203605393896381e-05, + "loss": 0.028031525015830994, + "step": 126610 + }, + { + "epoch": 17.973030518097943, + "grad_norm": 0.8432971239089966, + "learning_rate": 8.203463449254792e-05, + "loss": 0.01225607916712761, + "step": 126620 + }, + { + "epoch": 17.97444996451384, + "grad_norm": 0.672948956489563, + "learning_rate": 8.203321504613202e-05, + "loss": 0.04240490198135376, + "step": 126630 + }, + { + "epoch": 17.975869410929736, + "grad_norm": 0.4233643412590027, + "learning_rate": 8.203179559971611e-05, + "loss": 0.012204398214817048, + "step": 126640 + }, + { + "epoch": 17.977288857345634, + "grad_norm": 0.7830970883369446, + "learning_rate": 8.203037615330021e-05, + "loss": 0.004717732965946198, + "step": 126650 + }, + { + "epoch": 17.978708303761533, + "grad_norm": 1.6771160364151, + "learning_rate": 8.202895670688432e-05, + "loss": 0.02526704967021942, + "step": 126660 + }, + { + "epoch": 17.98012775017743, + "grad_norm": 1.994340181350708, + "learning_rate": 8.202753726046842e-05, + "loss": 0.019393594563007356, + "step": 126670 + }, + { + "epoch": 17.98154719659333, + "grad_norm": 0.24979780614376068, + "learning_rate": 8.202611781405253e-05, + "loss": 0.026185110211372375, + 
"step": 126680 + }, + { + "epoch": 17.982966643009227, + "grad_norm": 5.780867099761963, + "learning_rate": 8.202469836763661e-05, + "loss": 0.03918190896511078, + "step": 126690 + }, + { + "epoch": 17.984386089425126, + "grad_norm": 0.01048432756215334, + "learning_rate": 8.202327892122073e-05, + "loss": 0.015203535556793213, + "step": 126700 + }, + { + "epoch": 17.98580553584102, + "grad_norm": 2.775602102279663, + "learning_rate": 8.202185947480484e-05, + "loss": 0.0074320010840892795, + "step": 126710 + }, + { + "epoch": 17.98722498225692, + "grad_norm": 7.469266414642334, + "learning_rate": 8.202044002838893e-05, + "loss": 0.012325166165828705, + "step": 126720 + }, + { + "epoch": 17.988644428672817, + "grad_norm": 0.059806808829307556, + "learning_rate": 8.201902058197304e-05, + "loss": 0.02976747751235962, + "step": 126730 + }, + { + "epoch": 17.990063875088715, + "grad_norm": 0.03276080638170242, + "learning_rate": 8.201760113555713e-05, + "loss": 0.060982465744018555, + "step": 126740 + }, + { + "epoch": 17.991483321504614, + "grad_norm": 4.326698303222656, + "learning_rate": 8.201618168914124e-05, + "loss": 0.011280091106891632, + "step": 126750 + }, + { + "epoch": 17.992902767920512, + "grad_norm": 1.296022891998291, + "learning_rate": 8.201476224272534e-05, + "loss": 0.018512937426567077, + "step": 126760 + }, + { + "epoch": 17.99432221433641, + "grad_norm": 0.13187599182128906, + "learning_rate": 8.201348474095104e-05, + "loss": 0.036490410566329956, + "step": 126770 + }, + { + "epoch": 17.995741660752305, + "grad_norm": 0.8004917502403259, + "learning_rate": 8.201206529453513e-05, + "loss": 0.05194540023803711, + "step": 126780 + }, + { + "epoch": 17.997161107168203, + "grad_norm": 2.241412401199341, + "learning_rate": 8.201064584811924e-05, + "loss": 0.042962107062339785, + "step": 126790 + }, + { + "epoch": 17.9985805535841, + "grad_norm": 3.015472650527954, + "learning_rate": 8.200922640170334e-05, + "loss": 0.01860196590423584, + "step": 126800 + 
}, + { + "epoch": 18.0, + "grad_norm": 6.105637073516846, + "learning_rate": 8.200780695528745e-05, + "loss": 0.045799678564071654, + "step": 126810 + }, + { + "epoch": 18.0014194464159, + "grad_norm": 0.3411359488964081, + "learning_rate": 8.200638750887154e-05, + "loss": 0.022154295444488527, + "step": 126820 + }, + { + "epoch": 18.002838892831797, + "grad_norm": 3.1426784992218018, + "learning_rate": 8.200496806245565e-05, + "loss": 0.008062352985143661, + "step": 126830 + }, + { + "epoch": 18.004258339247695, + "grad_norm": 0.09681227803230286, + "learning_rate": 8.200354861603974e-05, + "loss": 0.050596457719802854, + "step": 126840 + }, + { + "epoch": 18.00567778566359, + "grad_norm": 0.666832685470581, + "learning_rate": 8.200212916962385e-05, + "loss": 0.013695698976516724, + "step": 126850 + }, + { + "epoch": 18.007097232079488, + "grad_norm": 0.31903573870658875, + "learning_rate": 8.200070972320795e-05, + "loss": 0.0022612977772951126, + "step": 126860 + }, + { + "epoch": 18.008516678495386, + "grad_norm": 0.0052805026061832905, + "learning_rate": 8.199929027679205e-05, + "loss": 0.027579629421234132, + "step": 126870 + }, + { + "epoch": 18.009936124911285, + "grad_norm": 0.006886809598654509, + "learning_rate": 8.199787083037616e-05, + "loss": 0.041148635745048526, + "step": 126880 + }, + { + "epoch": 18.011355571327183, + "grad_norm": 0.40197938680648804, + "learning_rate": 8.199645138396026e-05, + "loss": 0.023601478338241576, + "step": 126890 + }, + { + "epoch": 18.01277501774308, + "grad_norm": 0.012682395055890083, + "learning_rate": 8.199503193754437e-05, + "loss": 0.002660195529460907, + "step": 126900 + }, + { + "epoch": 18.01419446415898, + "grad_norm": 0.38907021284103394, + "learning_rate": 8.199361249112847e-05, + "loss": 0.005682775005698204, + "step": 126910 + }, + { + "epoch": 18.015613910574874, + "grad_norm": 2.3589015007019043, + "learning_rate": 8.199219304471256e-05, + "loss": 0.016606913506984712, + "step": 126920 + }, + { + 
"epoch": 18.017033356990773, + "grad_norm": 0.516564667224884, + "learning_rate": 8.199077359829666e-05, + "loss": 0.0015501592308282853, + "step": 126930 + }, + { + "epoch": 18.01845280340667, + "grad_norm": 0.02881469391286373, + "learning_rate": 8.198935415188077e-05, + "loss": 0.01067756861448288, + "step": 126940 + }, + { + "epoch": 18.01987224982257, + "grad_norm": 0.00568376574665308, + "learning_rate": 8.198793470546487e-05, + "loss": 0.041530221700668335, + "step": 126950 + }, + { + "epoch": 18.021291696238467, + "grad_norm": 0.17403747141361237, + "learning_rate": 8.198651525904898e-05, + "loss": 0.008302728086709977, + "step": 126960 + }, + { + "epoch": 18.022711142654366, + "grad_norm": 7.299103260040283, + "learning_rate": 8.198509581263308e-05, + "loss": 0.01957940012216568, + "step": 126970 + }, + { + "epoch": 18.024130589070264, + "grad_norm": 0.09948822855949402, + "learning_rate": 8.198367636621717e-05, + "loss": 0.011512156575918198, + "step": 126980 + }, + { + "epoch": 18.02555003548616, + "grad_norm": 0.20196844637393951, + "learning_rate": 8.198225691980129e-05, + "loss": 0.005794602632522583, + "step": 126990 + }, + { + "epoch": 18.026969481902057, + "grad_norm": 0.057056453078985214, + "learning_rate": 8.198083747338538e-05, + "loss": 0.009519323706626892, + "step": 127000 + }, + { + "epoch": 18.026969481902057, + "eval_accuracy": 0.9840401856679596, + "eval_loss": 0.05799579620361328, + "eval_runtime": 33.0898, + "eval_samples_per_second": 475.283, + "eval_steps_per_second": 14.869, + "step": 127000 + }, + { + "epoch": 18.028388928317955, + "grad_norm": 5.088464736938477, + "learning_rate": 8.19794180269695e-05, + "loss": 0.031551048159599304, + "step": 127010 + }, + { + "epoch": 18.029808374733854, + "grad_norm": 18.893657684326172, + "learning_rate": 8.197799858055358e-05, + "loss": 0.0429678350687027, + "step": 127020 + }, + { + "epoch": 18.031227821149752, + "grad_norm": 9.892513275146484, + "learning_rate": 8.197657913413769e-05, + 
"loss": 0.03231886029243469, + "step": 127030 + }, + { + "epoch": 18.03264726756565, + "grad_norm": 3.9869258403778076, + "learning_rate": 8.197515968772179e-05, + "loss": 0.04821803271770477, + "step": 127040 + }, + { + "epoch": 18.03406671398155, + "grad_norm": 0.049460362643003464, + "learning_rate": 8.19737402413059e-05, + "loss": 0.04642572402954102, + "step": 127050 + }, + { + "epoch": 18.035486160397443, + "grad_norm": 1.9470139741897583, + "learning_rate": 8.197232079489e-05, + "loss": 0.04240458309650421, + "step": 127060 + }, + { + "epoch": 18.03690560681334, + "grad_norm": 0.018669215962290764, + "learning_rate": 8.197090134847409e-05, + "loss": 0.011113067716360092, + "step": 127070 + }, + { + "epoch": 18.03832505322924, + "grad_norm": 2.685368299484253, + "learning_rate": 8.19694819020582e-05, + "loss": 0.010963048040866851, + "step": 127080 + }, + { + "epoch": 18.03974449964514, + "grad_norm": 0.46036213636398315, + "learning_rate": 8.19680624556423e-05, + "loss": 0.02715737819671631, + "step": 127090 + }, + { + "epoch": 18.041163946061037, + "grad_norm": 0.6678931713104248, + "learning_rate": 8.196664300922641e-05, + "loss": 0.025738224387168884, + "step": 127100 + }, + { + "epoch": 18.042583392476935, + "grad_norm": 0.012589601799845695, + "learning_rate": 8.196522356281051e-05, + "loss": 0.002760981023311615, + "step": 127110 + }, + { + "epoch": 18.044002838892833, + "grad_norm": 4.038692951202393, + "learning_rate": 8.19638041163946e-05, + "loss": 0.025073921680450438, + "step": 127120 + }, + { + "epoch": 18.045422285308728, + "grad_norm": 1.019339680671692, + "learning_rate": 8.19623846699787e-05, + "loss": 0.004967309162020683, + "step": 127130 + }, + { + "epoch": 18.046841731724626, + "grad_norm": 0.03080017864704132, + "learning_rate": 8.196096522356281e-05, + "loss": 0.012518167495727539, + "step": 127140 + }, + { + "epoch": 18.048261178140525, + "grad_norm": 24.186779022216797, + "learning_rate": 8.195954577714691e-05, + "loss": 
0.03561782538890838, + "step": 127150 + }, + { + "epoch": 18.049680624556423, + "grad_norm": 1.739776611328125, + "learning_rate": 8.195812633073102e-05, + "loss": 0.02224576622247696, + "step": 127160 + }, + { + "epoch": 18.05110007097232, + "grad_norm": 0.572066605091095, + "learning_rate": 8.195670688431512e-05, + "loss": 0.012349732220172882, + "step": 127170 + }, + { + "epoch": 18.05251951738822, + "grad_norm": 0.03597768768668175, + "learning_rate": 8.195528743789922e-05, + "loss": 0.04076186418533325, + "step": 127180 + }, + { + "epoch": 18.053938963804118, + "grad_norm": 25.167530059814453, + "learning_rate": 8.195386799148333e-05, + "loss": 0.0753936767578125, + "step": 127190 + }, + { + "epoch": 18.055358410220013, + "grad_norm": 1.2025281190872192, + "learning_rate": 8.195244854506743e-05, + "loss": 0.020939262211322786, + "step": 127200 + }, + { + "epoch": 18.05677785663591, + "grad_norm": 0.0326387844979763, + "learning_rate": 8.195102909865154e-05, + "loss": 0.05523158311843872, + "step": 127210 + }, + { + "epoch": 18.05819730305181, + "grad_norm": 0.41500118374824524, + "learning_rate": 8.194960965223563e-05, + "loss": 0.008329737931489944, + "step": 127220 + }, + { + "epoch": 18.059616749467708, + "grad_norm": 3.731995105743408, + "learning_rate": 8.194819020581973e-05, + "loss": 0.01609371155500412, + "step": 127230 + }, + { + "epoch": 18.061036195883606, + "grad_norm": 0.08213970810174942, + "learning_rate": 8.194677075940383e-05, + "loss": 0.05572362542152405, + "step": 127240 + }, + { + "epoch": 18.062455642299504, + "grad_norm": 0.02728845551609993, + "learning_rate": 8.194535131298794e-05, + "loss": 0.009065951406955718, + "step": 127250 + }, + { + "epoch": 18.063875088715402, + "grad_norm": 3.4261679649353027, + "learning_rate": 8.194393186657204e-05, + "loss": 0.021069920063018797, + "step": 127260 + }, + { + "epoch": 18.065294535131297, + "grad_norm": 0.04758109897375107, + "learning_rate": 8.194251242015615e-05, + "loss": 
0.028346240520477295, + "step": 127270 + }, + { + "epoch": 18.066713981547196, + "grad_norm": 0.24030058085918427, + "learning_rate": 8.194109297374025e-05, + "loss": 0.02941884696483612, + "step": 127280 + }, + { + "epoch": 18.068133427963094, + "grad_norm": 0.5719775557518005, + "learning_rate": 8.193967352732434e-05, + "loss": 0.011897590756416321, + "step": 127290 + }, + { + "epoch": 18.069552874378992, + "grad_norm": 0.09043441712856293, + "learning_rate": 8.193825408090845e-05, + "loss": 0.020763903856277466, + "step": 127300 + }, + { + "epoch": 18.07097232079489, + "grad_norm": 0.005356388632208109, + "learning_rate": 8.193683463449255e-05, + "loss": 0.014215975999832153, + "step": 127310 + }, + { + "epoch": 18.07239176721079, + "grad_norm": 0.012542439624667168, + "learning_rate": 8.193541518807666e-05, + "loss": 0.00543864406645298, + "step": 127320 + }, + { + "epoch": 18.073811213626687, + "grad_norm": 1.8537201881408691, + "learning_rate": 8.193399574166075e-05, + "loss": 0.004514655843377113, + "step": 127330 + }, + { + "epoch": 18.075230660042582, + "grad_norm": 1.0644135475158691, + "learning_rate": 8.193257629524486e-05, + "loss": 0.031022971868515013, + "step": 127340 + }, + { + "epoch": 18.07665010645848, + "grad_norm": 0.22495824098587036, + "learning_rate": 8.193115684882895e-05, + "loss": 0.025671708583831786, + "step": 127350 + }, + { + "epoch": 18.07806955287438, + "grad_norm": 4.438430309295654, + "learning_rate": 8.192973740241306e-05, + "loss": 0.010372009128332138, + "step": 127360 + }, + { + "epoch": 18.079488999290277, + "grad_norm": 8.326705932617188, + "learning_rate": 8.192831795599718e-05, + "loss": 0.03074938654899597, + "step": 127370 + }, + { + "epoch": 18.080908445706175, + "grad_norm": 5.019266605377197, + "learning_rate": 8.192689850958126e-05, + "loss": 0.006851650774478912, + "step": 127380 + }, + { + "epoch": 18.082327892122073, + "grad_norm": 0.056960105895996094, + "learning_rate": 8.192547906316537e-05, + "loss": 
0.027861076593399047, + "step": 127390 + }, + { + "epoch": 18.08374733853797, + "grad_norm": 0.039441630244255066, + "learning_rate": 8.192405961674947e-05, + "loss": 0.004483645036816597, + "step": 127400 + }, + { + "epoch": 18.085166784953866, + "grad_norm": 0.44658857583999634, + "learning_rate": 8.192264017033358e-05, + "loss": 0.01149245649576187, + "step": 127410 + }, + { + "epoch": 18.086586231369765, + "grad_norm": 1.7577241659164429, + "learning_rate": 8.192122072391768e-05, + "loss": 0.012128777801990509, + "step": 127420 + }, + { + "epoch": 18.088005677785663, + "grad_norm": 1.1049087047576904, + "learning_rate": 8.191980127750177e-05, + "loss": 0.024096983671188354, + "step": 127430 + }, + { + "epoch": 18.08942512420156, + "grad_norm": 3.239858388900757, + "learning_rate": 8.191838183108587e-05, + "loss": 0.059730923175811766, + "step": 127440 + }, + { + "epoch": 18.09084457061746, + "grad_norm": 0.11400025337934494, + "learning_rate": 8.191696238466998e-05, + "loss": 0.0018292196094989777, + "step": 127450 + }, + { + "epoch": 18.092264017033358, + "grad_norm": 0.0311578419059515, + "learning_rate": 8.191554293825409e-05, + "loss": 0.005093959718942642, + "step": 127460 + }, + { + "epoch": 18.093683463449256, + "grad_norm": 0.03095116652548313, + "learning_rate": 8.191412349183819e-05, + "loss": 0.0027309712022542953, + "step": 127470 + }, + { + "epoch": 18.09510290986515, + "grad_norm": 11.3790283203125, + "learning_rate": 8.191270404542229e-05, + "loss": 0.03980101048946381, + "step": 127480 + }, + { + "epoch": 18.09652235628105, + "grad_norm": 6.759974002838135, + "learning_rate": 8.191128459900639e-05, + "loss": 0.033737432956695554, + "step": 127490 + }, + { + "epoch": 18.097941802696948, + "grad_norm": 4.424436092376709, + "learning_rate": 8.19098651525905e-05, + "loss": 0.0036382827907800674, + "step": 127500 + }, + { + "epoch": 18.097941802696948, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.04337623715400696, + "eval_runtime": 
31.9534, + "eval_samples_per_second": 492.186, + "eval_steps_per_second": 15.397, + "step": 127500 + }, + { + "epoch": 18.099361249112846, + "grad_norm": 0.4387366771697998, + "learning_rate": 8.19084457061746e-05, + "loss": 0.049114978313446044, + "step": 127510 + }, + { + "epoch": 18.100780695528744, + "grad_norm": 0.765316367149353, + "learning_rate": 8.19070262597587e-05, + "loss": 0.013087628781795502, + "step": 127520 + }, + { + "epoch": 18.102200141944643, + "grad_norm": 0.1502961665391922, + "learning_rate": 8.19056068133428e-05, + "loss": 0.0034311629831790926, + "step": 127530 + }, + { + "epoch": 18.10361958836054, + "grad_norm": 1.7437764406204224, + "learning_rate": 8.19041873669269e-05, + "loss": 0.05554571747779846, + "step": 127540 + }, + { + "epoch": 18.105039034776436, + "grad_norm": 0.5341615080833435, + "learning_rate": 8.190276792051101e-05, + "loss": 0.03598266541957855, + "step": 127550 + }, + { + "epoch": 18.106458481192334, + "grad_norm": 0.07560352236032486, + "learning_rate": 8.190134847409511e-05, + "loss": 0.007796591520309449, + "step": 127560 + }, + { + "epoch": 18.107877927608232, + "grad_norm": 4.293848037719727, + "learning_rate": 8.189992902767922e-05, + "loss": 0.006240556016564369, + "step": 127570 + }, + { + "epoch": 18.10929737402413, + "grad_norm": 0.7019795775413513, + "learning_rate": 8.189850958126332e-05, + "loss": 0.012034883350133896, + "step": 127580 + }, + { + "epoch": 18.11071682044003, + "grad_norm": 0.013099047355353832, + "learning_rate": 8.189709013484741e-05, + "loss": 0.021384745836257935, + "step": 127590 + }, + { + "epoch": 18.112136266855927, + "grad_norm": 8.4081392288208, + "learning_rate": 8.189567068843151e-05, + "loss": 0.008062791824340821, + "step": 127600 + }, + { + "epoch": 18.113555713271825, + "grad_norm": 0.26725485920906067, + "learning_rate": 8.189425124201562e-05, + "loss": 0.006530357897281647, + "step": 127610 + }, + { + "epoch": 18.11497515968772, + "grad_norm": 0.20434775948524475, + 
"learning_rate": 8.189283179559972e-05, + "loss": 0.011984960734844207, + "step": 127620 + }, + { + "epoch": 18.11639460610362, + "grad_norm": 6.735098838806152, + "learning_rate": 8.189141234918383e-05, + "loss": 0.009172295033931733, + "step": 127630 + }, + { + "epoch": 18.117814052519517, + "grad_norm": 0.07924386858940125, + "learning_rate": 8.188999290276791e-05, + "loss": 0.0017124176025390624, + "step": 127640 + }, + { + "epoch": 18.119233498935415, + "grad_norm": 0.20373296737670898, + "learning_rate": 8.188857345635202e-05, + "loss": 0.004818763583898544, + "step": 127650 + }, + { + "epoch": 18.120652945351313, + "grad_norm": 0.3803417384624481, + "learning_rate": 8.188715400993614e-05, + "loss": 0.0035828322172164915, + "step": 127660 + }, + { + "epoch": 18.12207239176721, + "grad_norm": 14.244536399841309, + "learning_rate": 8.188573456352023e-05, + "loss": 0.02630659341812134, + "step": 127670 + }, + { + "epoch": 18.12349183818311, + "grad_norm": 9.004671096801758, + "learning_rate": 8.188431511710434e-05, + "loss": 0.010734491050243378, + "step": 127680 + }, + { + "epoch": 18.124911284599005, + "grad_norm": 8.941965103149414, + "learning_rate": 8.188289567068843e-05, + "loss": 0.0061449378728866575, + "step": 127690 + }, + { + "epoch": 18.126330731014903, + "grad_norm": 0.6067605018615723, + "learning_rate": 8.188147622427254e-05, + "loss": 0.003517572209239006, + "step": 127700 + }, + { + "epoch": 18.1277501774308, + "grad_norm": 0.03921455517411232, + "learning_rate": 8.188005677785664e-05, + "loss": 0.00920310840010643, + "step": 127710 + }, + { + "epoch": 18.1291696238467, + "grad_norm": 0.08954808861017227, + "learning_rate": 8.187863733144075e-05, + "loss": 0.020823955535888672, + "step": 127720 + }, + { + "epoch": 18.130589070262598, + "grad_norm": 0.1396220326423645, + "learning_rate": 8.187721788502484e-05, + "loss": 0.023295214772224425, + "step": 127730 + }, + { + "epoch": 18.132008516678496, + "grad_norm": 0.06990166008472443, + 
"learning_rate": 8.187579843860894e-05, + "loss": 0.00844774693250656, + "step": 127740 + }, + { + "epoch": 18.133427963094395, + "grad_norm": 0.69849693775177, + "learning_rate": 8.187437899219305e-05, + "loss": 0.030935484170913696, + "step": 127750 + }, + { + "epoch": 18.13484740951029, + "grad_norm": 0.022997183725237846, + "learning_rate": 8.187295954577715e-05, + "loss": 0.010376222431659698, + "step": 127760 + }, + { + "epoch": 18.136266855926188, + "grad_norm": 0.551956295967102, + "learning_rate": 8.187154009936126e-05, + "loss": 0.0124617338180542, + "step": 127770 + }, + { + "epoch": 18.137686302342086, + "grad_norm": 0.15997475385665894, + "learning_rate": 8.187012065294536e-05, + "loss": 0.00329936146736145, + "step": 127780 + }, + { + "epoch": 18.139105748757984, + "grad_norm": 2.299191951751709, + "learning_rate": 8.186870120652946e-05, + "loss": 0.004490911960601807, + "step": 127790 + }, + { + "epoch": 18.140525195173883, + "grad_norm": 0.010329111479222775, + "learning_rate": 8.186728176011355e-05, + "loss": 0.03573310077190399, + "step": 127800 + }, + { + "epoch": 18.14194464158978, + "grad_norm": 0.03730699419975281, + "learning_rate": 8.186586231369766e-05, + "loss": 0.008077595382928848, + "step": 127810 + }, + { + "epoch": 18.14336408800568, + "grad_norm": 0.1495765596628189, + "learning_rate": 8.186444286728176e-05, + "loss": 0.026599448919296265, + "step": 127820 + }, + { + "epoch": 18.144783534421574, + "grad_norm": 0.8765677809715271, + "learning_rate": 8.186302342086587e-05, + "loss": 0.006034733355045318, + "step": 127830 + }, + { + "epoch": 18.146202980837472, + "grad_norm": 0.0686149150133133, + "learning_rate": 8.186160397444997e-05, + "loss": 0.0200776606798172, + "step": 127840 + }, + { + "epoch": 18.14762242725337, + "grad_norm": 0.061525650322437286, + "learning_rate": 8.186018452803407e-05, + "loss": 0.06339784860610961, + "step": 127850 + }, + { + "epoch": 18.14904187366927, + "grad_norm": 0.015170774422585964, + 
"learning_rate": 8.185876508161818e-05, + "loss": 0.048648470640182497, + "step": 127860 + }, + { + "epoch": 18.150461320085167, + "grad_norm": 2.982309579849243, + "learning_rate": 8.185734563520228e-05, + "loss": 0.020402514934539796, + "step": 127870 + }, + { + "epoch": 18.151880766501066, + "grad_norm": 11.487321853637695, + "learning_rate": 8.185592618878639e-05, + "loss": 0.011919765919446944, + "step": 127880 + }, + { + "epoch": 18.153300212916964, + "grad_norm": 0.02453085035085678, + "learning_rate": 8.185450674237048e-05, + "loss": 0.0481383353471756, + "step": 127890 + }, + { + "epoch": 18.15471965933286, + "grad_norm": 4.876072883605957, + "learning_rate": 8.185308729595458e-05, + "loss": 0.006667395681142807, + "step": 127900 + }, + { + "epoch": 18.156139105748757, + "grad_norm": 1.0623502731323242, + "learning_rate": 8.185166784953868e-05, + "loss": 0.0022208701819181444, + "step": 127910 + }, + { + "epoch": 18.157558552164655, + "grad_norm": 0.1288275569677353, + "learning_rate": 8.185024840312279e-05, + "loss": 0.009044210612773895, + "step": 127920 + }, + { + "epoch": 18.158977998580554, + "grad_norm": 9.580000877380371, + "learning_rate": 8.184882895670689e-05, + "loss": 0.0466098964214325, + "step": 127930 + }, + { + "epoch": 18.160397444996452, + "grad_norm": 2.10084867477417, + "learning_rate": 8.1847409510291e-05, + "loss": 0.006572528183460236, + "step": 127940 + }, + { + "epoch": 18.16181689141235, + "grad_norm": 0.019893677905201912, + "learning_rate": 8.18459900638751e-05, + "loss": 0.03291449844837189, + "step": 127950 + }, + { + "epoch": 18.16323633782825, + "grad_norm": 6.739614486694336, + "learning_rate": 8.184457061745919e-05, + "loss": 0.02328267991542816, + "step": 127960 + }, + { + "epoch": 18.164655784244143, + "grad_norm": 9.85806941986084, + "learning_rate": 8.18431511710433e-05, + "loss": 0.029339200258255003, + "step": 127970 + }, + { + "epoch": 18.16607523066004, + "grad_norm": 1.8335438966751099, + "learning_rate": 
8.18417317246274e-05, + "loss": 0.009429128468036651, + "step": 127980 + }, + { + "epoch": 18.16749467707594, + "grad_norm": 0.15850071609020233, + "learning_rate": 8.184031227821151e-05, + "loss": 0.0174192413687706, + "step": 127990 + }, + { + "epoch": 18.168914123491838, + "grad_norm": 10.395427703857422, + "learning_rate": 8.18388928317956e-05, + "loss": 0.08283510208129882, + "step": 128000 + }, + { + "epoch": 18.168914123491838, + "eval_accuracy": 0.976791505055001, + "eval_loss": 0.09529868513345718, + "eval_runtime": 32.2522, + "eval_samples_per_second": 487.625, + "eval_steps_per_second": 15.255, + "step": 128000 + }, + { + "epoch": 18.170333569907736, + "grad_norm": 0.5031605958938599, + "learning_rate": 8.18374733853797e-05, + "loss": 0.04021282494068146, + "step": 128010 + }, + { + "epoch": 18.171753016323635, + "grad_norm": 0.05247000977396965, + "learning_rate": 8.18360539389638e-05, + "loss": 0.029411664605140685, + "step": 128020 + }, + { + "epoch": 18.173172462739533, + "grad_norm": 0.3838053047657013, + "learning_rate": 8.183463449254791e-05, + "loss": 0.004196440801024437, + "step": 128030 + }, + { + "epoch": 18.174591909155428, + "grad_norm": 1.891951084136963, + "learning_rate": 8.183321504613201e-05, + "loss": 0.009262125194072723, + "step": 128040 + }, + { + "epoch": 18.176011355571326, + "grad_norm": 13.83259105682373, + "learning_rate": 8.183179559971611e-05, + "loss": 0.043445545434951785, + "step": 128050 + }, + { + "epoch": 18.177430801987224, + "grad_norm": 4.659943580627441, + "learning_rate": 8.183037615330022e-05, + "loss": 0.032861173152923584, + "step": 128060 + }, + { + "epoch": 18.178850248403123, + "grad_norm": 0.14418882131576538, + "learning_rate": 8.182895670688432e-05, + "loss": 0.005234483629465103, + "step": 128070 + }, + { + "epoch": 18.18026969481902, + "grad_norm": 0.6910912394523621, + "learning_rate": 8.182753726046843e-05, + "loss": 0.05057108402252197, + "step": 128080 + }, + { + "epoch": 18.18168914123492, + 
"grad_norm": 10.517608642578125, + "learning_rate": 8.182611781405253e-05, + "loss": 0.010358494520187379, + "step": 128090 + }, + { + "epoch": 18.183108587650818, + "grad_norm": 0.281340628862381, + "learning_rate": 8.182469836763662e-05, + "loss": 0.03498583734035492, + "step": 128100 + }, + { + "epoch": 18.184528034066712, + "grad_norm": 0.248043954372406, + "learning_rate": 8.182327892122072e-05, + "loss": 0.05613068342208862, + "step": 128110 + }, + { + "epoch": 18.18594748048261, + "grad_norm": 2.2355258464813232, + "learning_rate": 8.182185947480483e-05, + "loss": 0.041114142537117, + "step": 128120 + }, + { + "epoch": 18.18736692689851, + "grad_norm": 4.95033073425293, + "learning_rate": 8.182044002838893e-05, + "loss": 0.016805705428123475, + "step": 128130 + }, + { + "epoch": 18.188786373314407, + "grad_norm": 0.05342892184853554, + "learning_rate": 8.181902058197304e-05, + "loss": 0.03471288681030273, + "step": 128140 + }, + { + "epoch": 18.190205819730306, + "grad_norm": 10.261602401733398, + "learning_rate": 8.181760113555714e-05, + "loss": 0.047914010286331174, + "step": 128150 + }, + { + "epoch": 18.191625266146204, + "grad_norm": 0.45671752095222473, + "learning_rate": 8.181618168914123e-05, + "loss": 0.0416656494140625, + "step": 128160 + }, + { + "epoch": 18.193044712562102, + "grad_norm": 0.16699133813381195, + "learning_rate": 8.181476224272535e-05, + "loss": 0.037295478582382205, + "step": 128170 + }, + { + "epoch": 18.194464158977997, + "grad_norm": 0.020635658875107765, + "learning_rate": 8.181334279630944e-05, + "loss": 0.0075481578707695006, + "step": 128180 + }, + { + "epoch": 18.195883605393895, + "grad_norm": 1.0739357471466064, + "learning_rate": 8.181192334989355e-05, + "loss": 0.014541852474212646, + "step": 128190 + }, + { + "epoch": 18.197303051809794, + "grad_norm": 0.07027911394834518, + "learning_rate": 8.181050390347764e-05, + "loss": 0.0018665395677089692, + "step": 128200 + }, + { + "epoch": 18.198722498225692, + "grad_norm": 
0.017957065254449844, + "learning_rate": 8.180908445706175e-05, + "loss": 0.008435264229774475, + "step": 128210 + }, + { + "epoch": 18.20014194464159, + "grad_norm": 0.46577832102775574, + "learning_rate": 8.180766501064585e-05, + "loss": 0.06814204454421997, + "step": 128220 + }, + { + "epoch": 18.20156139105749, + "grad_norm": 14.87221908569336, + "learning_rate": 8.180624556422996e-05, + "loss": 0.03074239194393158, + "step": 128230 + }, + { + "epoch": 18.202980837473387, + "grad_norm": 9.295565605163574, + "learning_rate": 8.180482611781405e-05, + "loss": 0.04867202043533325, + "step": 128240 + }, + { + "epoch": 18.20440028388928, + "grad_norm": 4.98245096206665, + "learning_rate": 8.180340667139817e-05, + "loss": 0.025966173410415648, + "step": 128250 + }, + { + "epoch": 18.20581973030518, + "grad_norm": 12.073112487792969, + "learning_rate": 8.180198722498226e-05, + "loss": 0.028777456283569335, + "step": 128260 + }, + { + "epoch": 18.207239176721078, + "grad_norm": 0.19803652167320251, + "learning_rate": 8.180056777856636e-05, + "loss": 0.0357023686170578, + "step": 128270 + }, + { + "epoch": 18.208658623136976, + "grad_norm": 0.04275793582201004, + "learning_rate": 8.179914833215047e-05, + "loss": 0.010345979034900666, + "step": 128280 + }, + { + "epoch": 18.210078069552875, + "grad_norm": 3.300391435623169, + "learning_rate": 8.179772888573457e-05, + "loss": 0.00928306058049202, + "step": 128290 + }, + { + "epoch": 18.211497515968773, + "grad_norm": 0.040431976318359375, + "learning_rate": 8.179630943931868e-05, + "loss": 0.001940181478857994, + "step": 128300 + }, + { + "epoch": 18.21291696238467, + "grad_norm": 2.5737407207489014, + "learning_rate": 8.179488999290276e-05, + "loss": 0.005912529304623604, + "step": 128310 + }, + { + "epoch": 18.214336408800566, + "grad_norm": 0.1068541407585144, + "learning_rate": 8.179347054648687e-05, + "loss": 0.013379514217376709, + "step": 128320 + }, + { + "epoch": 18.215755855216464, + "grad_norm": 
1.6886368989944458, + "learning_rate": 8.179205110007097e-05, + "loss": 0.006964053213596344, + "step": 128330 + }, + { + "epoch": 18.217175301632363, + "grad_norm": 0.038944900035858154, + "learning_rate": 8.179063165365508e-05, + "loss": 0.022647054493427278, + "step": 128340 + }, + { + "epoch": 18.21859474804826, + "grad_norm": 0.4597911834716797, + "learning_rate": 8.178921220723918e-05, + "loss": 0.02247808873653412, + "step": 128350 + }, + { + "epoch": 18.22001419446416, + "grad_norm": 9.54317569732666, + "learning_rate": 8.178779276082328e-05, + "loss": 0.011771997064352035, + "step": 128360 + }, + { + "epoch": 18.221433640880058, + "grad_norm": 0.015923280268907547, + "learning_rate": 8.178637331440739e-05, + "loss": 0.005302245542407036, + "step": 128370 + }, + { + "epoch": 18.222853087295956, + "grad_norm": 4.213832855224609, + "learning_rate": 8.178495386799149e-05, + "loss": 0.007035575807094574, + "step": 128380 + }, + { + "epoch": 18.22427253371185, + "grad_norm": 0.07770871371030807, + "learning_rate": 8.17835344215756e-05, + "loss": 0.031133627891540526, + "step": 128390 + }, + { + "epoch": 18.22569198012775, + "grad_norm": 8.806855201721191, + "learning_rate": 8.17821149751597e-05, + "loss": 0.01694534718990326, + "step": 128400 + }, + { + "epoch": 18.227111426543647, + "grad_norm": 6.428601264953613, + "learning_rate": 8.178069552874379e-05, + "loss": 0.016386696696281434, + "step": 128410 + }, + { + "epoch": 18.228530872959546, + "grad_norm": 0.42039939761161804, + "learning_rate": 8.177927608232789e-05, + "loss": 0.018320520222187043, + "step": 128420 + }, + { + "epoch": 18.229950319375444, + "grad_norm": 0.013496828265488148, + "learning_rate": 8.1777856635912e-05, + "loss": 0.005241439118981362, + "step": 128430 + }, + { + "epoch": 18.231369765791342, + "grad_norm": 0.2199200540781021, + "learning_rate": 8.17764371894961e-05, + "loss": 0.006305563449859619, + "step": 128440 + }, + { + "epoch": 18.23278921220724, + "grad_norm": 
0.1683681160211563, + "learning_rate": 8.177501774308021e-05, + "loss": 0.050753450393676756, + "step": 128450 + }, + { + "epoch": 18.234208658623135, + "grad_norm": 0.1312318742275238, + "learning_rate": 8.17735982966643e-05, + "loss": 0.030958676338195802, + "step": 128460 + }, + { + "epoch": 18.235628105039034, + "grad_norm": 0.24649330973625183, + "learning_rate": 8.17721788502484e-05, + "loss": 0.02686150074005127, + "step": 128470 + }, + { + "epoch": 18.237047551454932, + "grad_norm": 0.52034592628479, + "learning_rate": 8.177075940383251e-05, + "loss": 0.020387536287307738, + "step": 128480 + }, + { + "epoch": 18.23846699787083, + "grad_norm": 4.3982648849487305, + "learning_rate": 8.176933995741661e-05, + "loss": 0.017361976206302643, + "step": 128490 + }, + { + "epoch": 18.23988644428673, + "grad_norm": 0.3818369507789612, + "learning_rate": 8.176792051100072e-05, + "loss": 0.03366567492485047, + "step": 128500 + }, + { + "epoch": 18.23988644428673, + "eval_accuracy": 0.9851847141857951, + "eval_loss": 0.053453847765922546, + "eval_runtime": 32.9058, + "eval_samples_per_second": 477.94, + "eval_steps_per_second": 14.952, + "step": 128500 + }, + { + "epoch": 18.241305890702627, + "grad_norm": 12.275089263916016, + "learning_rate": 8.17665010645848e-05, + "loss": 0.07093088626861573, + "step": 128510 + }, + { + "epoch": 18.242725337118525, + "grad_norm": 0.02158868871629238, + "learning_rate": 8.176508161816892e-05, + "loss": 0.0045996904373168945, + "step": 128520 + }, + { + "epoch": 18.24414478353442, + "grad_norm": 0.19157670438289642, + "learning_rate": 8.176366217175301e-05, + "loss": 0.014573585987091065, + "step": 128530 + }, + { + "epoch": 18.24556422995032, + "grad_norm": 4.176166534423828, + "learning_rate": 8.176224272533712e-05, + "loss": 0.022593997418880463, + "step": 128540 + }, + { + "epoch": 18.246983676366217, + "grad_norm": 0.0027891797944903374, + "learning_rate": 8.176082327892122e-05, + "loss": 0.0024897228926420213, + "step": 128550 + 
}, + { + "epoch": 18.248403122782115, + "grad_norm": 0.1233685314655304, + "learning_rate": 8.175940383250532e-05, + "loss": 0.025276607275009154, + "step": 128560 + }, + { + "epoch": 18.249822569198013, + "grad_norm": 2.294764995574951, + "learning_rate": 8.175798438608943e-05, + "loss": 0.018644881248474122, + "step": 128570 + }, + { + "epoch": 18.25124201561391, + "grad_norm": 14.123501777648926, + "learning_rate": 8.175656493967353e-05, + "loss": 0.026422369480133056, + "step": 128580 + }, + { + "epoch": 18.25266146202981, + "grad_norm": 0.04175082966685295, + "learning_rate": 8.175514549325764e-05, + "loss": 0.021905991435050964, + "step": 128590 + }, + { + "epoch": 18.254080908445705, + "grad_norm": 0.9239396452903748, + "learning_rate": 8.175372604684174e-05, + "loss": 0.013399584591388703, + "step": 128600 + }, + { + "epoch": 18.255500354861603, + "grad_norm": 2.069931745529175, + "learning_rate": 8.175230660042585e-05, + "loss": 0.05277642607688904, + "step": 128610 + }, + { + "epoch": 18.2569198012775, + "grad_norm": 1.028808355331421, + "learning_rate": 8.175088715400993e-05, + "loss": 0.015741121768951417, + "step": 128620 + }, + { + "epoch": 18.2583392476934, + "grad_norm": 1.025839924812317, + "learning_rate": 8.174946770759404e-05, + "loss": 0.009937211871147156, + "step": 128630 + }, + { + "epoch": 18.259758694109298, + "grad_norm": 0.1733449101448059, + "learning_rate": 8.174804826117814e-05, + "loss": 0.0087957963347435, + "step": 128640 + }, + { + "epoch": 18.261178140525196, + "grad_norm": 5.729976654052734, + "learning_rate": 8.174662881476225e-05, + "loss": 0.052794671058654784, + "step": 128650 + }, + { + "epoch": 18.262597586941094, + "grad_norm": 1.5335612297058105, + "learning_rate": 8.174520936834635e-05, + "loss": 0.04668539762496948, + "step": 128660 + }, + { + "epoch": 18.26401703335699, + "grad_norm": 0.021340427920222282, + "learning_rate": 8.174378992193044e-05, + "loss": 0.029316258430480958, + "step": 128670 + }, + { + "epoch": 
18.265436479772887, + "grad_norm": 1.2407418489456177, + "learning_rate": 8.174237047551456e-05, + "loss": 0.0165558785200119, + "step": 128680 + }, + { + "epoch": 18.266855926188786, + "grad_norm": 0.15023832023143768, + "learning_rate": 8.174095102909865e-05, + "loss": 0.004716331884264946, + "step": 128690 + }, + { + "epoch": 18.268275372604684, + "grad_norm": 3.0022218227386475, + "learning_rate": 8.173953158268276e-05, + "loss": 0.031516680121421815, + "step": 128700 + }, + { + "epoch": 18.269694819020582, + "grad_norm": 0.06068778783082962, + "learning_rate": 8.173811213626686e-05, + "loss": 0.011225759983062744, + "step": 128710 + }, + { + "epoch": 18.27111426543648, + "grad_norm": 14.479620933532715, + "learning_rate": 8.173669268985096e-05, + "loss": 0.0523698627948761, + "step": 128720 + }, + { + "epoch": 18.27253371185238, + "grad_norm": 0.26089462637901306, + "learning_rate": 8.173527324343506e-05, + "loss": 0.03181962370872497, + "step": 128730 + }, + { + "epoch": 18.273953158268274, + "grad_norm": 0.012346764095127583, + "learning_rate": 8.173385379701917e-05, + "loss": 0.012769991159439087, + "step": 128740 + }, + { + "epoch": 18.275372604684172, + "grad_norm": 0.034206002950668335, + "learning_rate": 8.173243435060326e-05, + "loss": 0.004806910455226898, + "step": 128750 + }, + { + "epoch": 18.27679205110007, + "grad_norm": 0.015452612191438675, + "learning_rate": 8.173101490418738e-05, + "loss": 0.0019456423819065095, + "step": 128760 + }, + { + "epoch": 18.27821149751597, + "grad_norm": 0.08761896938085556, + "learning_rate": 8.172959545777147e-05, + "loss": 0.005956121534109115, + "step": 128770 + }, + { + "epoch": 18.279630943931867, + "grad_norm": 0.0656166523694992, + "learning_rate": 8.172817601135557e-05, + "loss": 0.013135121762752533, + "step": 128780 + }, + { + "epoch": 18.281050390347765, + "grad_norm": 0.09261200577020645, + "learning_rate": 8.172675656493968e-05, + "loss": 0.01091306060552597, + "step": 128790 + }, + { + "epoch": 
18.282469836763664, + "grad_norm": 3.3840973377227783, + "learning_rate": 8.172533711852378e-05, + "loss": 0.03014102280139923, + "step": 128800 + }, + { + "epoch": 18.28388928317956, + "grad_norm": 3.299243450164795, + "learning_rate": 8.172391767210789e-05, + "loss": 0.053286343812942505, + "step": 128810 + }, + { + "epoch": 18.285308729595457, + "grad_norm": 5.125652313232422, + "learning_rate": 8.172249822569197e-05, + "loss": 0.036524662375450136, + "step": 128820 + }, + { + "epoch": 18.286728176011355, + "grad_norm": 0.01079608965665102, + "learning_rate": 8.172107877927608e-05, + "loss": 0.012189614027738572, + "step": 128830 + }, + { + "epoch": 18.288147622427253, + "grad_norm": 5.404969215393066, + "learning_rate": 8.171965933286018e-05, + "loss": 0.06643599271774292, + "step": 128840 + }, + { + "epoch": 18.28956706884315, + "grad_norm": 0.21422381699085236, + "learning_rate": 8.171823988644429e-05, + "loss": 0.017375747859477996, + "step": 128850 + }, + { + "epoch": 18.29098651525905, + "grad_norm": 0.20465363562107086, + "learning_rate": 8.17168204400284e-05, + "loss": 0.031096231937408448, + "step": 128860 + }, + { + "epoch": 18.292405961674948, + "grad_norm": 1.680165410041809, + "learning_rate": 8.171540099361249e-05, + "loss": 0.024290363490581512, + "step": 128870 + }, + { + "epoch": 18.293825408090843, + "grad_norm": 0.2074621170759201, + "learning_rate": 8.17139815471966e-05, + "loss": 0.018671299517154693, + "step": 128880 + }, + { + "epoch": 18.29524485450674, + "grad_norm": 0.21123144030570984, + "learning_rate": 8.17125621007807e-05, + "loss": 0.060861396789550784, + "step": 128890 + }, + { + "epoch": 18.29666430092264, + "grad_norm": 11.028310775756836, + "learning_rate": 8.17111426543648e-05, + "loss": 0.07039762139320374, + "step": 128900 + }, + { + "epoch": 18.298083747338538, + "grad_norm": 0.025449946522712708, + "learning_rate": 8.17097232079489e-05, + "loss": 0.010196681320667266, + "step": 128910 + }, + { + "epoch": 
18.299503193754436, + "grad_norm": 0.018262367695569992, + "learning_rate": 8.170830376153301e-05, + "loss": 0.018941739201545717, + "step": 128920 + }, + { + "epoch": 18.300922640170334, + "grad_norm": 3.6312954425811768, + "learning_rate": 8.17068843151171e-05, + "loss": 0.04633817672729492, + "step": 128930 + }, + { + "epoch": 18.302342086586233, + "grad_norm": 3.760207176208496, + "learning_rate": 8.170546486870121e-05, + "loss": 0.028416919708251952, + "step": 128940 + }, + { + "epoch": 18.303761533002127, + "grad_norm": 2.070991277694702, + "learning_rate": 8.170404542228532e-05, + "loss": 0.004716591536998748, + "step": 128950 + }, + { + "epoch": 18.305180979418026, + "grad_norm": 0.1530076414346695, + "learning_rate": 8.170262597586942e-05, + "loss": 0.012267137318849564, + "step": 128960 + }, + { + "epoch": 18.306600425833924, + "grad_norm": 0.007661824580281973, + "learning_rate": 8.170120652945353e-05, + "loss": 0.005905486643314362, + "step": 128970 + }, + { + "epoch": 18.308019872249822, + "grad_norm": 4.314486026763916, + "learning_rate": 8.169978708303761e-05, + "loss": 0.031454536318778994, + "step": 128980 + }, + { + "epoch": 18.30943931866572, + "grad_norm": 0.3191210627555847, + "learning_rate": 8.169836763662172e-05, + "loss": 0.01454719752073288, + "step": 128990 + }, + { + "epoch": 18.31085876508162, + "grad_norm": 0.23693464696407318, + "learning_rate": 8.169694819020582e-05, + "loss": 0.02261695861816406, + "step": 129000 + }, + { + "epoch": 18.31085876508162, + "eval_accuracy": 0.9855662236917403, + "eval_loss": 0.05503528565168381, + "eval_runtime": 32.4819, + "eval_samples_per_second": 484.177, + "eval_steps_per_second": 15.147, + "step": 129000 + }, + { + "epoch": 18.312278211497517, + "grad_norm": 0.4022933542728424, + "learning_rate": 8.169552874378993e-05, + "loss": 0.05248420238494873, + "step": 129010 + }, + { + "epoch": 18.313697657913412, + "grad_norm": 13.0270414352417, + "learning_rate": 8.169410929737403e-05, + "loss": 
0.04522181451320648, + "step": 129020 + }, + { + "epoch": 18.31511710432931, + "grad_norm": 0.7602053284645081, + "learning_rate": 8.169268985095813e-05, + "loss": 0.023917488753795624, + "step": 129030 + }, + { + "epoch": 18.31653655074521, + "grad_norm": 4.949351787567139, + "learning_rate": 8.169127040454224e-05, + "loss": 0.021494348347187043, + "step": 129040 + }, + { + "epoch": 18.317955997161107, + "grad_norm": 0.1981838345527649, + "learning_rate": 8.168985095812633e-05, + "loss": 0.01364341527223587, + "step": 129050 + }, + { + "epoch": 18.319375443577005, + "grad_norm": 0.21024064719676971, + "learning_rate": 8.168843151171045e-05, + "loss": 0.00963202938437462, + "step": 129060 + }, + { + "epoch": 18.320794889992904, + "grad_norm": 0.08986670523881912, + "learning_rate": 8.168701206529454e-05, + "loss": 0.004476574808359146, + "step": 129070 + }, + { + "epoch": 18.322214336408802, + "grad_norm": null, + "learning_rate": 8.168559261887864e-05, + "loss": 0.029144853353500366, + "step": 129080 + }, + { + "epoch": 18.323633782824697, + "grad_norm": 1.82876718044281, + "learning_rate": 8.168431511710434e-05, + "loss": 0.026475942134857176, + "step": 129090 + }, + { + "epoch": 18.325053229240595, + "grad_norm": 10.667966842651367, + "learning_rate": 8.168289567068844e-05, + "loss": 0.021684975922107698, + "step": 129100 + }, + { + "epoch": 18.326472675656493, + "grad_norm": 0.19591830670833588, + "learning_rate": 8.168147622427253e-05, + "loss": 0.005786832049489021, + "step": 129110 + }, + { + "epoch": 18.32789212207239, + "grad_norm": 0.5080521106719971, + "learning_rate": 8.168005677785664e-05, + "loss": 0.02582077980041504, + "step": 129120 + }, + { + "epoch": 18.32931156848829, + "grad_norm": 3.638640880584717, + "learning_rate": 8.167863733144074e-05, + "loss": 0.050702786445617674, + "step": 129130 + }, + { + "epoch": 18.330731014904188, + "grad_norm": 0.13650347292423248, + "learning_rate": 8.167721788502485e-05, + "loss": 0.0599769115447998, + 
"step": 129140 + }, + { + "epoch": 18.332150461320087, + "grad_norm": 0.37721243500709534, + "learning_rate": 8.167579843860894e-05, + "loss": 0.04912948906421662, + "step": 129150 + }, + { + "epoch": 18.33356990773598, + "grad_norm": 6.161162853240967, + "learning_rate": 8.167437899219305e-05, + "loss": 0.03150567710399628, + "step": 129160 + }, + { + "epoch": 18.33498935415188, + "grad_norm": 0.027903534471988678, + "learning_rate": 8.167295954577714e-05, + "loss": 0.011949583142995834, + "step": 129170 + }, + { + "epoch": 18.336408800567778, + "grad_norm": 0.12960362434387207, + "learning_rate": 8.167154009936126e-05, + "loss": 0.0031189464032649996, + "step": 129180 + }, + { + "epoch": 18.337828246983676, + "grad_norm": 1.6316189765930176, + "learning_rate": 8.167012065294535e-05, + "loss": 0.018705233931541443, + "step": 129190 + }, + { + "epoch": 18.339247693399575, + "grad_norm": 0.2289106547832489, + "learning_rate": 8.166870120652945e-05, + "loss": 0.00795489102602005, + "step": 129200 + }, + { + "epoch": 18.340667139815473, + "grad_norm": 10.622598648071289, + "learning_rate": 8.166728176011356e-05, + "loss": 0.015995678305625916, + "step": 129210 + }, + { + "epoch": 18.34208658623137, + "grad_norm": 0.01911492832005024, + "learning_rate": 8.166586231369766e-05, + "loss": 0.006051765382289886, + "step": 129220 + }, + { + "epoch": 18.343506032647266, + "grad_norm": 0.06445548683404922, + "learning_rate": 8.166444286728177e-05, + "loss": 0.07028995752334595, + "step": 129230 + }, + { + "epoch": 18.344925479063164, + "grad_norm": 0.06971293687820435, + "learning_rate": 8.166302342086587e-05, + "loss": 0.01684674471616745, + "step": 129240 + }, + { + "epoch": 18.346344925479062, + "grad_norm": 0.19688984751701355, + "learning_rate": 8.166160397444998e-05, + "loss": 0.0024049151688814163, + "step": 129250 + }, + { + "epoch": 18.34776437189496, + "grad_norm": 0.0021098207216709852, + "learning_rate": 8.166018452803406e-05, + "loss": 0.024603772163391113, + 
"step": 129260 + }, + { + "epoch": 18.34918381831086, + "grad_norm": 0.620466411113739, + "learning_rate": 8.165876508161817e-05, + "loss": 0.01730831414461136, + "step": 129270 + }, + { + "epoch": 18.350603264726757, + "grad_norm": 0.025463026016950607, + "learning_rate": 8.165734563520227e-05, + "loss": 0.01627231538295746, + "step": 129280 + }, + { + "epoch": 18.352022711142656, + "grad_norm": 0.13226747512817383, + "learning_rate": 8.165592618878638e-05, + "loss": 0.006072504445910454, + "step": 129290 + }, + { + "epoch": 18.35344215755855, + "grad_norm": 9.42383098602295, + "learning_rate": 8.165450674237048e-05, + "loss": 0.020982658863067626, + "step": 129300 + }, + { + "epoch": 18.35486160397445, + "grad_norm": 0.4462938904762268, + "learning_rate": 8.165308729595458e-05, + "loss": 0.03400824964046478, + "step": 129310 + }, + { + "epoch": 18.356281050390347, + "grad_norm": 2.0822784900665283, + "learning_rate": 8.165166784953869e-05, + "loss": 0.005411965027451515, + "step": 129320 + }, + { + "epoch": 18.357700496806245, + "grad_norm": 0.01747988536953926, + "learning_rate": 8.165024840312278e-05, + "loss": 0.012718930840492249, + "step": 129330 + }, + { + "epoch": 18.359119943222144, + "grad_norm": 0.06952252238988876, + "learning_rate": 8.16488289567069e-05, + "loss": 0.01600598692893982, + "step": 129340 + }, + { + "epoch": 18.360539389638042, + "grad_norm": 3.6428823471069336, + "learning_rate": 8.164740951029099e-05, + "loss": 0.005556048080325127, + "step": 129350 + }, + { + "epoch": 18.36195883605394, + "grad_norm": 2.1990156173706055, + "learning_rate": 8.164599006387509e-05, + "loss": 0.034307444095611574, + "step": 129360 + }, + { + "epoch": 18.363378282469835, + "grad_norm": 0.49816688895225525, + "learning_rate": 8.164457061745919e-05, + "loss": 0.047325742244720456, + "step": 129370 + }, + { + "epoch": 18.364797728885733, + "grad_norm": 6.322769641876221, + "learning_rate": 8.16431511710433e-05, + "loss": 0.015552473068237305, + "step": 129380 
+ }, + { + "epoch": 18.36621717530163, + "grad_norm": 0.07665533572435379, + "learning_rate": 8.16417317246274e-05, + "loss": 0.03991127610206604, + "step": 129390 + }, + { + "epoch": 18.36763662171753, + "grad_norm": 0.08783033490180969, + "learning_rate": 8.16403122782115e-05, + "loss": 0.008310206234455109, + "step": 129400 + }, + { + "epoch": 18.36905606813343, + "grad_norm": 0.8235506415367126, + "learning_rate": 8.16388928317956e-05, + "loss": 0.02230387181043625, + "step": 129410 + }, + { + "epoch": 18.370475514549327, + "grad_norm": 0.016514234244823456, + "learning_rate": 8.16374733853797e-05, + "loss": 0.004676712676882744, + "step": 129420 + }, + { + "epoch": 18.371894960965225, + "grad_norm": 6.880260944366455, + "learning_rate": 8.163605393896381e-05, + "loss": 0.04326528310775757, + "step": 129430 + }, + { + "epoch": 18.37331440738112, + "grad_norm": 15.991277694702148, + "learning_rate": 8.163463449254791e-05, + "loss": 0.03879193067550659, + "step": 129440 + }, + { + "epoch": 18.374733853797018, + "grad_norm": 0.3291440010070801, + "learning_rate": 8.163321504613202e-05, + "loss": 0.006866324692964554, + "step": 129450 + }, + { + "epoch": 18.376153300212916, + "grad_norm": 0.1651817411184311, + "learning_rate": 8.16317955997161e-05, + "loss": 0.047083538770675656, + "step": 129460 + }, + { + "epoch": 18.377572746628815, + "grad_norm": 1.0862044095993042, + "learning_rate": 8.163037615330022e-05, + "loss": 0.012643066048622132, + "step": 129470 + }, + { + "epoch": 18.378992193044713, + "grad_norm": 0.4422168433666229, + "learning_rate": 8.162895670688431e-05, + "loss": 0.00818169042468071, + "step": 129480 + }, + { + "epoch": 18.38041163946061, + "grad_norm": 5.287008762359619, + "learning_rate": 8.162753726046842e-05, + "loss": 0.012424495071172714, + "step": 129490 + }, + { + "epoch": 18.38183108587651, + "grad_norm": 0.3110944926738739, + "learning_rate": 8.162611781405252e-05, + "loss": 0.003440593555569649, + "step": 129500 + }, + { + "epoch": 
18.38183108587651, + "eval_accuracy": 0.9833407515737267, + "eval_loss": 0.061810024082660675, + "eval_runtime": 31.9536, + "eval_samples_per_second": 492.183, + "eval_steps_per_second": 15.397, + "step": 129500 + }, + { + "epoch": 18.383250532292404, + "grad_norm": 5.51362943649292, + "learning_rate": 8.162469836763662e-05, + "loss": 0.045882344245910645, + "step": 129510 + }, + { + "epoch": 18.384669978708303, + "grad_norm": 1.5791926383972168, + "learning_rate": 8.162327892122073e-05, + "loss": 0.06756555438041686, + "step": 129520 + }, + { + "epoch": 18.3860894251242, + "grad_norm": 3.9684646129608154, + "learning_rate": 8.162185947480483e-05, + "loss": 0.02608048915863037, + "step": 129530 + }, + { + "epoch": 18.3875088715401, + "grad_norm": 0.2998262643814087, + "learning_rate": 8.162044002838894e-05, + "loss": 0.02499275803565979, + "step": 129540 + }, + { + "epoch": 18.388928317955997, + "grad_norm": 4.6388020515441895, + "learning_rate": 8.161902058197303e-05, + "loss": 0.010197123885154724, + "step": 129550 + }, + { + "epoch": 18.390347764371896, + "grad_norm": 0.12832055985927582, + "learning_rate": 8.161760113555713e-05, + "loss": 0.014759251475334167, + "step": 129560 + }, + { + "epoch": 18.391767210787794, + "grad_norm": 1.8525927066802979, + "learning_rate": 8.161618168914123e-05, + "loss": 0.012668058276176453, + "step": 129570 + }, + { + "epoch": 18.39318665720369, + "grad_norm": 0.3240136504173279, + "learning_rate": 8.161476224272534e-05, + "loss": 0.024926677346229553, + "step": 129580 + }, + { + "epoch": 18.394606103619587, + "grad_norm": 5.895405292510986, + "learning_rate": 8.161334279630944e-05, + "loss": 0.05909621119499207, + "step": 129590 + }, + { + "epoch": 18.396025550035485, + "grad_norm": 6.2054948806762695, + "learning_rate": 8.161192334989355e-05, + "loss": 0.026874464750289918, + "step": 129600 + }, + { + "epoch": 18.397444996451384, + "grad_norm": 7.646547794342041, + "learning_rate": 8.161050390347766e-05, + "loss": 
0.009939579665660859, + "step": 129610 + }, + { + "epoch": 18.398864442867282, + "grad_norm": 0.11051219701766968, + "learning_rate": 8.160908445706174e-05, + "loss": 0.006405261158943176, + "step": 129620 + }, + { + "epoch": 18.40028388928318, + "grad_norm": 7.410157203674316, + "learning_rate": 8.160766501064585e-05, + "loss": 0.04566127061843872, + "step": 129630 + }, + { + "epoch": 18.40170333569908, + "grad_norm": 0.6017818450927734, + "learning_rate": 8.160624556422995e-05, + "loss": 0.041333383321762084, + "step": 129640 + }, + { + "epoch": 18.403122782114973, + "grad_norm": 7.644882678985596, + "learning_rate": 8.160482611781406e-05, + "loss": 0.008430808037519454, + "step": 129650 + }, + { + "epoch": 18.40454222853087, + "grad_norm": 0.5640460848808289, + "learning_rate": 8.160340667139816e-05, + "loss": 0.008810888230800628, + "step": 129660 + }, + { + "epoch": 18.40596167494677, + "grad_norm": 0.032049115747213364, + "learning_rate": 8.160198722498226e-05, + "loss": 0.03185172080993652, + "step": 129670 + }, + { + "epoch": 18.40738112136267, + "grad_norm": 4.317996978759766, + "learning_rate": 8.160056777856635e-05, + "loss": 0.03531225621700287, + "step": 129680 + }, + { + "epoch": 18.408800567778567, + "grad_norm": 0.04317004978656769, + "learning_rate": 8.159914833215047e-05, + "loss": 0.044870421290397644, + "step": 129690 + }, + { + "epoch": 18.410220014194465, + "grad_norm": 2.8784120082855225, + "learning_rate": 8.159772888573458e-05, + "loss": 0.028561246395111085, + "step": 129700 + }, + { + "epoch": 18.411639460610363, + "grad_norm": 0.2848926782608032, + "learning_rate": 8.159630943931867e-05, + "loss": 0.01058817058801651, + "step": 129710 + }, + { + "epoch": 18.413058907026258, + "grad_norm": 0.2673834562301636, + "learning_rate": 8.159488999290277e-05, + "loss": 0.0068480148911476135, + "step": 129720 + }, + { + "epoch": 18.414478353442156, + "grad_norm": 0.07594344019889832, + "learning_rate": 8.159347054648687e-05, + "loss": 
0.005010564252734185, + "step": 129730 + }, + { + "epoch": 18.415897799858055, + "grad_norm": 0.028032490983605385, + "learning_rate": 8.159205110007098e-05, + "loss": 0.0075709976255893706, + "step": 129740 + }, + { + "epoch": 18.417317246273953, + "grad_norm": 0.05056433379650116, + "learning_rate": 8.159063165365508e-05, + "loss": 0.021717457473278044, + "step": 129750 + }, + { + "epoch": 18.41873669268985, + "grad_norm": 0.35659167170524597, + "learning_rate": 8.158921220723919e-05, + "loss": 0.0025276631116867066, + "step": 129760 + }, + { + "epoch": 18.42015613910575, + "grad_norm": 7.861473083496094, + "learning_rate": 8.158779276082327e-05, + "loss": 0.011818940937519073, + "step": 129770 + }, + { + "epoch": 18.421575585521648, + "grad_norm": 0.0440661758184433, + "learning_rate": 8.158637331440738e-05, + "loss": 0.025470623373985292, + "step": 129780 + }, + { + "epoch": 18.422995031937543, + "grad_norm": 0.26259395480155945, + "learning_rate": 8.158495386799148e-05, + "loss": 0.03641534149646759, + "step": 129790 + }, + { + "epoch": 18.42441447835344, + "grad_norm": 0.06135106831789017, + "learning_rate": 8.158353442157559e-05, + "loss": 0.014743022620677948, + "step": 129800 + }, + { + "epoch": 18.42583392476934, + "grad_norm": 0.306878000497818, + "learning_rate": 8.15821149751597e-05, + "loss": 0.04522762894630432, + "step": 129810 + }, + { + "epoch": 18.427253371185238, + "grad_norm": 0.21729759871959686, + "learning_rate": 8.158069552874379e-05, + "loss": 0.057735615968704225, + "step": 129820 + }, + { + "epoch": 18.428672817601136, + "grad_norm": 0.7416278719902039, + "learning_rate": 8.15792760823279e-05, + "loss": 0.016473343968391417, + "step": 129830 + }, + { + "epoch": 18.430092264017034, + "grad_norm": 1.3812849521636963, + "learning_rate": 8.1577856635912e-05, + "loss": 0.007637700438499451, + "step": 129840 + }, + { + "epoch": 18.431511710432932, + "grad_norm": 8.001340866088867, + "learning_rate": 8.15764371894961e-05, + "loss": 
0.03369604349136353, + "step": 129850 + }, + { + "epoch": 18.432931156848827, + "grad_norm": 0.0657612532377243, + "learning_rate": 8.15750177430802e-05, + "loss": 0.0019177131354808808, + "step": 129860 + }, + { + "epoch": 18.434350603264726, + "grad_norm": 0.5097655653953552, + "learning_rate": 8.15735982966643e-05, + "loss": 0.023853468894958495, + "step": 129870 + }, + { + "epoch": 18.435770049680624, + "grad_norm": 0.03282688185572624, + "learning_rate": 8.15721788502484e-05, + "loss": 0.0029218826442956925, + "step": 129880 + }, + { + "epoch": 18.437189496096522, + "grad_norm": 1.0238633155822754, + "learning_rate": 8.157075940383251e-05, + "loss": 0.0808078944683075, + "step": 129890 + }, + { + "epoch": 18.43860894251242, + "grad_norm": 0.7021051645278931, + "learning_rate": 8.156933995741662e-05, + "loss": 0.011131210625171662, + "step": 129900 + }, + { + "epoch": 18.44002838892832, + "grad_norm": 0.019014930352568626, + "learning_rate": 8.156792051100072e-05, + "loss": 0.02635202705860138, + "step": 129910 + }, + { + "epoch": 18.441447835344217, + "grad_norm": 0.11540888994932175, + "learning_rate": 8.156650106458481e-05, + "loss": 0.0037258245050907136, + "step": 129920 + }, + { + "epoch": 18.442867281760112, + "grad_norm": 11.679533958435059, + "learning_rate": 8.156508161816891e-05, + "loss": 0.052882683277130124, + "step": 129930 + }, + { + "epoch": 18.44428672817601, + "grad_norm": 0.2892465889453888, + "learning_rate": 8.156366217175302e-05, + "loss": 0.029611861705780028, + "step": 129940 + }, + { + "epoch": 18.44570617459191, + "grad_norm": 0.1754387468099594, + "learning_rate": 8.156224272533712e-05, + "loss": 0.008679266273975372, + "step": 129950 + }, + { + "epoch": 18.447125621007807, + "grad_norm": 0.4253217875957489, + "learning_rate": 8.156082327892123e-05, + "loss": 0.025545185804367064, + "step": 129960 + }, + { + "epoch": 18.448545067423705, + "grad_norm": 1.0162055492401123, + "learning_rate": 8.155940383250533e-05, + "loss": 
0.0023859657347202303, + "step": 129970 + }, + { + "epoch": 18.449964513839603, + "grad_norm": 0.161495178937912, + "learning_rate": 8.155798438608943e-05, + "loss": 0.009250961244106293, + "step": 129980 + }, + { + "epoch": 18.4513839602555, + "grad_norm": 0.13088825345039368, + "learning_rate": 8.155656493967354e-05, + "loss": 0.010349231958389282, + "step": 129990 + }, + { + "epoch": 18.4528034066714, + "grad_norm": 11.125255584716797, + "learning_rate": 8.155514549325763e-05, + "loss": 0.012452618777751922, + "step": 130000 + }, + { + "epoch": 18.4528034066714, + "eval_accuracy": 0.9861384879506581, + "eval_loss": 0.04782792553305626, + "eval_runtime": 31.6384, + "eval_samples_per_second": 497.086, + "eval_steps_per_second": 15.551, + "step": 130000 + }, + { + "epoch": 18.454222853087295, + "grad_norm": 8.363743782043457, + "learning_rate": 8.155372604684174e-05, + "loss": 0.04421942234039307, + "step": 130010 + }, + { + "epoch": 18.455642299503193, + "grad_norm": 0.00922936387360096, + "learning_rate": 8.155230660042584e-05, + "loss": 0.005544114112854004, + "step": 130020 + }, + { + "epoch": 18.45706174591909, + "grad_norm": 0.04554932191967964, + "learning_rate": 8.155088715400994e-05, + "loss": 0.04884060621261597, + "step": 130030 + }, + { + "epoch": 18.45848119233499, + "grad_norm": 0.05452294275164604, + "learning_rate": 8.154946770759404e-05, + "loss": 0.01580309122800827, + "step": 130040 + }, + { + "epoch": 18.459900638750888, + "grad_norm": 0.23617449402809143, + "learning_rate": 8.154804826117815e-05, + "loss": 0.02258615642786026, + "step": 130050 + }, + { + "epoch": 18.461320085166786, + "grad_norm": 0.11298935860395432, + "learning_rate": 8.154662881476224e-05, + "loss": 0.014024610817432403, + "step": 130060 + }, + { + "epoch": 18.462739531582685, + "grad_norm": 0.07385504245758057, + "learning_rate": 8.154520936834636e-05, + "loss": 0.004758263379335404, + "step": 130070 + }, + { + "epoch": 18.46415897799858, + "grad_norm": 0.18449749052524567, 
+ "learning_rate": 8.154378992193045e-05, + "loss": 0.013836902379989625, + "step": 130080 + }, + { + "epoch": 18.465578424414478, + "grad_norm": 0.08751697838306427, + "learning_rate": 8.154237047551455e-05, + "loss": 0.04114283621311188, + "step": 130090 + }, + { + "epoch": 18.466997870830376, + "grad_norm": 0.015224000439047813, + "learning_rate": 8.154095102909866e-05, + "loss": 0.01883476823568344, + "step": 130100 + }, + { + "epoch": 18.468417317246274, + "grad_norm": 0.01533492747694254, + "learning_rate": 8.153953158268276e-05, + "loss": 0.012298651039600372, + "step": 130110 + }, + { + "epoch": 18.469836763662173, + "grad_norm": 1.5610331296920776, + "learning_rate": 8.153811213626687e-05, + "loss": 0.00967889130115509, + "step": 130120 + }, + { + "epoch": 18.47125621007807, + "grad_norm": 7.301101207733154, + "learning_rate": 8.153669268985095e-05, + "loss": 0.057591044902801515, + "step": 130130 + }, + { + "epoch": 18.47267565649397, + "grad_norm": 6.648977279663086, + "learning_rate": 8.153527324343506e-05, + "loss": 0.023354032635688783, + "step": 130140 + }, + { + "epoch": 18.474095102909864, + "grad_norm": 0.012980490922927856, + "learning_rate": 8.153385379701916e-05, + "loss": 0.02406987249851227, + "step": 130150 + }, + { + "epoch": 18.475514549325762, + "grad_norm": 0.22776415944099426, + "learning_rate": 8.153243435060327e-05, + "loss": 0.007413412630558014, + "step": 130160 + }, + { + "epoch": 18.47693399574166, + "grad_norm": 1.5166367292404175, + "learning_rate": 8.153101490418737e-05, + "loss": 0.06717569828033447, + "step": 130170 + }, + { + "epoch": 18.47835344215756, + "grad_norm": 2.784849166870117, + "learning_rate": 8.152959545777147e-05, + "loss": 0.031895536184310916, + "step": 130180 + }, + { + "epoch": 18.479772888573457, + "grad_norm": 0.029781892895698547, + "learning_rate": 8.152817601135558e-05, + "loss": 0.04838870763778687, + "step": 130190 + }, + { + "epoch": 18.481192334989355, + "grad_norm": 6.059544086456299, + 
"learning_rate": 8.152675656493968e-05, + "loss": 0.12346867322921753, + "step": 130200 + }, + { + "epoch": 18.482611781405254, + "grad_norm": 2.957576274871826, + "learning_rate": 8.152533711852379e-05, + "loss": 0.02838844358921051, + "step": 130210 + }, + { + "epoch": 18.48403122782115, + "grad_norm": 0.07354004681110382, + "learning_rate": 8.152391767210788e-05, + "loss": 0.06742051839828492, + "step": 130220 + }, + { + "epoch": 18.485450674237047, + "grad_norm": 0.6618748307228088, + "learning_rate": 8.152249822569198e-05, + "loss": 0.02543102204799652, + "step": 130230 + }, + { + "epoch": 18.486870120652945, + "grad_norm": 1.7090263366699219, + "learning_rate": 8.152107877927608e-05, + "loss": 0.02140275239944458, + "step": 130240 + }, + { + "epoch": 18.488289567068843, + "grad_norm": 1.1480499505996704, + "learning_rate": 8.151965933286019e-05, + "loss": 0.031545788049697876, + "step": 130250 + }, + { + "epoch": 18.48970901348474, + "grad_norm": 4.303988933563232, + "learning_rate": 8.151823988644429e-05, + "loss": 0.004767316952347755, + "step": 130260 + }, + { + "epoch": 18.49112845990064, + "grad_norm": 0.045829661190509796, + "learning_rate": 8.15168204400284e-05, + "loss": 0.015709532797336577, + "step": 130270 + }, + { + "epoch": 18.49254790631654, + "grad_norm": 0.04191277548670769, + "learning_rate": 8.15154009936125e-05, + "loss": 0.003898696228861809, + "step": 130280 + }, + { + "epoch": 18.493967352732433, + "grad_norm": 0.24794617295265198, + "learning_rate": 8.151398154719659e-05, + "loss": 0.005687400698661804, + "step": 130290 + }, + { + "epoch": 18.49538679914833, + "grad_norm": 9.907116889953613, + "learning_rate": 8.15125621007807e-05, + "loss": 0.028389474749565123, + "step": 130300 + }, + { + "epoch": 18.49680624556423, + "grad_norm": 0.5462149381637573, + "learning_rate": 8.15111426543648e-05, + "loss": 0.0044613339006900786, + "step": 130310 + }, + { + "epoch": 18.498225691980128, + "grad_norm": 0.2734536826610565, + "learning_rate": 
8.150972320794891e-05, + "loss": 0.003214791417121887, + "step": 130320 + }, + { + "epoch": 18.499645138396026, + "grad_norm": 0.014535377733409405, + "learning_rate": 8.150830376153301e-05, + "loss": 0.031720873713493344, + "step": 130330 + }, + { + "epoch": 18.501064584811925, + "grad_norm": 0.18706893920898438, + "learning_rate": 8.150688431511711e-05, + "loss": 0.009772472828626633, + "step": 130340 + }, + { + "epoch": 18.502484031227823, + "grad_norm": 3.6424851417541504, + "learning_rate": 8.15054648687012e-05, + "loss": 0.04417179822921753, + "step": 130350 + }, + { + "epoch": 18.503903477643718, + "grad_norm": 0.017430992797017097, + "learning_rate": 8.150404542228532e-05, + "loss": 0.015689238905906677, + "step": 130360 + }, + { + "epoch": 18.505322924059616, + "grad_norm": 0.010013514198362827, + "learning_rate": 8.150262597586941e-05, + "loss": 0.014602565765380859, + "step": 130370 + }, + { + "epoch": 18.506742370475514, + "grad_norm": 0.896862268447876, + "learning_rate": 8.150120652945352e-05, + "loss": 0.0020655494183301924, + "step": 130380 + }, + { + "epoch": 18.508161816891413, + "grad_norm": 3.901364326477051, + "learning_rate": 8.149978708303762e-05, + "loss": 0.013213767111301422, + "step": 130390 + }, + { + "epoch": 18.50958126330731, + "grad_norm": 2.040121078491211, + "learning_rate": 8.149836763662172e-05, + "loss": 0.018503423035144805, + "step": 130400 + }, + { + "epoch": 18.51100070972321, + "grad_norm": 0.017969397827982903, + "learning_rate": 8.149694819020583e-05, + "loss": 0.009773757308721542, + "step": 130410 + }, + { + "epoch": 18.512420156139108, + "grad_norm": 0.004456004127860069, + "learning_rate": 8.149552874378993e-05, + "loss": 0.00311664380133152, + "step": 130420 + }, + { + "epoch": 18.513839602555002, + "grad_norm": 2.9926350116729736, + "learning_rate": 8.149410929737404e-05, + "loss": 0.02173994779586792, + "step": 130430 + }, + { + "epoch": 18.5152590489709, + "grad_norm": 0.16839352250099182, + "learning_rate": 
8.149268985095812e-05, + "loss": 0.016191045939922332, + "step": 130440 + }, + { + "epoch": 18.5166784953868, + "grad_norm": 0.08863834291696548, + "learning_rate": 8.149127040454223e-05, + "loss": 0.022671495378017426, + "step": 130450 + }, + { + "epoch": 18.518097941802697, + "grad_norm": 1.1707170009613037, + "learning_rate": 8.148985095812633e-05, + "loss": 0.004408159479498863, + "step": 130460 + }, + { + "epoch": 18.519517388218595, + "grad_norm": 0.09221091866493225, + "learning_rate": 8.148843151171044e-05, + "loss": 0.01547957956790924, + "step": 130470 + }, + { + "epoch": 18.520936834634494, + "grad_norm": 0.13105377554893494, + "learning_rate": 8.148701206529454e-05, + "loss": 0.018665242195129394, + "step": 130480 + }, + { + "epoch": 18.522356281050392, + "grad_norm": 1.4174994230270386, + "learning_rate": 8.148559261887864e-05, + "loss": 0.027663955092430116, + "step": 130490 + }, + { + "epoch": 18.523775727466287, + "grad_norm": 8.597201347351074, + "learning_rate": 8.148417317246275e-05, + "loss": 0.0659090518951416, + "step": 130500 + }, + { + "epoch": 18.523775727466287, + "eval_accuracy": 0.9857569784447129, + "eval_loss": 0.05306238681077957, + "eval_runtime": 32.6311, + "eval_samples_per_second": 481.963, + "eval_steps_per_second": 15.078, + "step": 130500 + }, + { + "epoch": 18.525195173882185, + "grad_norm": 3.4279048442840576, + "learning_rate": 8.148275372604684e-05, + "loss": 0.030299320816993713, + "step": 130510 + }, + { + "epoch": 18.526614620298083, + "grad_norm": 2.21703839302063, + "learning_rate": 8.148133427963095e-05, + "loss": 0.015195395052433013, + "step": 130520 + }, + { + "epoch": 18.528034066713982, + "grad_norm": 0.005461210384964943, + "learning_rate": 8.147991483321505e-05, + "loss": 0.006234246119856834, + "step": 130530 + }, + { + "epoch": 18.52945351312988, + "grad_norm": 6.675775051116943, + "learning_rate": 8.147849538679915e-05, + "loss": 0.023515474796295167, + "step": 130540 + }, + { + "epoch": 18.53087295954578, + 
"grad_norm": 2.0605430603027344, + "learning_rate": 8.147707594038325e-05, + "loss": 0.028863516449928284, + "step": 130550 + }, + { + "epoch": 18.532292405961677, + "grad_norm": 0.08040369302034378, + "learning_rate": 8.147565649396736e-05, + "loss": 0.015750017762184144, + "step": 130560 + }, + { + "epoch": 18.53371185237757, + "grad_norm": 10.392151832580566, + "learning_rate": 8.147423704755146e-05, + "loss": 0.051103293895721436, + "step": 130570 + }, + { + "epoch": 18.53513129879347, + "grad_norm": 0.018073299899697304, + "learning_rate": 8.147281760113557e-05, + "loss": 0.01106690764427185, + "step": 130580 + }, + { + "epoch": 18.536550745209368, + "grad_norm": 7.361319065093994, + "learning_rate": 8.147139815471966e-05, + "loss": 0.016362231969833375, + "step": 130590 + }, + { + "epoch": 18.537970191625266, + "grad_norm": 0.07301932573318481, + "learning_rate": 8.146997870830376e-05, + "loss": 0.03403286039829254, + "step": 130600 + }, + { + "epoch": 18.539389638041165, + "grad_norm": 2.030935764312744, + "learning_rate": 8.146855926188787e-05, + "loss": 0.04147332310676575, + "step": 130610 + }, + { + "epoch": 18.540809084457063, + "grad_norm": 0.02401145175099373, + "learning_rate": 8.146713981547197e-05, + "loss": 0.060898661613464355, + "step": 130620 + }, + { + "epoch": 18.54222853087296, + "grad_norm": 0.40541765093803406, + "learning_rate": 8.146572036905608e-05, + "loss": 0.0188491553068161, + "step": 130630 + }, + { + "epoch": 18.543647977288856, + "grad_norm": 0.05343012139201164, + "learning_rate": 8.146430092264016e-05, + "loss": 0.015254667401313782, + "step": 130640 + }, + { + "epoch": 18.545067423704754, + "grad_norm": 2.4271240234375, + "learning_rate": 8.146288147622427e-05, + "loss": 0.004261807724833489, + "step": 130650 + }, + { + "epoch": 18.546486870120653, + "grad_norm": 0.08628285676240921, + "learning_rate": 8.146146202980837e-05, + "loss": 0.0023907829076051713, + "step": 130660 + }, + { + "epoch": 18.54790631653655, + "grad_norm": 
0.017023751512169838, + "learning_rate": 8.146004258339248e-05, + "loss": 0.026684251427650452, + "step": 130670 + }, + { + "epoch": 18.54932576295245, + "grad_norm": 0.0902213305234909, + "learning_rate": 8.145862313697658e-05, + "loss": 0.01770784556865692, + "step": 130680 + }, + { + "epoch": 18.550745209368348, + "grad_norm": 0.03622569516301155, + "learning_rate": 8.145720369056069e-05, + "loss": 0.01470385491847992, + "step": 130690 + }, + { + "epoch": 18.552164655784246, + "grad_norm": 0.4535423517227173, + "learning_rate": 8.145578424414479e-05, + "loss": 0.007574498653411865, + "step": 130700 + }, + { + "epoch": 18.55358410220014, + "grad_norm": 0.06368441134691238, + "learning_rate": 8.145436479772889e-05, + "loss": 0.05138496160507202, + "step": 130710 + }, + { + "epoch": 18.55500354861604, + "grad_norm": 0.03241115063428879, + "learning_rate": 8.1452945351313e-05, + "loss": 0.04685293734073639, + "step": 130720 + }, + { + "epoch": 18.556422995031937, + "grad_norm": 2.8476345539093018, + "learning_rate": 8.14515259048971e-05, + "loss": 0.024661506712436675, + "step": 130730 + }, + { + "epoch": 18.557842441447836, + "grad_norm": 1.1679760217666626, + "learning_rate": 8.14501064584812e-05, + "loss": 0.006464455276727676, + "step": 130740 + }, + { + "epoch": 18.559261887863734, + "grad_norm": 0.24937348067760468, + "learning_rate": 8.144868701206529e-05, + "loss": 0.026166808605194092, + "step": 130750 + }, + { + "epoch": 18.560681334279632, + "grad_norm": 2.66959285736084, + "learning_rate": 8.14472675656494e-05, + "loss": 0.0420440137386322, + "step": 130760 + }, + { + "epoch": 18.56210078069553, + "grad_norm": 13.330845832824707, + "learning_rate": 8.14458481192335e-05, + "loss": 0.08298577070236206, + "step": 130770 + }, + { + "epoch": 18.563520227111425, + "grad_norm": 0.13030719757080078, + "learning_rate": 8.144442867281761e-05, + "loss": 0.0045199781656265255, + "step": 130780 + }, + { + "epoch": 18.564939673527324, + "grad_norm": 
0.3060011863708496, + "learning_rate": 8.14430092264017e-05, + "loss": 0.022911277413368226, + "step": 130790 + }, + { + "epoch": 18.566359119943222, + "grad_norm": 7.349360466003418, + "learning_rate": 8.14415897799858e-05, + "loss": 0.05471324324607849, + "step": 130800 + }, + { + "epoch": 18.56777856635912, + "grad_norm": 4.4315714836120605, + "learning_rate": 8.144017033356991e-05, + "loss": 0.013483393192291259, + "step": 130810 + }, + { + "epoch": 18.56919801277502, + "grad_norm": 1.7409733533859253, + "learning_rate": 8.143875088715401e-05, + "loss": 0.01805341839790344, + "step": 130820 + }, + { + "epoch": 18.570617459190917, + "grad_norm": 0.010257486253976822, + "learning_rate": 8.143733144073812e-05, + "loss": 0.016329763829708098, + "step": 130830 + }, + { + "epoch": 18.572036905606815, + "grad_norm": 11.223624229431152, + "learning_rate": 8.143591199432222e-05, + "loss": 0.07738075852394104, + "step": 130840 + }, + { + "epoch": 18.57345635202271, + "grad_norm": 0.13674846291542053, + "learning_rate": 8.143449254790632e-05, + "loss": 0.01924886107444763, + "step": 130850 + }, + { + "epoch": 18.574875798438608, + "grad_norm": 0.4946594834327698, + "learning_rate": 8.143307310149041e-05, + "loss": 0.029023009538650512, + "step": 130860 + }, + { + "epoch": 18.576295244854506, + "grad_norm": 0.5943418145179749, + "learning_rate": 8.143165365507453e-05, + "loss": 0.02415831983089447, + "step": 130870 + }, + { + "epoch": 18.577714691270405, + "grad_norm": 0.08107198774814606, + "learning_rate": 8.143023420865862e-05, + "loss": 0.003249601274728775, + "step": 130880 + }, + { + "epoch": 18.579134137686303, + "grad_norm": 0.24566154181957245, + "learning_rate": 8.142881476224273e-05, + "loss": 0.0384760707616806, + "step": 130890 + }, + { + "epoch": 18.5805535841022, + "grad_norm": 4.109685897827148, + "learning_rate": 8.142739531582683e-05, + "loss": 0.006746883690357208, + "step": 130900 + }, + { + "epoch": 18.5819730305181, + "grad_norm": 0.12463272362947464, 
+ "learning_rate": 8.142597586941093e-05, + "loss": 0.021722891926765443, + "step": 130910 + }, + { + "epoch": 18.583392476933994, + "grad_norm": 0.04137643799185753, + "learning_rate": 8.142455642299504e-05, + "loss": 0.0485242635011673, + "step": 130920 + }, + { + "epoch": 18.584811923349893, + "grad_norm": 0.5120498538017273, + "learning_rate": 8.142313697657914e-05, + "loss": 0.01771296113729477, + "step": 130930 + }, + { + "epoch": 18.58623136976579, + "grad_norm": 0.3289685845375061, + "learning_rate": 8.142171753016325e-05, + "loss": 0.010862819850444794, + "step": 130940 + }, + { + "epoch": 18.58765081618169, + "grad_norm": 0.8856240510940552, + "learning_rate": 8.142029808374733e-05, + "loss": 0.01859651505947113, + "step": 130950 + }, + { + "epoch": 18.589070262597588, + "grad_norm": 11.222086906433105, + "learning_rate": 8.141887863733144e-05, + "loss": 0.038225024938583374, + "step": 130960 + }, + { + "epoch": 18.590489709013486, + "grad_norm": 9.186075210571289, + "learning_rate": 8.141745919091554e-05, + "loss": 0.027800050377845765, + "step": 130970 + }, + { + "epoch": 18.591909155429384, + "grad_norm": 2.4053432941436768, + "learning_rate": 8.141603974449965e-05, + "loss": 0.009478311240673064, + "step": 130980 + }, + { + "epoch": 18.59332860184528, + "grad_norm": 0.3215068578720093, + "learning_rate": 8.141462029808375e-05, + "loss": 0.050688672065734866, + "step": 130990 + }, + { + "epoch": 18.594748048261177, + "grad_norm": 1.0421249866485596, + "learning_rate": 8.141320085166785e-05, + "loss": 0.013289576768875122, + "step": 131000 + }, + { + "epoch": 18.594748048261177, + "eval_accuracy": 0.9817511286322884, + "eval_loss": 0.07066646963357925, + "eval_runtime": 33.9117, + "eval_samples_per_second": 463.763, + "eval_steps_per_second": 14.508, + "step": 131000 + }, + { + "epoch": 18.596167494677076, + "grad_norm": 0.13267114758491516, + "learning_rate": 8.141178140525196e-05, + "loss": 0.009116743505001069, + "step": 131010 + }, + { + "epoch": 
18.597586941092974, + "grad_norm": 9.238930702209473, + "learning_rate": 8.141036195883605e-05, + "loss": 0.03145955801010132, + "step": 131020 + }, + { + "epoch": 18.599006387508872, + "grad_norm": 0.05779772996902466, + "learning_rate": 8.140894251242016e-05, + "loss": 0.011149019002914429, + "step": 131030 + }, + { + "epoch": 18.60042583392477, + "grad_norm": 0.08412010222673416, + "learning_rate": 8.140752306600426e-05, + "loss": 0.04695343375205994, + "step": 131040 + }, + { + "epoch": 18.60184528034067, + "grad_norm": 1.983355164527893, + "learning_rate": 8.140610361958837e-05, + "loss": 0.0149237260222435, + "step": 131050 + }, + { + "epoch": 18.603264726756564, + "grad_norm": 9.849051475524902, + "learning_rate": 8.140468417317246e-05, + "loss": 0.06015897989273071, + "step": 131060 + }, + { + "epoch": 18.604684173172462, + "grad_norm": 3.288647413253784, + "learning_rate": 8.140326472675657e-05, + "loss": 0.020872029662132262, + "step": 131070 + }, + { + "epoch": 18.60610361958836, + "grad_norm": 0.6425996422767639, + "learning_rate": 8.140184528034067e-05, + "loss": 0.02088252305984497, + "step": 131080 + }, + { + "epoch": 18.60752306600426, + "grad_norm": 1.5062687397003174, + "learning_rate": 8.140042583392478e-05, + "loss": 0.02488507628440857, + "step": 131090 + }, + { + "epoch": 18.608942512420157, + "grad_norm": 12.218483924865723, + "learning_rate": 8.139900638750889e-05, + "loss": 0.04160102307796478, + "step": 131100 + }, + { + "epoch": 18.610361958836055, + "grad_norm": 3.886253595352173, + "learning_rate": 8.139758694109297e-05, + "loss": 0.012016545236110687, + "step": 131110 + }, + { + "epoch": 18.611781405251953, + "grad_norm": 6.686439037322998, + "learning_rate": 8.139616749467708e-05, + "loss": 0.012572765350341797, + "step": 131120 + }, + { + "epoch": 18.613200851667848, + "grad_norm": 0.027732079848647118, + "learning_rate": 8.139474804826118e-05, + "loss": 0.0567524254322052, + "step": 131130 + }, + { + "epoch": 18.614620298083747, + 
"grad_norm": 0.08063772320747375, + "learning_rate": 8.139332860184529e-05, + "loss": 0.0034560371190309525, + "step": 131140 + }, + { + "epoch": 18.616039744499645, + "grad_norm": 0.16642643511295319, + "learning_rate": 8.139190915542939e-05, + "loss": 0.024695418775081635, + "step": 131150 + }, + { + "epoch": 18.617459190915543, + "grad_norm": 1.7551262378692627, + "learning_rate": 8.139048970901348e-05, + "loss": 0.026375973224639894, + "step": 131160 + }, + { + "epoch": 18.61887863733144, + "grad_norm": 7.997278213500977, + "learning_rate": 8.138907026259758e-05, + "loss": 0.01700562834739685, + "step": 131170 + }, + { + "epoch": 18.62029808374734, + "grad_norm": 1.091167688369751, + "learning_rate": 8.138765081618169e-05, + "loss": 0.009547965228557586, + "step": 131180 + }, + { + "epoch": 18.621717530163238, + "grad_norm": 11.500609397888184, + "learning_rate": 8.13862313697658e-05, + "loss": 0.07945277094841004, + "step": 131190 + }, + { + "epoch": 18.623136976579133, + "grad_norm": 6.0473432540893555, + "learning_rate": 8.13848119233499e-05, + "loss": 0.014636990427970887, + "step": 131200 + }, + { + "epoch": 18.62455642299503, + "grad_norm": 0.04876257851719856, + "learning_rate": 8.1383392476934e-05, + "loss": 0.039440539479255673, + "step": 131210 + }, + { + "epoch": 18.62597586941093, + "grad_norm": 1.9998284578323364, + "learning_rate": 8.13819730305181e-05, + "loss": 0.0699110209941864, + "step": 131220 + }, + { + "epoch": 18.627395315826828, + "grad_norm": 0.19066566228866577, + "learning_rate": 8.138055358410221e-05, + "loss": 0.017888715863227843, + "step": 131230 + }, + { + "epoch": 18.628814762242726, + "grad_norm": 0.1404818296432495, + "learning_rate": 8.13791341376863e-05, + "loss": 0.02370113581418991, + "step": 131240 + }, + { + "epoch": 18.630234208658624, + "grad_norm": 4.469192028045654, + "learning_rate": 8.137771469127042e-05, + "loss": 0.012822465598583221, + "step": 131250 + }, + { + "epoch": 18.631653655074523, + "grad_norm": 
4.36820650100708, + "learning_rate": 8.13762952448545e-05, + "loss": 0.01055520549416542, + "step": 131260 + }, + { + "epoch": 18.633073101490417, + "grad_norm": 0.11531079560518265, + "learning_rate": 8.137487579843861e-05, + "loss": 0.005679406225681305, + "step": 131270 + }, + { + "epoch": 18.634492547906316, + "grad_norm": 0.1494099348783493, + "learning_rate": 8.137345635202272e-05, + "loss": 0.011889305710792542, + "step": 131280 + }, + { + "epoch": 18.635911994322214, + "grad_norm": 1.323119044303894, + "learning_rate": 8.137203690560682e-05, + "loss": 0.027772819995880126, + "step": 131290 + }, + { + "epoch": 18.637331440738112, + "grad_norm": 0.19452044367790222, + "learning_rate": 8.137061745919093e-05, + "loss": 0.022980180382728577, + "step": 131300 + }, + { + "epoch": 18.63875088715401, + "grad_norm": 0.009213218465447426, + "learning_rate": 8.136919801277501e-05, + "loss": 0.011797212064266205, + "step": 131310 + }, + { + "epoch": 18.64017033356991, + "grad_norm": 12.117408752441406, + "learning_rate": 8.136777856635912e-05, + "loss": 0.01006176769733429, + "step": 131320 + }, + { + "epoch": 18.641589779985807, + "grad_norm": 0.7869730591773987, + "learning_rate": 8.136635911994322e-05, + "loss": 0.014416629076004028, + "step": 131330 + }, + { + "epoch": 18.643009226401702, + "grad_norm": 0.23337319493293762, + "learning_rate": 8.136493967352733e-05, + "loss": 0.008844228088855743, + "step": 131340 + }, + { + "epoch": 18.6444286728176, + "grad_norm": 1.930612564086914, + "learning_rate": 8.136352022711143e-05, + "loss": 0.014108307659626007, + "step": 131350 + }, + { + "epoch": 18.6458481192335, + "grad_norm": 9.3480224609375, + "learning_rate": 8.136210078069553e-05, + "loss": 0.04676141738891602, + "step": 131360 + }, + { + "epoch": 18.647267565649397, + "grad_norm": 12.945405960083008, + "learning_rate": 8.136068133427964e-05, + "loss": 0.04466235339641571, + "step": 131370 + }, + { + "epoch": 18.648687012065295, + "grad_norm": 3.8032307624816895, 
+ "learning_rate": 8.135926188786374e-05, + "loss": 0.013353703916072846, + "step": 131380 + }, + { + "epoch": 18.650106458481194, + "grad_norm": 0.9862574338912964, + "learning_rate": 8.135784244144785e-05, + "loss": 0.04951807558536529, + "step": 131390 + }, + { + "epoch": 18.651525904897092, + "grad_norm": 6.689640522003174, + "learning_rate": 8.135642299503194e-05, + "loss": 0.019863298535346983, + "step": 131400 + }, + { + "epoch": 18.652945351312987, + "grad_norm": 0.01660931296646595, + "learning_rate": 8.135500354861605e-05, + "loss": 0.005930447950959206, + "step": 131410 + }, + { + "epoch": 18.654364797728885, + "grad_norm": 0.058415528386831284, + "learning_rate": 8.135358410220014e-05, + "loss": 0.06661216616630554, + "step": 131420 + }, + { + "epoch": 18.655784244144783, + "grad_norm": 18.33035659790039, + "learning_rate": 8.135216465578425e-05, + "loss": 0.03377198576927185, + "step": 131430 + }, + { + "epoch": 18.65720369056068, + "grad_norm": 1.1209580898284912, + "learning_rate": 8.135074520936835e-05, + "loss": 0.019080647826194765, + "step": 131440 + }, + { + "epoch": 18.65862313697658, + "grad_norm": 0.12373685091733932, + "learning_rate": 8.134932576295246e-05, + "loss": 0.02793894410133362, + "step": 131450 + }, + { + "epoch": 18.660042583392478, + "grad_norm": 1.2706899642944336, + "learning_rate": 8.134790631653656e-05, + "loss": 0.0763306200504303, + "step": 131460 + }, + { + "epoch": 18.661462029808376, + "grad_norm": 0.7054558992385864, + "learning_rate": 8.134648687012065e-05, + "loss": 0.009234672784805298, + "step": 131470 + }, + { + "epoch": 18.66288147622427, + "grad_norm": 1.8768150806427002, + "learning_rate": 8.134506742370476e-05, + "loss": 0.04116539061069489, + "step": 131480 + }, + { + "epoch": 18.66430092264017, + "grad_norm": 2.3796298503875732, + "learning_rate": 8.134364797728886e-05, + "loss": 0.004192651808261871, + "step": 131490 + }, + { + "epoch": 18.665720369056068, + "grad_norm": 0.31088367104530334, + 
"learning_rate": 8.134222853087297e-05, + "loss": 0.020734292268753052, + "step": 131500 + }, + { + "epoch": 18.665720369056068, + "eval_accuracy": 0.9800979207731926, + "eval_loss": 0.07913683354854584, + "eval_runtime": 32.8344, + "eval_samples_per_second": 478.98, + "eval_steps_per_second": 14.984, + "step": 131500 + }, + { + "epoch": 18.667139815471966, + "grad_norm": 0.008138231933116913, + "learning_rate": 8.134080908445707e-05, + "loss": 0.02487410753965378, + "step": 131510 + }, + { + "epoch": 18.668559261887864, + "grad_norm": 0.1465875208377838, + "learning_rate": 8.133938963804117e-05, + "loss": 0.009092245995998383, + "step": 131520 + }, + { + "epoch": 18.669978708303763, + "grad_norm": 7.406838893890381, + "learning_rate": 8.133797019162526e-05, + "loss": 0.01487957090139389, + "step": 131530 + }, + { + "epoch": 18.67139815471966, + "grad_norm": 3.811093807220459, + "learning_rate": 8.133655074520937e-05, + "loss": 0.05009844303131104, + "step": 131540 + }, + { + "epoch": 18.672817601135556, + "grad_norm": 6.526002407073975, + "learning_rate": 8.133513129879347e-05, + "loss": 0.053089505434036253, + "step": 131550 + }, + { + "epoch": 18.674237047551454, + "grad_norm": 11.075752258300781, + "learning_rate": 8.133371185237758e-05, + "loss": 0.06426833271980285, + "step": 131560 + }, + { + "epoch": 18.675656493967352, + "grad_norm": 2.904355764389038, + "learning_rate": 8.133229240596168e-05, + "loss": 0.057389688491821286, + "step": 131570 + }, + { + "epoch": 18.67707594038325, + "grad_norm": 0.01689119264483452, + "learning_rate": 8.133087295954578e-05, + "loss": 0.01457415074110031, + "step": 131580 + }, + { + "epoch": 18.67849538679915, + "grad_norm": 2.3161025047302246, + "learning_rate": 8.132945351312989e-05, + "loss": 0.022415342926979064, + "step": 131590 + }, + { + "epoch": 18.679914833215047, + "grad_norm": 2.4645822048187256, + "learning_rate": 8.132803406671399e-05, + "loss": 0.011522973328828812, + "step": 131600 + }, + { + "epoch": 
18.681334279630946, + "grad_norm": 0.06506139785051346, + "learning_rate": 8.13266146202981e-05, + "loss": 0.04800006151199341, + "step": 131610 + }, + { + "epoch": 18.68275372604684, + "grad_norm": 0.1301409751176834, + "learning_rate": 8.132519517388218e-05, + "loss": 0.021942104399204253, + "step": 131620 + }, + { + "epoch": 18.68417317246274, + "grad_norm": 0.07169385999441147, + "learning_rate": 8.132377572746629e-05, + "loss": 0.00830610990524292, + "step": 131630 + }, + { + "epoch": 18.685592618878637, + "grad_norm": 3.7510461807250977, + "learning_rate": 8.132235628105039e-05, + "loss": 0.004536581039428711, + "step": 131640 + }, + { + "epoch": 18.687012065294535, + "grad_norm": 8.096920013427734, + "learning_rate": 8.13209368346345e-05, + "loss": 0.023867668211460115, + "step": 131650 + }, + { + "epoch": 18.688431511710434, + "grad_norm": 2.1872310638427734, + "learning_rate": 8.13195173882186e-05, + "loss": 0.008351977169513702, + "step": 131660 + }, + { + "epoch": 18.689850958126332, + "grad_norm": 11.362651824951172, + "learning_rate": 8.13180979418027e-05, + "loss": 0.01560388207435608, + "step": 131670 + }, + { + "epoch": 18.69127040454223, + "grad_norm": 0.3655797839164734, + "learning_rate": 8.13166784953868e-05, + "loss": 0.009426388144493102, + "step": 131680 + }, + { + "epoch": 18.692689850958125, + "grad_norm": 0.06773939728736877, + "learning_rate": 8.13152590489709e-05, + "loss": 0.008931878209114074, + "step": 131690 + }, + { + "epoch": 18.694109297374023, + "grad_norm": 0.47058501839637756, + "learning_rate": 8.131383960255501e-05, + "loss": 0.0036383919417858125, + "step": 131700 + }, + { + "epoch": 18.69552874378992, + "grad_norm": 0.07784921675920486, + "learning_rate": 8.131242015613911e-05, + "loss": 0.02547445297241211, + "step": 131710 + }, + { + "epoch": 18.69694819020582, + "grad_norm": 1.0401661396026611, + "learning_rate": 8.131100070972322e-05, + "loss": 0.008947336673736572, + "step": 131720 + }, + { + "epoch": 
18.698367636621718, + "grad_norm": 1.7095261812210083, + "learning_rate": 8.13095812633073e-05, + "loss": 0.012136232852935792, + "step": 131730 + }, + { + "epoch": 18.699787083037616, + "grad_norm": 3.366201639175415, + "learning_rate": 8.130816181689142e-05, + "loss": 0.005720870569348335, + "step": 131740 + }, + { + "epoch": 18.701206529453515, + "grad_norm": 0.0334438718855381, + "learning_rate": 8.130674237047551e-05, + "loss": 0.007759307324886322, + "step": 131750 + }, + { + "epoch": 18.70262597586941, + "grad_norm": 0.043541185557842255, + "learning_rate": 8.130532292405963e-05, + "loss": 0.01564426124095917, + "step": 131760 + }, + { + "epoch": 18.704045422285308, + "grad_norm": 0.08072488754987717, + "learning_rate": 8.130390347764372e-05, + "loss": 0.005764240026473999, + "step": 131770 + }, + { + "epoch": 18.705464868701206, + "grad_norm": 0.24349243938922882, + "learning_rate": 8.130248403122782e-05, + "loss": 0.015592911839485168, + "step": 131780 + }, + { + "epoch": 18.706884315117104, + "grad_norm": 0.29928338527679443, + "learning_rate": 8.130106458481193e-05, + "loss": 0.009814509004354478, + "step": 131790 + }, + { + "epoch": 18.708303761533003, + "grad_norm": 0.3477325737476349, + "learning_rate": 8.129964513839603e-05, + "loss": 0.005279907211661339, + "step": 131800 + }, + { + "epoch": 18.7097232079489, + "grad_norm": 7.982736110687256, + "learning_rate": 8.129822569198014e-05, + "loss": 0.04246063828468323, + "step": 131810 + }, + { + "epoch": 18.7111426543648, + "grad_norm": 0.4821631610393524, + "learning_rate": 8.129680624556424e-05, + "loss": 0.005226002261042595, + "step": 131820 + }, + { + "epoch": 18.712562100780694, + "grad_norm": 11.368393898010254, + "learning_rate": 8.129538679914833e-05, + "loss": 0.0491435170173645, + "step": 131830 + }, + { + "epoch": 18.713981547196592, + "grad_norm": 0.7549554705619812, + "learning_rate": 8.129396735273243e-05, + "loss": 0.0267733097076416, + "step": 131840 + }, + { + "epoch": 
18.71540099361249, + "grad_norm": 4.594950199127197, + "learning_rate": 8.129254790631654e-05, + "loss": 0.010378662496805191, + "step": 131850 + }, + { + "epoch": 18.71682044002839, + "grad_norm": 0.2322712242603302, + "learning_rate": 8.129112845990064e-05, + "loss": 0.013620153069496155, + "step": 131860 + }, + { + "epoch": 18.718239886444287, + "grad_norm": 4.297412395477295, + "learning_rate": 8.128970901348475e-05, + "loss": 0.016456304490566252, + "step": 131870 + }, + { + "epoch": 18.719659332860186, + "grad_norm": 0.2964388430118561, + "learning_rate": 8.128828956706885e-05, + "loss": 0.025118935108184814, + "step": 131880 + }, + { + "epoch": 18.721078779276084, + "grad_norm": 3.0489017963409424, + "learning_rate": 8.128687012065295e-05, + "loss": 0.029944658279418945, + "step": 131890 + }, + { + "epoch": 18.72249822569198, + "grad_norm": 0.9195518493652344, + "learning_rate": 8.128545067423706e-05, + "loss": 0.04168401956558228, + "step": 131900 + }, + { + "epoch": 18.723917672107877, + "grad_norm": 0.03696005046367645, + "learning_rate": 8.128403122782115e-05, + "loss": 0.01858309209346771, + "step": 131910 + }, + { + "epoch": 18.725337118523775, + "grad_norm": 0.17158129811286926, + "learning_rate": 8.128261178140526e-05, + "loss": 0.011418993771076202, + "step": 131920 + }, + { + "epoch": 18.726756564939674, + "grad_norm": 2.986250638961792, + "learning_rate": 8.128119233498935e-05, + "loss": 0.04073112905025482, + "step": 131930 + }, + { + "epoch": 18.728176011355572, + "grad_norm": 2.4232630729675293, + "learning_rate": 8.127977288857346e-05, + "loss": 0.00799041911959648, + "step": 131940 + }, + { + "epoch": 18.72959545777147, + "grad_norm": 2.225045919418335, + "learning_rate": 8.127835344215756e-05, + "loss": 0.010526900738477707, + "step": 131950 + }, + { + "epoch": 18.73101490418737, + "grad_norm": 2.6924684047698975, + "learning_rate": 8.127693399574167e-05, + "loss": 0.010417158901691436, + "step": 131960 + }, + { + "epoch": 
18.732434350603263, + "grad_norm": 0.05875149741768837, + "learning_rate": 8.127551454932577e-05, + "loss": 0.02316032499074936, + "step": 131970 + }, + { + "epoch": 18.73385379701916, + "grad_norm": 0.05400659516453743, + "learning_rate": 8.127409510290986e-05, + "loss": 0.07833380699157715, + "step": 131980 + }, + { + "epoch": 18.73527324343506, + "grad_norm": 0.07178810238838196, + "learning_rate": 8.127267565649397e-05, + "loss": 0.044096097350120544, + "step": 131990 + }, + { + "epoch": 18.73669268985096, + "grad_norm": 0.25669190287590027, + "learning_rate": 8.127125621007807e-05, + "loss": 0.030655372142791747, + "step": 132000 + }, + { + "epoch": 18.73669268985096, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.055439144372940063, + "eval_runtime": 32.4017, + "eval_samples_per_second": 485.376, + "eval_steps_per_second": 15.184, + "step": 132000 + }, + { + "epoch": 18.738112136266857, + "grad_norm": 0.1243971586227417, + "learning_rate": 8.126983676366218e-05, + "loss": 0.003086409717798233, + "step": 132010 + }, + { + "epoch": 18.739531582682755, + "grad_norm": 0.016958186402916908, + "learning_rate": 8.126841731724628e-05, + "loss": 0.007639577984809876, + "step": 132020 + }, + { + "epoch": 18.740951029098653, + "grad_norm": 7.979750633239746, + "learning_rate": 8.126699787083038e-05, + "loss": 0.02907142639160156, + "step": 132030 + }, + { + "epoch": 18.742370475514548, + "grad_norm": 2.4184060096740723, + "learning_rate": 8.126557842441447e-05, + "loss": 0.011970283091068267, + "step": 132040 + }, + { + "epoch": 18.743789921930446, + "grad_norm": 11.4110107421875, + "learning_rate": 8.126415897799858e-05, + "loss": 0.022178635001182556, + "step": 132050 + }, + { + "epoch": 18.745209368346345, + "grad_norm": 10.188706398010254, + "learning_rate": 8.126273953158268e-05, + "loss": 0.03458372950553894, + "step": 132060 + }, + { + "epoch": 18.746628814762243, + "grad_norm": 0.019085505977272987, + "learning_rate": 8.12613200851668e-05, + "loss": 
0.004047043249011039, + "step": 132070 + }, + { + "epoch": 18.74804826117814, + "grad_norm": 2.709345817565918, + "learning_rate": 8.125990063875089e-05, + "loss": 0.025256985425949098, + "step": 132080 + }, + { + "epoch": 18.74946770759404, + "grad_norm": 14.07314682006836, + "learning_rate": 8.125848119233499e-05, + "loss": 0.02897200584411621, + "step": 132090 + }, + { + "epoch": 18.750887154009938, + "grad_norm": 0.06611883640289307, + "learning_rate": 8.12570617459191e-05, + "loss": 0.03179367482662201, + "step": 132100 + }, + { + "epoch": 18.752306600425833, + "grad_norm": 16.679271697998047, + "learning_rate": 8.12556422995032e-05, + "loss": 0.03063758313655853, + "step": 132110 + }, + { + "epoch": 18.75372604684173, + "grad_norm": 0.21415162086486816, + "learning_rate": 8.125422285308731e-05, + "loss": 0.0198274165391922, + "step": 132120 + }, + { + "epoch": 18.75514549325763, + "grad_norm": 1.131074070930481, + "learning_rate": 8.12528034066714e-05, + "loss": 0.03010278344154358, + "step": 132130 + }, + { + "epoch": 18.756564939673527, + "grad_norm": 0.1621093600988388, + "learning_rate": 8.12513839602555e-05, + "loss": 0.017974340915679933, + "step": 132140 + }, + { + "epoch": 18.757984386089426, + "grad_norm": 0.1724443882703781, + "learning_rate": 8.12499645138396e-05, + "loss": 0.016963428258895873, + "step": 132150 + }, + { + "epoch": 18.759403832505324, + "grad_norm": 0.005614873953163624, + "learning_rate": 8.124854506742371e-05, + "loss": 0.007243013381958008, + "step": 132160 + }, + { + "epoch": 18.760823278921222, + "grad_norm": 0.3981007933616638, + "learning_rate": 8.124712562100781e-05, + "loss": 0.04376653134822846, + "step": 132170 + }, + { + "epoch": 18.762242725337117, + "grad_norm": 5.097568035125732, + "learning_rate": 8.124570617459192e-05, + "loss": 0.010595297068357467, + "step": 132180 + }, + { + "epoch": 18.763662171753015, + "grad_norm": 3.115640163421631, + "learning_rate": 8.124428672817602e-05, + "loss": 0.03532519340515137, + 
"step": 132190 + }, + { + "epoch": 18.765081618168914, + "grad_norm": 0.12442761659622192, + "learning_rate": 8.124286728176011e-05, + "loss": 0.049129563570022586, + "step": 132200 + }, + { + "epoch": 18.766501064584812, + "grad_norm": 10.673492431640625, + "learning_rate": 8.124144783534422e-05, + "loss": 0.06168168783187866, + "step": 132210 + }, + { + "epoch": 18.76792051100071, + "grad_norm": 0.2644001841545105, + "learning_rate": 8.124002838892832e-05, + "loss": 0.007527868449687958, + "step": 132220 + }, + { + "epoch": 18.76933995741661, + "grad_norm": 0.3530026376247406, + "learning_rate": 8.123860894251243e-05, + "loss": 0.00555514357984066, + "step": 132230 + }, + { + "epoch": 18.770759403832507, + "grad_norm": 0.15322211384773254, + "learning_rate": 8.123718949609652e-05, + "loss": 0.043745231628417966, + "step": 132240 + }, + { + "epoch": 18.7721788502484, + "grad_norm": 0.2177436351776123, + "learning_rate": 8.123577004968063e-05, + "loss": 0.038435956835746764, + "step": 132250 + }, + { + "epoch": 18.7735982966643, + "grad_norm": 13.23100757598877, + "learning_rate": 8.123435060326472e-05, + "loss": 0.03151096999645233, + "step": 132260 + }, + { + "epoch": 18.7750177430802, + "grad_norm": 0.11837724596261978, + "learning_rate": 8.123293115684884e-05, + "loss": 0.009069995582103729, + "step": 132270 + }, + { + "epoch": 18.776437189496097, + "grad_norm": 0.14518579840660095, + "learning_rate": 8.123151171043293e-05, + "loss": 0.017826008796691894, + "step": 132280 + }, + { + "epoch": 18.777856635911995, + "grad_norm": 0.11879577487707138, + "learning_rate": 8.123009226401703e-05, + "loss": 0.016228602826595308, + "step": 132290 + }, + { + "epoch": 18.779276082327893, + "grad_norm": 2.983297348022461, + "learning_rate": 8.122867281760114e-05, + "loss": 0.009481226652860641, + "step": 132300 + }, + { + "epoch": 18.78069552874379, + "grad_norm": 1.8368933200836182, + "learning_rate": 8.122725337118524e-05, + "loss": 0.009812879562377929, + "step": 132310 + 
}, + { + "epoch": 18.782114975159686, + "grad_norm": 4.023184776306152, + "learning_rate": 8.122583392476935e-05, + "loss": 0.01623908579349518, + "step": 132320 + }, + { + "epoch": 18.783534421575585, + "grad_norm": 0.06996183842420578, + "learning_rate": 8.122441447835345e-05, + "loss": 0.002210826799273491, + "step": 132330 + }, + { + "epoch": 18.784953867991483, + "grad_norm": 0.025891892611980438, + "learning_rate": 8.122299503193754e-05, + "loss": 0.022765421867370607, + "step": 132340 + }, + { + "epoch": 18.78637331440738, + "grad_norm": 2.33005428314209, + "learning_rate": 8.122157558552164e-05, + "loss": 0.019331203401088716, + "step": 132350 + }, + { + "epoch": 18.78779276082328, + "grad_norm": 0.031298790127038956, + "learning_rate": 8.122015613910575e-05, + "loss": 0.01923527717590332, + "step": 132360 + }, + { + "epoch": 18.789212207239178, + "grad_norm": 0.45524969696998596, + "learning_rate": 8.121873669268985e-05, + "loss": 0.002389763668179512, + "step": 132370 + }, + { + "epoch": 18.790631653655076, + "grad_norm": 0.018605032935738564, + "learning_rate": 8.121731724627396e-05, + "loss": 0.013690409064292908, + "step": 132380 + }, + { + "epoch": 18.79205110007097, + "grad_norm": 0.103347048163414, + "learning_rate": 8.121589779985806e-05, + "loss": 0.04742929637432099, + "step": 132390 + }, + { + "epoch": 18.79347054648687, + "grad_norm": 0.85923832654953, + "learning_rate": 8.121447835344216e-05, + "loss": 0.004268684610724449, + "step": 132400 + }, + { + "epoch": 18.794889992902768, + "grad_norm": 0.4262538254261017, + "learning_rate": 8.121305890702627e-05, + "loss": 0.01819155514240265, + "step": 132410 + }, + { + "epoch": 18.796309439318666, + "grad_norm": 0.03946376219391823, + "learning_rate": 8.121163946061036e-05, + "loss": 0.033000385761260985, + "step": 132420 + }, + { + "epoch": 18.797728885734564, + "grad_norm": 0.0055903540924191475, + "learning_rate": 8.121022001419447e-05, + "loss": 0.01689928025007248, + "step": 132430 + }, + { + 
"epoch": 18.799148332150462, + "grad_norm": 1.8662238121032715, + "learning_rate": 8.120880056777857e-05, + "loss": 0.010890743136405945, + "step": 132440 + }, + { + "epoch": 18.80056777856636, + "grad_norm": 0.051471561193466187, + "learning_rate": 8.120738112136267e-05, + "loss": 0.02852175831794739, + "step": 132450 + }, + { + "epoch": 18.801987224982255, + "grad_norm": 0.11889596283435822, + "learning_rate": 8.120596167494677e-05, + "loss": 0.016614697873592377, + "step": 132460 + }, + { + "epoch": 18.803406671398154, + "grad_norm": 8.242818832397461, + "learning_rate": 8.120454222853088e-05, + "loss": 0.016646860539913176, + "step": 132470 + }, + { + "epoch": 18.804826117814052, + "grad_norm": 0.044861502945423126, + "learning_rate": 8.120312278211498e-05, + "loss": 0.013706690073013306, + "step": 132480 + }, + { + "epoch": 18.80624556422995, + "grad_norm": 0.5428833365440369, + "learning_rate": 8.120170333569909e-05, + "loss": 0.02067845016717911, + "step": 132490 + }, + { + "epoch": 18.80766501064585, + "grad_norm": 3.9887773990631104, + "learning_rate": 8.120028388928318e-05, + "loss": 0.007190878689289093, + "step": 132500 + }, + { + "epoch": 18.80766501064585, + "eval_accuracy": 0.9873466013861512, + "eval_loss": 0.04539346694946289, + "eval_runtime": 32.5262, + "eval_samples_per_second": 483.518, + "eval_steps_per_second": 15.126, + "step": 132500 + }, + { + "epoch": 18.809084457061747, + "grad_norm": 2.4611332416534424, + "learning_rate": 8.119886444286728e-05, + "loss": 0.008373191952705384, + "step": 132510 + }, + { + "epoch": 18.810503903477645, + "grad_norm": 1.2000856399536133, + "learning_rate": 8.119744499645139e-05, + "loss": 0.029330307245254518, + "step": 132520 + }, + { + "epoch": 18.81192334989354, + "grad_norm": 0.08291121572256088, + "learning_rate": 8.119602555003549e-05, + "loss": 0.014740046858787537, + "step": 132530 + }, + { + "epoch": 18.81334279630944, + "grad_norm": 7.6800079345703125, + "learning_rate": 8.11946061036196e-05, + 
"loss": 0.009070151299238206, + "step": 132540 + }, + { + "epoch": 18.814762242725337, + "grad_norm": 2.5658936500549316, + "learning_rate": 8.119318665720368e-05, + "loss": 0.018622465431690216, + "step": 132550 + }, + { + "epoch": 18.816181689141235, + "grad_norm": 0.3958927094936371, + "learning_rate": 8.11917672107878e-05, + "loss": 0.012245003879070283, + "step": 132560 + }, + { + "epoch": 18.817601135557133, + "grad_norm": 0.7030619382858276, + "learning_rate": 8.119034776437189e-05, + "loss": 0.010460810363292694, + "step": 132570 + }, + { + "epoch": 18.81902058197303, + "grad_norm": 0.0067835235968232155, + "learning_rate": 8.1188928317956e-05, + "loss": 0.010131156444549561, + "step": 132580 + }, + { + "epoch": 18.82044002838893, + "grad_norm": 0.11783251166343689, + "learning_rate": 8.118750887154011e-05, + "loss": 0.04217685461044311, + "step": 132590 + }, + { + "epoch": 18.821859474804825, + "grad_norm": 1.0168848037719727, + "learning_rate": 8.11860894251242e-05, + "loss": 0.014414319396018982, + "step": 132600 + }, + { + "epoch": 18.823278921220723, + "grad_norm": 3.850590229034424, + "learning_rate": 8.118466997870831e-05, + "loss": 0.016610829532146452, + "step": 132610 + }, + { + "epoch": 18.82469836763662, + "grad_norm": 4.185357570648193, + "learning_rate": 8.11832505322924e-05, + "loss": 0.005007806792855262, + "step": 132620 + }, + { + "epoch": 18.82611781405252, + "grad_norm": 0.17095859348773956, + "learning_rate": 8.118183108587652e-05, + "loss": 0.01426699459552765, + "step": 132630 + }, + { + "epoch": 18.827537260468418, + "grad_norm": 6.672366619110107, + "learning_rate": 8.118041163946061e-05, + "loss": 0.0266873300075531, + "step": 132640 + }, + { + "epoch": 18.828956706884316, + "grad_norm": 0.004397100303322077, + "learning_rate": 8.117899219304471e-05, + "loss": 0.047002002596855164, + "step": 132650 + }, + { + "epoch": 18.830376153300215, + "grad_norm": 0.038003381341695786, + "learning_rate": 8.117757274662881e-05, + "loss": 
0.022296585142612457, + "step": 132660 + }, + { + "epoch": 18.83179559971611, + "grad_norm": 0.055123794823884964, + "learning_rate": 8.117615330021292e-05, + "loss": 0.03208612203598023, + "step": 132670 + }, + { + "epoch": 18.833215046132008, + "grad_norm": 0.4400286376476288, + "learning_rate": 8.117473385379703e-05, + "loss": 0.006797407567501068, + "step": 132680 + }, + { + "epoch": 18.834634492547906, + "grad_norm": 6.971785068511963, + "learning_rate": 8.117331440738113e-05, + "loss": 0.029937750101089476, + "step": 132690 + }, + { + "epoch": 18.836053938963804, + "grad_norm": 0.39356622099876404, + "learning_rate": 8.117189496096523e-05, + "loss": 0.04213964343070984, + "step": 132700 + }, + { + "epoch": 18.837473385379703, + "grad_norm": 1.2587052583694458, + "learning_rate": 8.117047551454932e-05, + "loss": 0.015325626730918885, + "step": 132710 + }, + { + "epoch": 18.8388928317956, + "grad_norm": 1.8314698934555054, + "learning_rate": 8.116905606813343e-05, + "loss": 0.0008087139576673508, + "step": 132720 + }, + { + "epoch": 18.8403122782115, + "grad_norm": 5.425319194793701, + "learning_rate": 8.116763662171753e-05, + "loss": 0.036557963490486144, + "step": 132730 + }, + { + "epoch": 18.841731724627394, + "grad_norm": 0.8596503138542175, + "learning_rate": 8.116621717530164e-05, + "loss": 0.01629253327846527, + "step": 132740 + }, + { + "epoch": 18.843151171043292, + "grad_norm": 0.3366922438144684, + "learning_rate": 8.116479772888573e-05, + "loss": 0.006476753205060959, + "step": 132750 + }, + { + "epoch": 18.84457061745919, + "grad_norm": 0.43940088152885437, + "learning_rate": 8.116337828246984e-05, + "loss": 0.0200645849108696, + "step": 132760 + }, + { + "epoch": 18.84599006387509, + "grad_norm": 0.3031410276889801, + "learning_rate": 8.116195883605395e-05, + "loss": 0.005779065564274788, + "step": 132770 + }, + { + "epoch": 18.847409510290987, + "grad_norm": 0.7902480363845825, + "learning_rate": 8.116053938963805e-05, + "loss": 
0.04022659361362457, + "step": 132780 + }, + { + "epoch": 18.848828956706885, + "grad_norm": 3.379422903060913, + "learning_rate": 8.115911994322216e-05, + "loss": 0.02182978093624115, + "step": 132790 + }, + { + "epoch": 18.850248403122784, + "grad_norm": 0.19076137244701385, + "learning_rate": 8.115770049680625e-05, + "loss": 0.012058556824922562, + "step": 132800 + }, + { + "epoch": 18.85166784953868, + "grad_norm": 0.011338554322719574, + "learning_rate": 8.115628105039035e-05, + "loss": 0.029507333040237428, + "step": 132810 + }, + { + "epoch": 18.853087295954577, + "grad_norm": 3.2964704036712646, + "learning_rate": 8.115486160397445e-05, + "loss": 0.005234985426068306, + "step": 132820 + }, + { + "epoch": 18.854506742370475, + "grad_norm": 0.04063963517546654, + "learning_rate": 8.115344215755856e-05, + "loss": 0.06484124064445496, + "step": 132830 + }, + { + "epoch": 18.855926188786373, + "grad_norm": 0.14326469600200653, + "learning_rate": 8.115202271114266e-05, + "loss": 0.04315328299999237, + "step": 132840 + }, + { + "epoch": 18.85734563520227, + "grad_norm": 0.8972834348678589, + "learning_rate": 8.115060326472677e-05, + "loss": 0.0273000031709671, + "step": 132850 + }, + { + "epoch": 18.85876508161817, + "grad_norm": 2.150590181350708, + "learning_rate": 8.114918381831087e-05, + "loss": 0.00660116970539093, + "step": 132860 + }, + { + "epoch": 18.86018452803407, + "grad_norm": 0.6882023215293884, + "learning_rate": 8.114776437189496e-05, + "loss": 0.019053636491298674, + "step": 132870 + }, + { + "epoch": 18.861603974449963, + "grad_norm": 0.007473459001630545, + "learning_rate": 8.114634492547907e-05, + "loss": 0.00932258814573288, + "step": 132880 + }, + { + "epoch": 18.86302342086586, + "grad_norm": 0.026022188365459442, + "learning_rate": 8.114492547906317e-05, + "loss": 0.01030050665140152, + "step": 132890 + }, + { + "epoch": 18.86444286728176, + "grad_norm": 1.0625227689743042, + "learning_rate": 8.114350603264728e-05, + "loss": 
0.013700217008590698, + "step": 132900 + }, + { + "epoch": 18.865862313697658, + "grad_norm": 0.2801578938961029, + "learning_rate": 8.114208658623137e-05, + "loss": 0.010946492850780486, + "step": 132910 + }, + { + "epoch": 18.867281760113556, + "grad_norm": 16.305551528930664, + "learning_rate": 8.114066713981548e-05, + "loss": 0.03810953497886658, + "step": 132920 + }, + { + "epoch": 18.868701206529455, + "grad_norm": 3.9440793991088867, + "learning_rate": 8.113924769339957e-05, + "loss": 0.024695229530334473, + "step": 132930 + }, + { + "epoch": 18.870120652945353, + "grad_norm": 3.1893086433410645, + "learning_rate": 8.113782824698369e-05, + "loss": 0.02541588544845581, + "step": 132940 + }, + { + "epoch": 18.871540099361248, + "grad_norm": 1.5765419006347656, + "learning_rate": 8.113640880056778e-05, + "loss": 0.07892866134643554, + "step": 132950 + }, + { + "epoch": 18.872959545777146, + "grad_norm": 18.041797637939453, + "learning_rate": 8.113498935415188e-05, + "loss": 0.06794158816337585, + "step": 132960 + }, + { + "epoch": 18.874378992193044, + "grad_norm": 6.357567310333252, + "learning_rate": 8.113356990773599e-05, + "loss": 0.0168626606464386, + "step": 132970 + }, + { + "epoch": 18.875798438608943, + "grad_norm": 0.0431709848344326, + "learning_rate": 8.113229240596168e-05, + "loss": 0.06689361929893493, + "step": 132980 + }, + { + "epoch": 18.87721788502484, + "grad_norm": 0.09627433121204376, + "learning_rate": 8.113087295954577e-05, + "loss": 0.003917117789387703, + "step": 132990 + }, + { + "epoch": 18.87863733144074, + "grad_norm": 0.45058372616767883, + "learning_rate": 8.112945351312988e-05, + "loss": 0.04650241732597351, + "step": 133000 + }, + { + "epoch": 18.87863733144074, + "eval_accuracy": 0.9823233928912062, + "eval_loss": 0.06960802525281906, + "eval_runtime": 33.9214, + "eval_samples_per_second": 463.63, + "eval_steps_per_second": 14.504, + "step": 133000 + }, + { + "epoch": 18.880056777856637, + "grad_norm": 1.5822498798370361, + 
"learning_rate": 8.112803406671398e-05, + "loss": 0.042826077342033385, + "step": 133010 + }, + { + "epoch": 18.881476224272532, + "grad_norm": 2.025569438934326, + "learning_rate": 8.112661462029809e-05, + "loss": 0.024586796760559082, + "step": 133020 + }, + { + "epoch": 18.88289567068843, + "grad_norm": 0.025201864540576935, + "learning_rate": 8.112519517388219e-05, + "loss": 0.043155121803283694, + "step": 133030 + }, + { + "epoch": 18.88431511710433, + "grad_norm": 5.285928249359131, + "learning_rate": 8.112377572746629e-05, + "loss": 0.04339152872562409, + "step": 133040 + }, + { + "epoch": 18.885734563520227, + "grad_norm": 0.02572530508041382, + "learning_rate": 8.11223562810504e-05, + "loss": 0.044859197735786435, + "step": 133050 + }, + { + "epoch": 18.887154009936125, + "grad_norm": 1.1444945335388184, + "learning_rate": 8.11209368346345e-05, + "loss": 0.019503407180309296, + "step": 133060 + }, + { + "epoch": 18.888573456352024, + "grad_norm": 8.799510955810547, + "learning_rate": 8.11195173882186e-05, + "loss": 0.049301111698150636, + "step": 133070 + }, + { + "epoch": 18.889992902767922, + "grad_norm": 1.0782595872879028, + "learning_rate": 8.111809794180269e-05, + "loss": 0.06624003052711487, + "step": 133080 + }, + { + "epoch": 18.891412349183817, + "grad_norm": 0.3835201859474182, + "learning_rate": 8.11166784953868e-05, + "loss": 0.020512942969799042, + "step": 133090 + }, + { + "epoch": 18.892831795599715, + "grad_norm": 0.2754994034767151, + "learning_rate": 8.11152590489709e-05, + "loss": 0.002916569635272026, + "step": 133100 + }, + { + "epoch": 18.894251242015613, + "grad_norm": 12.092395782470703, + "learning_rate": 8.111383960255501e-05, + "loss": 0.03223346471786499, + "step": 133110 + }, + { + "epoch": 18.89567068843151, + "grad_norm": 0.173954039812088, + "learning_rate": 8.11124201561391e-05, + "loss": 0.040129071474075316, + "step": 133120 + }, + { + "epoch": 18.89709013484741, + "grad_norm": 5.376662731170654, + "learning_rate": 
8.111100070972322e-05, + "loss": 0.03927198350429535, + "step": 133130 + }, + { + "epoch": 18.89850958126331, + "grad_norm": 0.8540018200874329, + "learning_rate": 8.110958126330732e-05, + "loss": 0.04299411475658417, + "step": 133140 + }, + { + "epoch": 18.899929027679207, + "grad_norm": 5.0016584396362305, + "learning_rate": 8.110816181689141e-05, + "loss": 0.0349914163351059, + "step": 133150 + }, + { + "epoch": 18.9013484740951, + "grad_norm": 2.7536025047302246, + "learning_rate": 8.110674237047552e-05, + "loss": 0.004328594729304314, + "step": 133160 + }, + { + "epoch": 18.902767920511, + "grad_norm": 8.211243629455566, + "learning_rate": 8.110532292405962e-05, + "loss": 0.03376585841178894, + "step": 133170 + }, + { + "epoch": 18.904187366926898, + "grad_norm": 0.545396089553833, + "learning_rate": 8.110390347764373e-05, + "loss": 0.038627082109451295, + "step": 133180 + }, + { + "epoch": 18.905606813342796, + "grad_norm": 0.3872591257095337, + "learning_rate": 8.110248403122782e-05, + "loss": 0.06577336192131042, + "step": 133190 + }, + { + "epoch": 18.907026259758695, + "grad_norm": 2.4864211082458496, + "learning_rate": 8.110106458481193e-05, + "loss": 0.03767527341842651, + "step": 133200 + }, + { + "epoch": 18.908445706174593, + "grad_norm": 0.5065162777900696, + "learning_rate": 8.109964513839602e-05, + "loss": 0.012854862213134765, + "step": 133210 + }, + { + "epoch": 18.90986515259049, + "grad_norm": 1.854183554649353, + "learning_rate": 8.109822569198013e-05, + "loss": 0.0450294703245163, + "step": 133220 + }, + { + "epoch": 18.911284599006386, + "grad_norm": 3.7991325855255127, + "learning_rate": 8.109680624556423e-05, + "loss": 0.009919236600399017, + "step": 133230 + }, + { + "epoch": 18.912704045422284, + "grad_norm": 0.4304609000682831, + "learning_rate": 8.109538679914833e-05, + "loss": 0.03659535348415375, + "step": 133240 + }, + { + "epoch": 18.914123491838183, + "grad_norm": 11.989385604858398, + "learning_rate": 8.109396735273244e-05, + 
"loss": 0.01260078400373459, + "step": 133250 + }, + { + "epoch": 18.91554293825408, + "grad_norm": 5.348883628845215, + "learning_rate": 8.109254790631654e-05, + "loss": 0.04840273857116699, + "step": 133260 + }, + { + "epoch": 18.91696238466998, + "grad_norm": 1.8591395616531372, + "learning_rate": 8.109112845990065e-05, + "loss": 0.008823969960212707, + "step": 133270 + }, + { + "epoch": 18.918381831085878, + "grad_norm": 3.6820366382598877, + "learning_rate": 8.108970901348475e-05, + "loss": 0.020488184690475465, + "step": 133280 + }, + { + "epoch": 18.919801277501776, + "grad_norm": 0.0690523087978363, + "learning_rate": 8.108828956706884e-05, + "loss": 0.005634373798966408, + "step": 133290 + }, + { + "epoch": 18.92122072391767, + "grad_norm": 4.969079971313477, + "learning_rate": 8.108687012065294e-05, + "loss": 0.025790277123451232, + "step": 133300 + }, + { + "epoch": 18.92264017033357, + "grad_norm": 0.35796651244163513, + "learning_rate": 8.108545067423705e-05, + "loss": 0.020431703329086302, + "step": 133310 + }, + { + "epoch": 18.924059616749467, + "grad_norm": 2.3078744411468506, + "learning_rate": 8.108403122782115e-05, + "loss": 0.021640455722808837, + "step": 133320 + }, + { + "epoch": 18.925479063165366, + "grad_norm": 0.25514811277389526, + "learning_rate": 8.108261178140526e-05, + "loss": 0.017673870921134947, + "step": 133330 + }, + { + "epoch": 18.926898509581264, + "grad_norm": 0.7440028786659241, + "learning_rate": 8.108119233498936e-05, + "loss": 0.008043202757835387, + "step": 133340 + }, + { + "epoch": 18.928317955997162, + "grad_norm": 4.3911895751953125, + "learning_rate": 8.107977288857345e-05, + "loss": 0.011199419945478439, + "step": 133350 + }, + { + "epoch": 18.92973740241306, + "grad_norm": 1.4787195920944214, + "learning_rate": 8.107835344215757e-05, + "loss": 0.028583604097366332, + "step": 133360 + }, + { + "epoch": 18.931156848828955, + "grad_norm": 2.194730281829834, + "learning_rate": 8.107693399574166e-05, + "loss": 
0.03149364292621613, + "step": 133370 + }, + { + "epoch": 18.932576295244854, + "grad_norm": 1.954034686088562, + "learning_rate": 8.107551454932577e-05, + "loss": 0.012651622295379639, + "step": 133380 + }, + { + "epoch": 18.933995741660752, + "grad_norm": 13.001426696777344, + "learning_rate": 8.107409510290986e-05, + "loss": 0.028492489457130434, + "step": 133390 + }, + { + "epoch": 18.93541518807665, + "grad_norm": 0.1190931424498558, + "learning_rate": 8.107267565649397e-05, + "loss": 0.035154017806053164, + "step": 133400 + }, + { + "epoch": 18.93683463449255, + "grad_norm": 21.984230041503906, + "learning_rate": 8.107125621007807e-05, + "loss": 0.01489710807800293, + "step": 133410 + }, + { + "epoch": 18.938254080908447, + "grad_norm": 0.12106990069150925, + "learning_rate": 8.106983676366218e-05, + "loss": 0.03257212340831757, + "step": 133420 + }, + { + "epoch": 18.939673527324345, + "grad_norm": 0.6076300740242004, + "learning_rate": 8.106841731724629e-05, + "loss": 0.02213844209909439, + "step": 133430 + }, + { + "epoch": 18.94109297374024, + "grad_norm": 0.021633967757225037, + "learning_rate": 8.106699787083037e-05, + "loss": 0.004343704506754875, + "step": 133440 + }, + { + "epoch": 18.942512420156138, + "grad_norm": 4.2764763832092285, + "learning_rate": 8.106557842441448e-05, + "loss": 0.0022585604339838026, + "step": 133450 + }, + { + "epoch": 18.943931866572036, + "grad_norm": 0.11006913334131241, + "learning_rate": 8.106415897799858e-05, + "loss": 0.01216980665922165, + "step": 133460 + }, + { + "epoch": 18.945351312987935, + "grad_norm": 1.3304197788238525, + "learning_rate": 8.106273953158269e-05, + "loss": 0.010916084051132202, + "step": 133470 + }, + { + "epoch": 18.946770759403833, + "grad_norm": 0.7497724294662476, + "learning_rate": 8.106132008516679e-05, + "loss": 0.004376733303070068, + "step": 133480 + }, + { + "epoch": 18.94819020581973, + "grad_norm": 0.24626189470291138, + "learning_rate": 8.10599006387509e-05, + "loss": 
0.005943173170089721, + "step": 133490 + }, + { + "epoch": 18.94960965223563, + "grad_norm": 3.5472681522369385, + "learning_rate": 8.105848119233498e-05, + "loss": 0.02357488125562668, + "step": 133500 + }, + { + "epoch": 18.94960965223563, + "eval_accuracy": 0.9898264131747949, + "eval_loss": 0.037218160927295685, + "eval_runtime": 32.7903, + "eval_samples_per_second": 479.623, + "eval_steps_per_second": 15.004, + "step": 133500 + }, + { + "epoch": 18.951029098651524, + "grad_norm": 0.05089549347758293, + "learning_rate": 8.10570617459191e-05, + "loss": 0.022551646828651427, + "step": 133510 + }, + { + "epoch": 18.952448545067423, + "grad_norm": 0.4436090588569641, + "learning_rate": 8.10556422995032e-05, + "loss": 0.01959022432565689, + "step": 133520 + }, + { + "epoch": 18.95386799148332, + "grad_norm": 1.9277952909469604, + "learning_rate": 8.10542228530873e-05, + "loss": 0.006059730798006058, + "step": 133530 + }, + { + "epoch": 18.95528743789922, + "grad_norm": 0.009567571803927422, + "learning_rate": 8.105280340667141e-05, + "loss": 0.005125709250569344, + "step": 133540 + }, + { + "epoch": 18.956706884315118, + "grad_norm": 0.0779455155134201, + "learning_rate": 8.10513839602555e-05, + "loss": 0.004056332260370254, + "step": 133550 + }, + { + "epoch": 18.958126330731016, + "grad_norm": 0.33066481351852417, + "learning_rate": 8.104996451383961e-05, + "loss": 0.039429694414138794, + "step": 133560 + }, + { + "epoch": 18.959545777146914, + "grad_norm": 0.033027224242687225, + "learning_rate": 8.10485450674237e-05, + "loss": 0.0031930457800626753, + "step": 133570 + }, + { + "epoch": 18.96096522356281, + "grad_norm": 0.028807181864976883, + "learning_rate": 8.104712562100782e-05, + "loss": 0.014510068297386169, + "step": 133580 + }, + { + "epoch": 18.962384669978707, + "grad_norm": 2.8515522480010986, + "learning_rate": 8.104570617459191e-05, + "loss": 0.01636279672384262, + "step": 133590 + }, + { + "epoch": 18.963804116394606, + "grad_norm": 
2.692272901535034, + "learning_rate": 8.104428672817601e-05, + "loss": 0.006249012798070908, + "step": 133600 + }, + { + "epoch": 18.965223562810504, + "grad_norm": 0.1441679447889328, + "learning_rate": 8.104286728176012e-05, + "loss": 0.023763060569763184, + "step": 133610 + }, + { + "epoch": 18.966643009226402, + "grad_norm": 1.413166880607605, + "learning_rate": 8.104144783534422e-05, + "loss": 0.020673489570617674, + "step": 133620 + }, + { + "epoch": 18.9680624556423, + "grad_norm": 0.05413550138473511, + "learning_rate": 8.104002838892833e-05, + "loss": 0.0032124049961566926, + "step": 133630 + }, + { + "epoch": 18.9694819020582, + "grad_norm": 0.9596293568611145, + "learning_rate": 8.103860894251243e-05, + "loss": 0.016143888235092163, + "step": 133640 + }, + { + "epoch": 18.970901348474094, + "grad_norm": 0.23774424195289612, + "learning_rate": 8.103718949609653e-05, + "loss": 0.006650367379188537, + "step": 133650 + }, + { + "epoch": 18.972320794889992, + "grad_norm": 0.2125353366136551, + "learning_rate": 8.103577004968062e-05, + "loss": 0.003049859404563904, + "step": 133660 + }, + { + "epoch": 18.97374024130589, + "grad_norm": 15.742388725280762, + "learning_rate": 8.103435060326473e-05, + "loss": 0.03530228137969971, + "step": 133670 + }, + { + "epoch": 18.97515968772179, + "grad_norm": 2.1937851905822754, + "learning_rate": 8.103293115684883e-05, + "loss": 0.010408701002597808, + "step": 133680 + }, + { + "epoch": 18.976579134137687, + "grad_norm": 3.891181707382202, + "learning_rate": 8.103151171043294e-05, + "loss": 0.010996301472187043, + "step": 133690 + }, + { + "epoch": 18.977998580553585, + "grad_norm": 0.6607540249824524, + "learning_rate": 8.103009226401704e-05, + "loss": 0.005656911060214043, + "step": 133700 + }, + { + "epoch": 18.979418026969483, + "grad_norm": 0.36218443512916565, + "learning_rate": 8.102867281760114e-05, + "loss": 0.0029310058802366258, + "step": 133710 + }, + { + "epoch": 18.980837473385378, + "grad_norm": 
6.956638336181641, + "learning_rate": 8.102725337118525e-05, + "loss": 0.037996691465377805, + "step": 133720 + }, + { + "epoch": 18.982256919801276, + "grad_norm": 6.165159702301025, + "learning_rate": 8.102583392476934e-05, + "loss": 0.025007152557373048, + "step": 133730 + }, + { + "epoch": 18.983676366217175, + "grad_norm": 2.438201904296875, + "learning_rate": 8.102441447835346e-05, + "loss": 0.03142993748188019, + "step": 133740 + }, + { + "epoch": 18.985095812633073, + "grad_norm": 0.006362368352711201, + "learning_rate": 8.102299503193754e-05, + "loss": 0.009314981102943421, + "step": 133750 + }, + { + "epoch": 18.98651525904897, + "grad_norm": 0.33121517300605774, + "learning_rate": 8.102157558552165e-05, + "loss": 0.012771032750606537, + "step": 133760 + }, + { + "epoch": 18.98793470546487, + "grad_norm": 0.0070489030331373215, + "learning_rate": 8.102015613910575e-05, + "loss": 0.001782979816198349, + "step": 133770 + }, + { + "epoch": 18.989354151880768, + "grad_norm": 0.011138354428112507, + "learning_rate": 8.101873669268986e-05, + "loss": 0.015682095289230348, + "step": 133780 + }, + { + "epoch": 18.990773598296663, + "grad_norm": 1.3822154998779297, + "learning_rate": 8.101731724627396e-05, + "loss": 0.017892464995384216, + "step": 133790 + }, + { + "epoch": 18.99219304471256, + "grad_norm": 2.4268062114715576, + "learning_rate": 8.101589779985805e-05, + "loss": 0.006347347795963287, + "step": 133800 + }, + { + "epoch": 18.99361249112846, + "grad_norm": 8.877155303955078, + "learning_rate": 8.101447835344216e-05, + "loss": 0.06186530590057373, + "step": 133810 + }, + { + "epoch": 18.995031937544358, + "grad_norm": 0.3811427652835846, + "learning_rate": 8.101305890702626e-05, + "loss": 0.015398317575454712, + "step": 133820 + }, + { + "epoch": 18.996451383960256, + "grad_norm": 0.15157096087932587, + "learning_rate": 8.101163946061037e-05, + "loss": 0.01020585596561432, + "step": 133830 + }, + { + "epoch": 18.997870830376154, + "grad_norm": 
0.04400681331753731, + "learning_rate": 8.101022001419447e-05, + "loss": 0.0356217622756958, + "step": 133840 + }, + { + "epoch": 18.999290276792053, + "grad_norm": 1.2333203554153442, + "learning_rate": 8.100880056777858e-05, + "loss": 0.007522010058164596, + "step": 133850 + }, + { + "epoch": 19.000709723207947, + "grad_norm": 0.028951354324817657, + "learning_rate": 8.100738112136266e-05, + "loss": 0.016396063566207885, + "step": 133860 + }, + { + "epoch": 19.002129169623846, + "grad_norm": 0.028261512517929077, + "learning_rate": 8.100596167494678e-05, + "loss": 0.015919487178325652, + "step": 133870 + }, + { + "epoch": 19.003548616039744, + "grad_norm": 0.04226310923695564, + "learning_rate": 8.100454222853087e-05, + "loss": 0.02628237009048462, + "step": 133880 + }, + { + "epoch": 19.004968062455642, + "grad_norm": 0.2772606909275055, + "learning_rate": 8.100312278211498e-05, + "loss": 0.044051063060760495, + "step": 133890 + }, + { + "epoch": 19.00638750887154, + "grad_norm": 2.0943820476531982, + "learning_rate": 8.100170333569908e-05, + "loss": 0.003144432231783867, + "step": 133900 + }, + { + "epoch": 19.00780695528744, + "grad_norm": 0.044655878096818924, + "learning_rate": 8.100028388928318e-05, + "loss": 0.011542373895645141, + "step": 133910 + }, + { + "epoch": 19.009226401703337, + "grad_norm": 0.25398340821266174, + "learning_rate": 8.099886444286729e-05, + "loss": 0.018914687633514404, + "step": 133920 + }, + { + "epoch": 19.010645848119232, + "grad_norm": 0.08015237003564835, + "learning_rate": 8.099744499645139e-05, + "loss": 0.012097981572151185, + "step": 133930 + }, + { + "epoch": 19.01206529453513, + "grad_norm": 1.3208882808685303, + "learning_rate": 8.09960255500355e-05, + "loss": 0.030928075313568115, + "step": 133940 + }, + { + "epoch": 19.01348474095103, + "grad_norm": 0.009170650504529476, + "learning_rate": 8.09946061036196e-05, + "loss": 0.0023461733013391494, + "step": 133950 + }, + { + "epoch": 19.014904187366927, + "grad_norm": 
8.084667205810547, + "learning_rate": 8.099318665720369e-05, + "loss": 0.017482061684131623, + "step": 133960 + }, + { + "epoch": 19.016323633782825, + "grad_norm": 16.573938369750977, + "learning_rate": 8.099176721078779e-05, + "loss": 0.02548588514328003, + "step": 133970 + }, + { + "epoch": 19.017743080198724, + "grad_norm": 2.9577677249908447, + "learning_rate": 8.09903477643719e-05, + "loss": 0.02603309750556946, + "step": 133980 + }, + { + "epoch": 19.019162526614622, + "grad_norm": 6.544336795806885, + "learning_rate": 8.0988928317956e-05, + "loss": 0.022039610147476196, + "step": 133990 + }, + { + "epoch": 19.020581973030517, + "grad_norm": 0.12144874036312103, + "learning_rate": 8.098750887154011e-05, + "loss": 0.011551780998706818, + "step": 134000 + }, + { + "epoch": 19.020581973030517, + "eval_accuracy": 0.9851211292681376, + "eval_loss": 0.059864241629838943, + "eval_runtime": 32.8201, + "eval_samples_per_second": 479.189, + "eval_steps_per_second": 14.991, + "step": 134000 + }, + { + "epoch": 19.022001419446415, + "grad_norm": 0.29416728019714355, + "learning_rate": 8.098608942512421e-05, + "loss": 0.061245912313461305, + "step": 134010 + }, + { + "epoch": 19.023420865862313, + "grad_norm": 0.021602200344204903, + "learning_rate": 8.09846699787083e-05, + "loss": 0.0019267242401838302, + "step": 134020 + }, + { + "epoch": 19.02484031227821, + "grad_norm": 0.3599778711795807, + "learning_rate": 8.098325053229242e-05, + "loss": 0.007973751425743103, + "step": 134030 + }, + { + "epoch": 19.02625975869411, + "grad_norm": 7.223097324371338, + "learning_rate": 8.098183108587651e-05, + "loss": 0.01779400110244751, + "step": 134040 + }, + { + "epoch": 19.027679205110008, + "grad_norm": 5.916549205780029, + "learning_rate": 8.098041163946062e-05, + "loss": 0.013569539785385132, + "step": 134050 + }, + { + "epoch": 19.029098651525906, + "grad_norm": 0.7093172669410706, + "learning_rate": 8.097899219304471e-05, + "loss": 0.01011343002319336, + "step": 134060 + }, 
+ { + "epoch": 19.0305180979418, + "grad_norm": 0.5323686599731445, + "learning_rate": 8.097757274662882e-05, + "loss": 0.0035029586404562, + "step": 134070 + }, + { + "epoch": 19.0319375443577, + "grad_norm": 0.4400314390659332, + "learning_rate": 8.097615330021292e-05, + "loss": 0.0046637110412120816, + "step": 134080 + }, + { + "epoch": 19.033356990773598, + "grad_norm": 0.008682881481945515, + "learning_rate": 8.097473385379703e-05, + "loss": 0.0200369730591774, + "step": 134090 + }, + { + "epoch": 19.034776437189496, + "grad_norm": 0.03299645334482193, + "learning_rate": 8.097331440738112e-05, + "loss": 0.019125807285308837, + "step": 134100 + }, + { + "epoch": 19.036195883605394, + "grad_norm": 0.09096463769674301, + "learning_rate": 8.097189496096522e-05, + "loss": 0.0013361256569623947, + "step": 134110 + }, + { + "epoch": 19.037615330021293, + "grad_norm": 6.4878315925598145, + "learning_rate": 8.097047551454933e-05, + "loss": 0.028588488698005676, + "step": 134120 + }, + { + "epoch": 19.03903477643719, + "grad_norm": 0.0908045843243599, + "learning_rate": 8.096905606813343e-05, + "loss": 0.029576560854911803, + "step": 134130 + }, + { + "epoch": 19.040454222853086, + "grad_norm": 1.544521689414978, + "learning_rate": 8.096763662171754e-05, + "loss": 0.011837373673915862, + "step": 134140 + }, + { + "epoch": 19.041873669268984, + "grad_norm": 0.19672341644763947, + "learning_rate": 8.096621717530164e-05, + "loss": 0.006500184535980225, + "step": 134150 + }, + { + "epoch": 19.043293115684882, + "grad_norm": 0.08466586470603943, + "learning_rate": 8.096479772888575e-05, + "loss": 0.02133823335170746, + "step": 134160 + }, + { + "epoch": 19.04471256210078, + "grad_norm": 8.22453784942627, + "learning_rate": 8.096337828246983e-05, + "loss": 0.013680379092693328, + "step": 134170 + }, + { + "epoch": 19.04613200851668, + "grad_norm": 0.022004850208759308, + "learning_rate": 8.096195883605394e-05, + "loss": 0.041504427790641785, + "step": 134180 + }, + { + 
"epoch": 19.047551454932577, + "grad_norm": 1.6565831899642944, + "learning_rate": 8.096053938963804e-05, + "loss": 0.013404084742069245, + "step": 134190 + }, + { + "epoch": 19.048970901348476, + "grad_norm": 9.642035484313965, + "learning_rate": 8.095911994322215e-05, + "loss": 0.042037194967269896, + "step": 134200 + }, + { + "epoch": 19.05039034776437, + "grad_norm": 0.35203757882118225, + "learning_rate": 8.095770049680625e-05, + "loss": 0.031157463788986206, + "step": 134210 + }, + { + "epoch": 19.05180979418027, + "grad_norm": 9.968067169189453, + "learning_rate": 8.095628105039035e-05, + "loss": 0.0336113691329956, + "step": 134220 + }, + { + "epoch": 19.053229240596167, + "grad_norm": 0.3345049023628235, + "learning_rate": 8.095486160397446e-05, + "loss": 0.03366290926933289, + "step": 134230 + }, + { + "epoch": 19.054648687012065, + "grad_norm": 0.02600511722266674, + "learning_rate": 8.095344215755855e-05, + "loss": 0.021112054586410522, + "step": 134240 + }, + { + "epoch": 19.056068133427964, + "grad_norm": 0.06966177374124527, + "learning_rate": 8.095202271114267e-05, + "loss": 0.009837976098060608, + "step": 134250 + }, + { + "epoch": 19.057487579843862, + "grad_norm": 8.749176979064941, + "learning_rate": 8.095060326472676e-05, + "loss": 0.045498573780059816, + "step": 134260 + }, + { + "epoch": 19.05890702625976, + "grad_norm": 14.655566215515137, + "learning_rate": 8.094918381831086e-05, + "loss": 0.025707656145095827, + "step": 134270 + }, + { + "epoch": 19.060326472675655, + "grad_norm": 0.17003805935382843, + "learning_rate": 8.094776437189496e-05, + "loss": 0.012677818536758423, + "step": 134280 + }, + { + "epoch": 19.061745919091553, + "grad_norm": 5.308541774749756, + "learning_rate": 8.094634492547907e-05, + "loss": 0.02551833987236023, + "step": 134290 + }, + { + "epoch": 19.06316536550745, + "grad_norm": 0.01365375891327858, + "learning_rate": 8.094492547906317e-05, + "loss": 0.020414717495441437, + "step": 134300 + }, + { + "epoch": 
19.06458481192335, + "grad_norm": 0.15614201128482819, + "learning_rate": 8.094350603264728e-05, + "loss": 0.023772512376308442, + "step": 134310 + }, + { + "epoch": 19.066004258339248, + "grad_norm": 0.07233654707670212, + "learning_rate": 8.094208658623137e-05, + "loss": 0.04059212505817413, + "step": 134320 + }, + { + "epoch": 19.067423704755146, + "grad_norm": 0.026217155158519745, + "learning_rate": 8.094066713981547e-05, + "loss": 0.0021272551268339155, + "step": 134330 + }, + { + "epoch": 19.068843151171045, + "grad_norm": 0.0481400303542614, + "learning_rate": 8.093924769339958e-05, + "loss": 0.05546886920928955, + "step": 134340 + }, + { + "epoch": 19.07026259758694, + "grad_norm": 19.32974624633789, + "learning_rate": 8.093782824698368e-05, + "loss": 0.03558627963066101, + "step": 134350 + }, + { + "epoch": 19.071682044002838, + "grad_norm": 0.06256501376628876, + "learning_rate": 8.093640880056779e-05, + "loss": 0.009789368510246277, + "step": 134360 + }, + { + "epoch": 19.073101490418736, + "grad_norm": 1.345982551574707, + "learning_rate": 8.093498935415187e-05, + "loss": 0.034044647216796876, + "step": 134370 + }, + { + "epoch": 19.074520936834634, + "grad_norm": 0.13332176208496094, + "learning_rate": 8.093356990773599e-05, + "loss": 0.002076190337538719, + "step": 134380 + }, + { + "epoch": 19.075940383250533, + "grad_norm": 9.257709503173828, + "learning_rate": 8.093215046132008e-05, + "loss": 0.037376236915588376, + "step": 134390 + }, + { + "epoch": 19.07735982966643, + "grad_norm": 0.0649016723036766, + "learning_rate": 8.09307310149042e-05, + "loss": 0.016293227672576904, + "step": 134400 + }, + { + "epoch": 19.07877927608233, + "grad_norm": 0.0038299565203487873, + "learning_rate": 8.092931156848829e-05, + "loss": 0.025033700466156005, + "step": 134410 + }, + { + "epoch": 19.080198722498224, + "grad_norm": 0.9183870553970337, + "learning_rate": 8.092789212207239e-05, + "loss": 0.010904945433139801, + "step": 134420 + }, + { + "epoch": 
19.081618168914122, + "grad_norm": 0.7665177583694458, + "learning_rate": 8.09264726756565e-05, + "loss": 0.007632662355899811, + "step": 134430 + }, + { + "epoch": 19.08303761533002, + "grad_norm": 0.15135960280895233, + "learning_rate": 8.09250532292406e-05, + "loss": 0.008608837425708771, + "step": 134440 + }, + { + "epoch": 19.08445706174592, + "grad_norm": 0.017531752586364746, + "learning_rate": 8.092363378282471e-05, + "loss": 0.07331695556640624, + "step": 134450 + }, + { + "epoch": 19.085876508161817, + "grad_norm": 0.24432286620140076, + "learning_rate": 8.09222143364088e-05, + "loss": 0.011470304429531097, + "step": 134460 + }, + { + "epoch": 19.087295954577716, + "grad_norm": 0.009214629419147968, + "learning_rate": 8.09207948899929e-05, + "loss": 0.0062458343803882595, + "step": 134470 + }, + { + "epoch": 19.088715400993614, + "grad_norm": 0.03448065370321274, + "learning_rate": 8.0919375443577e-05, + "loss": 0.0029633276164531706, + "step": 134480 + }, + { + "epoch": 19.09013484740951, + "grad_norm": 0.473985880613327, + "learning_rate": 8.091795599716111e-05, + "loss": 0.011840125918388367, + "step": 134490 + }, + { + "epoch": 19.091554293825407, + "grad_norm": 0.008454610593616962, + "learning_rate": 8.091653655074521e-05, + "loss": 0.01086244136095047, + "step": 134500 + }, + { + "epoch": 19.091554293825407, + "eval_accuracy": 0.9876009410567813, + "eval_loss": 0.04235000163316727, + "eval_runtime": 32.6911, + "eval_samples_per_second": 481.079, + "eval_steps_per_second": 15.05, + "step": 134500 + }, + { + "epoch": 19.092973740241305, + "grad_norm": 11.254133224487305, + "learning_rate": 8.091511710432932e-05, + "loss": 0.009967145323753358, + "step": 134510 + }, + { + "epoch": 19.094393186657204, + "grad_norm": 0.08455899357795715, + "learning_rate": 8.091369765791342e-05, + "loss": 0.01689300835132599, + "step": 134520 + }, + { + "epoch": 19.095812633073102, + "grad_norm": 0.8373426795005798, + "learning_rate": 8.091227821149751e-05, + "loss": 
0.006131380051374436, + "step": 134530 + }, + { + "epoch": 19.097232079489, + "grad_norm": 0.337380588054657, + "learning_rate": 8.091085876508163e-05, + "loss": 0.006717808544635773, + "step": 134540 + }, + { + "epoch": 19.0986515259049, + "grad_norm": 1.9571492671966553, + "learning_rate": 8.090943931866572e-05, + "loss": 0.00572134368121624, + "step": 134550 + }, + { + "epoch": 19.100070972320793, + "grad_norm": 0.024347834289073944, + "learning_rate": 8.090801987224983e-05, + "loss": 0.0018487922847270966, + "step": 134560 + }, + { + "epoch": 19.10149041873669, + "grad_norm": 0.6312090754508972, + "learning_rate": 8.090660042583393e-05, + "loss": 0.006242816522717476, + "step": 134570 + }, + { + "epoch": 19.10290986515259, + "grad_norm": 1.1977765560150146, + "learning_rate": 8.090518097941803e-05, + "loss": 0.018849599361419677, + "step": 134580 + }, + { + "epoch": 19.10432931156849, + "grad_norm": 0.005179632920771837, + "learning_rate": 8.090376153300213e-05, + "loss": 0.0015044763684272766, + "step": 134590 + }, + { + "epoch": 19.105748757984387, + "grad_norm": 1.7063950300216675, + "learning_rate": 8.090234208658624e-05, + "loss": 0.01455221027135849, + "step": 134600 + }, + { + "epoch": 19.107168204400285, + "grad_norm": 14.58033275604248, + "learning_rate": 8.090092264017033e-05, + "loss": 0.03353846073150635, + "step": 134610 + }, + { + "epoch": 19.108587650816183, + "grad_norm": 5.8170647621154785, + "learning_rate": 8.089950319375444e-05, + "loss": 0.05731701850891113, + "step": 134620 + }, + { + "epoch": 19.110007097232078, + "grad_norm": 3.257978677749634, + "learning_rate": 8.089808374733854e-05, + "loss": 0.018321070075035095, + "step": 134630 + }, + { + "epoch": 19.111426543647976, + "grad_norm": 4.303315162658691, + "learning_rate": 8.089666430092264e-05, + "loss": 0.009317952394485473, + "step": 134640 + }, + { + "epoch": 19.112845990063875, + "grad_norm": 11.137117385864258, + "learning_rate": 8.089524485450675e-05, + "loss": 
0.043749278783798216, + "step": 134650 + }, + { + "epoch": 19.114265436479773, + "grad_norm": 0.8089771270751953, + "learning_rate": 8.089382540809085e-05, + "loss": 0.03437838852405548, + "step": 134660 + }, + { + "epoch": 19.11568488289567, + "grad_norm": 0.16115444898605347, + "learning_rate": 8.089240596167496e-05, + "loss": 0.002757306769490242, + "step": 134670 + }, + { + "epoch": 19.11710432931157, + "grad_norm": 0.10439011454582214, + "learning_rate": 8.089098651525904e-05, + "loss": 0.024527326226234436, + "step": 134680 + }, + { + "epoch": 19.118523775727468, + "grad_norm": 0.03504888340830803, + "learning_rate": 8.088956706884315e-05, + "loss": 0.013709086179733276, + "step": 134690 + }, + { + "epoch": 19.119943222143363, + "grad_norm": 0.25188112258911133, + "learning_rate": 8.088814762242725e-05, + "loss": 0.023006241023540496, + "step": 134700 + }, + { + "epoch": 19.12136266855926, + "grad_norm": 0.06671018153429031, + "learning_rate": 8.088672817601136e-05, + "loss": 0.024523438513278963, + "step": 134710 + }, + { + "epoch": 19.12278211497516, + "grad_norm": 2.077873468399048, + "learning_rate": 8.088530872959546e-05, + "loss": 0.007115639746189117, + "step": 134720 + }, + { + "epoch": 19.124201561391057, + "grad_norm": 0.1570131927728653, + "learning_rate": 8.088388928317956e-05, + "loss": 0.00751805305480957, + "step": 134730 + }, + { + "epoch": 19.125621007806956, + "grad_norm": 0.027703123167157173, + "learning_rate": 8.088246983676367e-05, + "loss": 0.0043018519878387455, + "step": 134740 + }, + { + "epoch": 19.127040454222854, + "grad_norm": 14.652771949768066, + "learning_rate": 8.088105039034776e-05, + "loss": 0.020449712872505188, + "step": 134750 + }, + { + "epoch": 19.128459900638752, + "grad_norm": 7.697845458984375, + "learning_rate": 8.087963094393188e-05, + "loss": 0.01461091786623001, + "step": 134760 + }, + { + "epoch": 19.129879347054647, + "grad_norm": 0.2831219732761383, + "learning_rate": 8.087821149751597e-05, + "loss": 
0.0011369768530130387, + "step": 134770 + }, + { + "epoch": 19.131298793470545, + "grad_norm": 0.17854426801204681, + "learning_rate": 8.087679205110007e-05, + "loss": 0.026823663711547853, + "step": 134780 + }, + { + "epoch": 19.132718239886444, + "grad_norm": 11.814420700073242, + "learning_rate": 8.087537260468417e-05, + "loss": 0.019499766826629638, + "step": 134790 + }, + { + "epoch": 19.134137686302342, + "grad_norm": 0.003499184036627412, + "learning_rate": 8.087395315826828e-05, + "loss": 0.010670372098684312, + "step": 134800 + }, + { + "epoch": 19.13555713271824, + "grad_norm": 2.334817886352539, + "learning_rate": 8.087253371185238e-05, + "loss": 0.020394699275493623, + "step": 134810 + }, + { + "epoch": 19.13697657913414, + "grad_norm": 17.925743103027344, + "learning_rate": 8.087111426543649e-05, + "loss": 0.027310144901275635, + "step": 134820 + }, + { + "epoch": 19.138396025550037, + "grad_norm": 0.029750006273388863, + "learning_rate": 8.086969481902058e-05, + "loss": 0.03104974627494812, + "step": 134830 + }, + { + "epoch": 19.13981547196593, + "grad_norm": 0.0954161286354065, + "learning_rate": 8.086827537260468e-05, + "loss": 0.041319483518600465, + "step": 134840 + }, + { + "epoch": 19.14123491838183, + "grad_norm": 0.026760157197713852, + "learning_rate": 8.086685592618879e-05, + "loss": 0.003623197972774506, + "step": 134850 + }, + { + "epoch": 19.14265436479773, + "grad_norm": 11.640395164489746, + "learning_rate": 8.086543647977289e-05, + "loss": 0.016904018819332123, + "step": 134860 + }, + { + "epoch": 19.144073811213627, + "grad_norm": 0.035630472004413605, + "learning_rate": 8.0864017033357e-05, + "loss": 0.007674677670001984, + "step": 134870 + }, + { + "epoch": 19.145493257629525, + "grad_norm": 10.433150291442871, + "learning_rate": 8.08625975869411e-05, + "loss": 0.02237817645072937, + "step": 134880 + }, + { + "epoch": 19.146912704045423, + "grad_norm": 0.3572457432746887, + "learning_rate": 8.08611781405252e-05, + "loss": 
0.06344624161720276, + "step": 134890 + }, + { + "epoch": 19.14833215046132, + "grad_norm": 1.9099338054656982, + "learning_rate": 8.08597586941093e-05, + "loss": 0.02476053237915039, + "step": 134900 + }, + { + "epoch": 19.149751596877216, + "grad_norm": 4.4671311378479, + "learning_rate": 8.08583392476934e-05, + "loss": 0.024169033765792845, + "step": 134910 + }, + { + "epoch": 19.151171043293115, + "grad_norm": 1.4765146970748901, + "learning_rate": 8.085691980127752e-05, + "loss": 0.02690877616405487, + "step": 134920 + }, + { + "epoch": 19.152590489709013, + "grad_norm": 0.014275099150836468, + "learning_rate": 8.085550035486161e-05, + "loss": 0.008906839787960053, + "step": 134930 + }, + { + "epoch": 19.15400993612491, + "grad_norm": 0.4602917730808258, + "learning_rate": 8.085408090844571e-05, + "loss": 0.03422538936138153, + "step": 134940 + }, + { + "epoch": 19.15542938254081, + "grad_norm": 0.015164846554398537, + "learning_rate": 8.085266146202981e-05, + "loss": 0.04223176836967468, + "step": 134950 + }, + { + "epoch": 19.156848828956708, + "grad_norm": 6.864485740661621, + "learning_rate": 8.085124201561392e-05, + "loss": 0.052771955728530884, + "step": 134960 + }, + { + "epoch": 19.158268275372606, + "grad_norm": 0.4479176998138428, + "learning_rate": 8.084982256919802e-05, + "loss": 0.005521007999777794, + "step": 134970 + }, + { + "epoch": 19.1596877217885, + "grad_norm": 0.5584779977798462, + "learning_rate": 8.084840312278213e-05, + "loss": 0.024714210629463197, + "step": 134980 + }, + { + "epoch": 19.1611071682044, + "grad_norm": 3.680006742477417, + "learning_rate": 8.084698367636621e-05, + "loss": 0.015227210521697999, + "step": 134990 + }, + { + "epoch": 19.162526614620297, + "grad_norm": 0.018869522958993912, + "learning_rate": 8.084556422995032e-05, + "loss": 0.001959379017353058, + "step": 135000 + }, + { + "epoch": 19.162526614620297, + "eval_accuracy": 0.9850575443504801, + "eval_loss": 0.05439043045043945, + "eval_runtime": 32.4017, + 
"eval_samples_per_second": 485.375, + "eval_steps_per_second": 15.184, + "step": 135000 + }, + { + "epoch": 19.163946061036196, + "grad_norm": 0.021521301940083504, + "learning_rate": 8.084414478353443e-05, + "loss": 0.024100886285305025, + "step": 135010 + }, + { + "epoch": 19.165365507452094, + "grad_norm": 0.20940202474594116, + "learning_rate": 8.084272533711853e-05, + "loss": 0.05533415079116821, + "step": 135020 + }, + { + "epoch": 19.166784953867992, + "grad_norm": 0.4249863624572754, + "learning_rate": 8.084130589070264e-05, + "loss": 0.008059429377317429, + "step": 135030 + }, + { + "epoch": 19.16820440028389, + "grad_norm": 0.6474401354789734, + "learning_rate": 8.083988644428672e-05, + "loss": 0.014611485600471496, + "step": 135040 + }, + { + "epoch": 19.169623846699785, + "grad_norm": 0.08407270163297653, + "learning_rate": 8.083846699787084e-05, + "loss": 0.028042757511138917, + "step": 135050 + }, + { + "epoch": 19.171043293115684, + "grad_norm": 0.09232445806264877, + "learning_rate": 8.083704755145493e-05, + "loss": 0.025106951594352722, + "step": 135060 + }, + { + "epoch": 19.172462739531582, + "grad_norm": 6.32170295715332, + "learning_rate": 8.083562810503904e-05, + "loss": 0.013578245043754577, + "step": 135070 + }, + { + "epoch": 19.17388218594748, + "grad_norm": 0.01305151917040348, + "learning_rate": 8.083420865862314e-05, + "loss": 0.01248614490032196, + "step": 135080 + }, + { + "epoch": 19.17530163236338, + "grad_norm": 0.12669546902179718, + "learning_rate": 8.083278921220724e-05, + "loss": 0.011508259177207946, + "step": 135090 + }, + { + "epoch": 19.176721078779277, + "grad_norm": 2.873619794845581, + "learning_rate": 8.083136976579135e-05, + "loss": 0.03385049104690552, + "step": 135100 + }, + { + "epoch": 19.178140525195175, + "grad_norm": 8.848909378051758, + "learning_rate": 8.082995031937545e-05, + "loss": 0.03305015861988068, + "step": 135110 + }, + { + "epoch": 19.17955997161107, + "grad_norm": 5.337402820587158, + 
"learning_rate": 8.082853087295956e-05, + "loss": 0.011400558054447174, + "step": 135120 + }, + { + "epoch": 19.18097941802697, + "grad_norm": 1.4635014533996582, + "learning_rate": 8.082711142654365e-05, + "loss": 0.006388545036315918, + "step": 135130 + }, + { + "epoch": 19.182398864442867, + "grad_norm": 6.023395538330078, + "learning_rate": 8.082569198012775e-05, + "loss": 0.00347101092338562, + "step": 135140 + }, + { + "epoch": 19.183818310858765, + "grad_norm": 0.009968786500394344, + "learning_rate": 8.082427253371185e-05, + "loss": 0.06805426478385926, + "step": 135150 + }, + { + "epoch": 19.185237757274663, + "grad_norm": 0.17491233348846436, + "learning_rate": 8.082285308729596e-05, + "loss": 0.005591537803411484, + "step": 135160 + }, + { + "epoch": 19.18665720369056, + "grad_norm": 2.5479822158813477, + "learning_rate": 8.082143364088006e-05, + "loss": 0.013746441900730133, + "step": 135170 + }, + { + "epoch": 19.18807665010646, + "grad_norm": 1.8305355310440063, + "learning_rate": 8.082001419446417e-05, + "loss": 0.0023511968553066253, + "step": 135180 + }, + { + "epoch": 19.189496096522355, + "grad_norm": 1.3410097360610962, + "learning_rate": 8.081859474804827e-05, + "loss": 0.0441491037607193, + "step": 135190 + }, + { + "epoch": 19.190915542938253, + "grad_norm": 0.06187102571129799, + "learning_rate": 8.081717530163236e-05, + "loss": 0.019126801192760466, + "step": 135200 + }, + { + "epoch": 19.19233498935415, + "grad_norm": 0.324034720659256, + "learning_rate": 8.081575585521647e-05, + "loss": 0.02774391770362854, + "step": 135210 + }, + { + "epoch": 19.19375443577005, + "grad_norm": 0.09795793890953064, + "learning_rate": 8.081433640880057e-05, + "loss": 0.001038951799273491, + "step": 135220 + }, + { + "epoch": 19.195173882185948, + "grad_norm": 0.6911133527755737, + "learning_rate": 8.081291696238468e-05, + "loss": 0.020565421879291536, + "step": 135230 + }, + { + "epoch": 19.196593328601846, + "grad_norm": 0.024742472916841507, + 
"learning_rate": 8.081149751596878e-05, + "loss": 0.000863572210073471, + "step": 135240 + }, + { + "epoch": 19.198012775017745, + "grad_norm": 0.07502426952123642, + "learning_rate": 8.081007806955288e-05, + "loss": 0.01390819251537323, + "step": 135250 + }, + { + "epoch": 19.19943222143364, + "grad_norm": 0.006235470529645681, + "learning_rate": 8.080865862313698e-05, + "loss": 0.019389943778514863, + "step": 135260 + }, + { + "epoch": 19.200851667849538, + "grad_norm": 10.355511665344238, + "learning_rate": 8.080723917672109e-05, + "loss": 0.01820642948150635, + "step": 135270 + }, + { + "epoch": 19.202271114265436, + "grad_norm": 0.2703514099121094, + "learning_rate": 8.080581973030518e-05, + "loss": 0.01304187923669815, + "step": 135280 + }, + { + "epoch": 19.203690560681334, + "grad_norm": 0.03895045816898346, + "learning_rate": 8.08044002838893e-05, + "loss": 0.023221737146377562, + "step": 135290 + }, + { + "epoch": 19.205110007097232, + "grad_norm": 0.14150403439998627, + "learning_rate": 8.080298083747339e-05, + "loss": 0.00740317702293396, + "step": 135300 + }, + { + "epoch": 19.20652945351313, + "grad_norm": 0.2885378301143646, + "learning_rate": 8.080156139105749e-05, + "loss": 0.050024384260177614, + "step": 135310 + }, + { + "epoch": 19.20794889992903, + "grad_norm": 1.2044950723648071, + "learning_rate": 8.08001419446416e-05, + "loss": 0.006965817511081695, + "step": 135320 + }, + { + "epoch": 19.209368346344924, + "grad_norm": 17.22332000732422, + "learning_rate": 8.07987224982257e-05, + "loss": 0.032378381490707396, + "step": 135330 + }, + { + "epoch": 19.210787792760822, + "grad_norm": 0.031519077718257904, + "learning_rate": 8.079730305180981e-05, + "loss": 0.008376619219779969, + "step": 135340 + }, + { + "epoch": 19.21220723917672, + "grad_norm": 0.13358646631240845, + "learning_rate": 8.079588360539389e-05, + "loss": 0.0036485549062490463, + "step": 135350 + }, + { + "epoch": 19.21362668559262, + "grad_norm": 0.03483152762055397, + 
"learning_rate": 8.0794464158978e-05, + "loss": 0.0036309100687503815, + "step": 135360 + }, + { + "epoch": 19.215046132008517, + "grad_norm": 7.076496601104736, + "learning_rate": 8.07930447125621e-05, + "loss": 0.03006422519683838, + "step": 135370 + }, + { + "epoch": 19.216465578424415, + "grad_norm": 2.300114154815674, + "learning_rate": 8.079162526614621e-05, + "loss": 0.0059611741453409195, + "step": 135380 + }, + { + "epoch": 19.217885024840314, + "grad_norm": 1.368050217628479, + "learning_rate": 8.079020581973031e-05, + "loss": 0.02752087116241455, + "step": 135390 + }, + { + "epoch": 19.21930447125621, + "grad_norm": 0.16258902847766876, + "learning_rate": 8.07887863733144e-05, + "loss": 0.003470803052186966, + "step": 135400 + }, + { + "epoch": 19.220723917672107, + "grad_norm": 0.3476499021053314, + "learning_rate": 8.078736692689852e-05, + "loss": 0.020617493987083436, + "step": 135410 + }, + { + "epoch": 19.222143364088005, + "grad_norm": 0.781749427318573, + "learning_rate": 8.078594748048261e-05, + "loss": 0.0015029162168502808, + "step": 135420 + }, + { + "epoch": 19.223562810503903, + "grad_norm": 1.2078951597213745, + "learning_rate": 8.078452803406673e-05, + "loss": 0.01431012749671936, + "step": 135430 + }, + { + "epoch": 19.2249822569198, + "grad_norm": 0.09040942788124084, + "learning_rate": 8.078310858765082e-05, + "loss": 0.0208782359957695, + "step": 135440 + }, + { + "epoch": 19.2264017033357, + "grad_norm": 0.3921969532966614, + "learning_rate": 8.078168914123492e-05, + "loss": 0.020682938396930695, + "step": 135450 + }, + { + "epoch": 19.2278211497516, + "grad_norm": 0.14331085979938507, + "learning_rate": 8.078026969481902e-05, + "loss": 0.012914702296257019, + "step": 135460 + }, + { + "epoch": 19.229240596167493, + "grad_norm": 1.7515239715576172, + "learning_rate": 8.077885024840313e-05, + "loss": 0.007830117642879487, + "step": 135470 + }, + { + "epoch": 19.23066004258339, + "grad_norm": 2.976736068725586, + "learning_rate": 
8.077743080198723e-05, + "loss": 0.0039331987500190735, + "step": 135480 + }, + { + "epoch": 19.23207948899929, + "grad_norm": 0.058782994747161865, + "learning_rate": 8.077601135557134e-05, + "loss": 0.004452567175030708, + "step": 135490 + }, + { + "epoch": 19.233498935415188, + "grad_norm": 0.6067075729370117, + "learning_rate": 8.077459190915543e-05, + "loss": 0.01762688010931015, + "step": 135500 + }, + { + "epoch": 19.233498935415188, + "eval_accuracy": 0.9860749030330006, + "eval_loss": 0.053748831152915955, + "eval_runtime": 33.162, + "eval_samples_per_second": 474.248, + "eval_steps_per_second": 14.836, + "step": 135500 + }, + { + "epoch": 19.234918381831086, + "grad_norm": 0.1507439762353897, + "learning_rate": 8.077317246273953e-05, + "loss": 0.030294719338417053, + "step": 135510 + }, + { + "epoch": 19.236337828246985, + "grad_norm": 9.084303855895996, + "learning_rate": 8.077175301632364e-05, + "loss": 0.011562639474868774, + "step": 135520 + }, + { + "epoch": 19.237757274662883, + "grad_norm": 2.929205894470215, + "learning_rate": 8.077033356990774e-05, + "loss": 0.06092924475669861, + "step": 135530 + }, + { + "epoch": 19.239176721078778, + "grad_norm": 10.090849876403809, + "learning_rate": 8.076891412349185e-05, + "loss": 0.02417505532503128, + "step": 135540 + }, + { + "epoch": 19.240596167494676, + "grad_norm": 0.04496016725897789, + "learning_rate": 8.076749467707593e-05, + "loss": 0.013739706575870514, + "step": 135550 + }, + { + "epoch": 19.242015613910574, + "grad_norm": 2.531179666519165, + "learning_rate": 8.076607523066005e-05, + "loss": 0.023041173815727234, + "step": 135560 + }, + { + "epoch": 19.243435060326473, + "grad_norm": 0.288542240858078, + "learning_rate": 8.076465578424414e-05, + "loss": 0.025524574518203735, + "step": 135570 + }, + { + "epoch": 19.24485450674237, + "grad_norm": 0.27630212903022766, + "learning_rate": 8.076323633782825e-05, + "loss": 0.035649356245994565, + "step": 135580 + }, + { + "epoch": 19.24627395315827, 
+ "grad_norm": 0.21555322408676147, + "learning_rate": 8.076181689141235e-05, + "loss": 0.02202434986829758, + "step": 135590 + }, + { + "epoch": 19.247693399574167, + "grad_norm": 0.428025484085083, + "learning_rate": 8.076039744499646e-05, + "loss": 0.008564649522304535, + "step": 135600 + }, + { + "epoch": 19.249112845990062, + "grad_norm": 1.058066725730896, + "learning_rate": 8.075897799858056e-05, + "loss": 0.02624986469745636, + "step": 135610 + }, + { + "epoch": 19.25053229240596, + "grad_norm": 0.0745265781879425, + "learning_rate": 8.075755855216466e-05, + "loss": 0.023323173820972442, + "step": 135620 + }, + { + "epoch": 19.25195173882186, + "grad_norm": 10.500258445739746, + "learning_rate": 8.075613910574877e-05, + "loss": 0.02508750557899475, + "step": 135630 + }, + { + "epoch": 19.253371185237757, + "grad_norm": 0.005136394407600164, + "learning_rate": 8.075471965933287e-05, + "loss": 0.00155564583837986, + "step": 135640 + }, + { + "epoch": 19.254790631653655, + "grad_norm": 0.07520145177841187, + "learning_rate": 8.075330021291698e-05, + "loss": 0.01268192082643509, + "step": 135650 + }, + { + "epoch": 19.256210078069554, + "grad_norm": 0.08487255871295929, + "learning_rate": 8.075188076650106e-05, + "loss": 0.002196522429585457, + "step": 135660 + }, + { + "epoch": 19.257629524485452, + "grad_norm": 1.8649828433990479, + "learning_rate": 8.075046132008517e-05, + "loss": 0.0182554692029953, + "step": 135670 + }, + { + "epoch": 19.259048970901347, + "grad_norm": 0.21836072206497192, + "learning_rate": 8.074904187366927e-05, + "loss": 0.008518166840076447, + "step": 135680 + }, + { + "epoch": 19.260468417317245, + "grad_norm": 0.07416136562824249, + "learning_rate": 8.074762242725338e-05, + "loss": 0.020471793413162232, + "step": 135690 + }, + { + "epoch": 19.261887863733143, + "grad_norm": 0.04966128617525101, + "learning_rate": 8.074620298083748e-05, + "loss": 0.008542297780513764, + "step": 135700 + }, + { + "epoch": 19.26330731014904, + 
"grad_norm": 4.001149654388428, + "learning_rate": 8.074478353442157e-05, + "loss": 0.07088310718536377, + "step": 135710 + }, + { + "epoch": 19.26472675656494, + "grad_norm": 17.749656677246094, + "learning_rate": 8.074336408800568e-05, + "loss": 0.027709412574768066, + "step": 135720 + }, + { + "epoch": 19.26614620298084, + "grad_norm": 2.207138776779175, + "learning_rate": 8.074194464158978e-05, + "loss": 0.032978209853172305, + "step": 135730 + }, + { + "epoch": 19.267565649396737, + "grad_norm": 0.8686169981956482, + "learning_rate": 8.074052519517389e-05, + "loss": 0.02525452971458435, + "step": 135740 + }, + { + "epoch": 19.26898509581263, + "grad_norm": 0.48470228910446167, + "learning_rate": 8.073910574875799e-05, + "loss": 0.04213707149028778, + "step": 135750 + }, + { + "epoch": 19.27040454222853, + "grad_norm": 5.371331691741943, + "learning_rate": 8.073768630234209e-05, + "loss": 0.04031191170215607, + "step": 135760 + }, + { + "epoch": 19.271823988644428, + "grad_norm": 3.2471160888671875, + "learning_rate": 8.073626685592619e-05, + "loss": 0.012299670279026032, + "step": 135770 + }, + { + "epoch": 19.273243435060326, + "grad_norm": 0.3554064631462097, + "learning_rate": 8.07348474095103e-05, + "loss": 0.03185656070709229, + "step": 135780 + }, + { + "epoch": 19.274662881476225, + "grad_norm": 0.8942446708679199, + "learning_rate": 8.07334279630944e-05, + "loss": 0.020659103989601135, + "step": 135790 + }, + { + "epoch": 19.276082327892123, + "grad_norm": 0.14760670065879822, + "learning_rate": 8.07320085166785e-05, + "loss": 0.02072883993387222, + "step": 135800 + }, + { + "epoch": 19.27750177430802, + "grad_norm": 0.3239462971687317, + "learning_rate": 8.07305890702626e-05, + "loss": 0.002651532366871834, + "step": 135810 + }, + { + "epoch": 19.278921220723916, + "grad_norm": 8.159717559814453, + "learning_rate": 8.07291696238467e-05, + "loss": 0.014492425322532653, + "step": 135820 + }, + { + "epoch": 19.280340667139814, + "grad_norm": 
0.27178525924682617, + "learning_rate": 8.072775017743081e-05, + "loss": 0.018134912848472594, + "step": 135830 + }, + { + "epoch": 19.281760113555713, + "grad_norm": 0.029552889987826347, + "learning_rate": 8.072633073101491e-05, + "loss": 0.00932794064283371, + "step": 135840 + }, + { + "epoch": 19.28317955997161, + "grad_norm": 3.9056999683380127, + "learning_rate": 8.072491128459902e-05, + "loss": 0.007778584957122803, + "step": 135850 + }, + { + "epoch": 19.28459900638751, + "grad_norm": 2.5379421710968018, + "learning_rate": 8.07234918381831e-05, + "loss": 0.01643350124359131, + "step": 135860 + }, + { + "epoch": 19.286018452803408, + "grad_norm": 2.7979471683502197, + "learning_rate": 8.072207239176721e-05, + "loss": 0.022671455144882204, + "step": 135870 + }, + { + "epoch": 19.287437899219306, + "grad_norm": 4.062838554382324, + "learning_rate": 8.072065294535131e-05, + "loss": 0.036980432271957395, + "step": 135880 + }, + { + "epoch": 19.2888573456352, + "grad_norm": 0.6683847308158875, + "learning_rate": 8.071923349893542e-05, + "loss": 0.004612940177321434, + "step": 135890 + }, + { + "epoch": 19.2902767920511, + "grad_norm": 0.08301468193531036, + "learning_rate": 8.071781405251952e-05, + "loss": 0.03441115915775299, + "step": 135900 + }, + { + "epoch": 19.291696238466997, + "grad_norm": 1.0100123882293701, + "learning_rate": 8.071639460610362e-05, + "loss": 0.004451007023453713, + "step": 135910 + }, + { + "epoch": 19.293115684882896, + "grad_norm": 0.06388121843338013, + "learning_rate": 8.071497515968773e-05, + "loss": 0.003654952719807625, + "step": 135920 + }, + { + "epoch": 19.294535131298794, + "grad_norm": 10.2322359085083, + "learning_rate": 8.071355571327182e-05, + "loss": 0.04433683156967163, + "step": 135930 + }, + { + "epoch": 19.295954577714692, + "grad_norm": 0.38884854316711426, + "learning_rate": 8.071213626685594e-05, + "loss": 0.030773085355758668, + "step": 135940 + }, + { + "epoch": 19.29737402413059, + "grad_norm": 
0.6468043923377991, + "learning_rate": 8.071071682044003e-05, + "loss": 0.02660565674304962, + "step": 135950 + }, + { + "epoch": 19.298793470546485, + "grad_norm": 0.015155358240008354, + "learning_rate": 8.070929737402414e-05, + "loss": 0.015232937037944793, + "step": 135960 + }, + { + "epoch": 19.300212916962384, + "grad_norm": 0.1312216967344284, + "learning_rate": 8.070787792760823e-05, + "loss": 0.06477776765823365, + "step": 135970 + }, + { + "epoch": 19.301632363378282, + "grad_norm": 0.007115287706255913, + "learning_rate": 8.070645848119234e-05, + "loss": 0.00944189801812172, + "step": 135980 + }, + { + "epoch": 19.30305180979418, + "grad_norm": 2.0040953159332275, + "learning_rate": 8.070503903477644e-05, + "loss": 0.02078152447938919, + "step": 135990 + }, + { + "epoch": 19.30447125621008, + "grad_norm": 0.0038018589839339256, + "learning_rate": 8.070361958836055e-05, + "loss": 0.004979583621025086, + "step": 136000 + }, + { + "epoch": 19.30447125621008, + "eval_accuracy": 0.9849939594328225, + "eval_loss": 0.0570266917347908, + "eval_runtime": 33.364, + "eval_samples_per_second": 471.376, + "eval_steps_per_second": 14.746, + "step": 136000 + }, + { + "epoch": 19.305890702625977, + "grad_norm": 0.06633606553077698, + "learning_rate": 8.070220014194464e-05, + "loss": 0.0009724553674459457, + "step": 136010 + }, + { + "epoch": 19.307310149041875, + "grad_norm": 3.7323360443115234, + "learning_rate": 8.070078069552874e-05, + "loss": 0.009088954329490662, + "step": 136020 + }, + { + "epoch": 19.30872959545777, + "grad_norm": 0.17580485343933105, + "learning_rate": 8.069936124911285e-05, + "loss": 0.010520386695861816, + "step": 136030 + }, + { + "epoch": 19.310149041873668, + "grad_norm": 3.0736656188964844, + "learning_rate": 8.069794180269695e-05, + "loss": 0.010726400464773179, + "step": 136040 + }, + { + "epoch": 19.311568488289566, + "grad_norm": 12.675605773925781, + "learning_rate": 8.069652235628106e-05, + "loss": 0.020539800822734832, + "step": 
136050 + }, + { + "epoch": 19.312987934705465, + "grad_norm": 14.800597190856934, + "learning_rate": 8.069510290986516e-05, + "loss": 0.03159726858139038, + "step": 136060 + }, + { + "epoch": 19.314407381121363, + "grad_norm": 0.014132910408079624, + "learning_rate": 8.069368346344926e-05, + "loss": 0.007642312347888947, + "step": 136070 + }, + { + "epoch": 19.31582682753726, + "grad_norm": 0.3927164077758789, + "learning_rate": 8.069226401703335e-05, + "loss": 0.02600858509540558, + "step": 136080 + }, + { + "epoch": 19.31724627395316, + "grad_norm": 0.3638625741004944, + "learning_rate": 8.069084457061746e-05, + "loss": 0.013808271288871765, + "step": 136090 + }, + { + "epoch": 19.318665720369054, + "grad_norm": 0.25037309527397156, + "learning_rate": 8.068942512420156e-05, + "loss": 0.0123081237077713, + "step": 136100 + }, + { + "epoch": 19.320085166784953, + "grad_norm": 0.08696846663951874, + "learning_rate": 8.068800567778567e-05, + "loss": 0.0217467337846756, + "step": 136110 + }, + { + "epoch": 19.32150461320085, + "grad_norm": 0.10587045550346375, + "learning_rate": 8.068658623136977e-05, + "loss": 0.010732536017894746, + "step": 136120 + }, + { + "epoch": 19.32292405961675, + "grad_norm": 2.229795455932617, + "learning_rate": 8.068516678495387e-05, + "loss": 0.019755491614341737, + "step": 136130 + }, + { + "epoch": 19.324343506032648, + "grad_norm": 0.13305720686912537, + "learning_rate": 8.068374733853798e-05, + "loss": 0.011332526803016663, + "step": 136140 + }, + { + "epoch": 19.325762952448546, + "grad_norm": 0.05416805297136307, + "learning_rate": 8.068232789212208e-05, + "loss": 0.010781797766685485, + "step": 136150 + }, + { + "epoch": 19.327182398864444, + "grad_norm": 0.052029553800821304, + "learning_rate": 8.068090844570619e-05, + "loss": 0.025164490938186644, + "step": 136160 + }, + { + "epoch": 19.32860184528034, + "grad_norm": 0.32842883467674255, + "learning_rate": 8.067948899929027e-05, + "loss": 0.00847855508327484, + "step": 136170 + 
}, + { + "epoch": 19.330021291696237, + "grad_norm": 1.8143550157546997, + "learning_rate": 8.067806955287438e-05, + "loss": 0.015515325963497162, + "step": 136180 + }, + { + "epoch": 19.331440738112136, + "grad_norm": 8.770236015319824, + "learning_rate": 8.067665010645848e-05, + "loss": 0.03948104083538055, + "step": 136190 + }, + { + "epoch": 19.332860184528034, + "grad_norm": 11.575773239135742, + "learning_rate": 8.067523066004259e-05, + "loss": 0.042207008600234984, + "step": 136200 + }, + { + "epoch": 19.334279630943932, + "grad_norm": 0.3276245892047882, + "learning_rate": 8.067381121362669e-05, + "loss": 0.023077908158302306, + "step": 136210 + }, + { + "epoch": 19.33569907735983, + "grad_norm": 2.758852243423462, + "learning_rate": 8.067239176721078e-05, + "loss": 0.08552910685539246, + "step": 136220 + }, + { + "epoch": 19.33711852377573, + "grad_norm": 3.7656126022338867, + "learning_rate": 8.06709723207949e-05, + "loss": 0.011476999521255494, + "step": 136230 + }, + { + "epoch": 19.338537970191624, + "grad_norm": 8.272939682006836, + "learning_rate": 8.066955287437899e-05, + "loss": 0.02469935715198517, + "step": 136240 + }, + { + "epoch": 19.339957416607522, + "grad_norm": 0.0628243237733841, + "learning_rate": 8.06681334279631e-05, + "loss": 0.007306870818138122, + "step": 136250 + }, + { + "epoch": 19.34137686302342, + "grad_norm": 10.782951354980469, + "learning_rate": 8.06667139815472e-05, + "loss": 0.011383648216724395, + "step": 136260 + }, + { + "epoch": 19.34279630943932, + "grad_norm": 0.1845972090959549, + "learning_rate": 8.06652945351313e-05, + "loss": 0.020281805098056792, + "step": 136270 + }, + { + "epoch": 19.344215755855217, + "grad_norm": 1.6857872009277344, + "learning_rate": 8.06638750887154e-05, + "loss": 0.0024313628673553467, + "step": 136280 + }, + { + "epoch": 19.345635202271115, + "grad_norm": 0.6315666437149048, + "learning_rate": 8.06624556422995e-05, + "loss": 0.027705147862434387, + "step": 136290 + }, + { + "epoch": 
19.347054648687013, + "grad_norm": 5.49241828918457, + "learning_rate": 8.06610361958836e-05, + "loss": 0.006352344155311584, + "step": 136300 + }, + { + "epoch": 19.348474095102908, + "grad_norm": 0.10188443958759308, + "learning_rate": 8.065961674946771e-05, + "loss": 0.023860082030296326, + "step": 136310 + }, + { + "epoch": 19.349893541518806, + "grad_norm": 2.3538029193878174, + "learning_rate": 8.065819730305183e-05, + "loss": 0.0065624013543128966, + "step": 136320 + }, + { + "epoch": 19.351312987934705, + "grad_norm": 5.646554946899414, + "learning_rate": 8.065677785663591e-05, + "loss": 0.012268754839897155, + "step": 136330 + }, + { + "epoch": 19.352732434350603, + "grad_norm": 0.18078984320163727, + "learning_rate": 8.065535841022002e-05, + "loss": 0.008300693333148956, + "step": 136340 + }, + { + "epoch": 19.3541518807665, + "grad_norm": 0.032229866832494736, + "learning_rate": 8.065393896380412e-05, + "loss": 0.01744912415742874, + "step": 136350 + }, + { + "epoch": 19.3555713271824, + "grad_norm": 0.04063647240400314, + "learning_rate": 8.065251951738823e-05, + "loss": 0.010966229438781738, + "step": 136360 + }, + { + "epoch": 19.356990773598298, + "grad_norm": 0.2905770540237427, + "learning_rate": 8.065110007097233e-05, + "loss": 0.03950079083442688, + "step": 136370 + }, + { + "epoch": 19.358410220014193, + "grad_norm": 0.28508496284484863, + "learning_rate": 8.064968062455642e-05, + "loss": 0.01320267766714096, + "step": 136380 + }, + { + "epoch": 19.35982966643009, + "grad_norm": 0.727344274520874, + "learning_rate": 8.064826117814052e-05, + "loss": 0.020682677626609802, + "step": 136390 + }, + { + "epoch": 19.36124911284599, + "grad_norm": 0.1807493418455124, + "learning_rate": 8.064684173172463e-05, + "loss": 0.009905187785625458, + "step": 136400 + }, + { + "epoch": 19.362668559261888, + "grad_norm": 10.234251976013184, + "learning_rate": 8.064542228530874e-05, + "loss": 0.04010664820671082, + "step": 136410 + }, + { + "epoch": 
19.364088005677786, + "grad_norm": 0.504630446434021, + "learning_rate": 8.064400283889284e-05, + "loss": 0.013863730430603027, + "step": 136420 + }, + { + "epoch": 19.365507452093684, + "grad_norm": 0.6506510376930237, + "learning_rate": 8.064258339247694e-05, + "loss": 0.0075215592980384825, + "step": 136430 + }, + { + "epoch": 19.366926898509583, + "grad_norm": 0.2712944447994232, + "learning_rate": 8.064116394606103e-05, + "loss": 0.01054297536611557, + "step": 136440 + }, + { + "epoch": 19.368346344925477, + "grad_norm": 0.02783302217721939, + "learning_rate": 8.063974449964515e-05, + "loss": 0.01245361790060997, + "step": 136450 + }, + { + "epoch": 19.369765791341376, + "grad_norm": 0.8528205156326294, + "learning_rate": 8.063832505322924e-05, + "loss": 0.05546398758888245, + "step": 136460 + }, + { + "epoch": 19.371185237757274, + "grad_norm": 3.5433225631713867, + "learning_rate": 8.063690560681335e-05, + "loss": 0.010478836297988892, + "step": 136470 + }, + { + "epoch": 19.372604684173172, + "grad_norm": 2.782099485397339, + "learning_rate": 8.063548616039744e-05, + "loss": 0.006739428639411927, + "step": 136480 + }, + { + "epoch": 19.37402413058907, + "grad_norm": 0.13132494688034058, + "learning_rate": 8.063406671398155e-05, + "loss": 0.0035120531916618346, + "step": 136490 + }, + { + "epoch": 19.37544357700497, + "grad_norm": 0.8922194838523865, + "learning_rate": 8.063264726756566e-05, + "loss": 0.018763212859630583, + "step": 136500 + }, + { + "epoch": 19.37544357700497, + "eval_accuracy": 0.9877916958097539, + "eval_loss": 0.042008642107248306, + "eval_runtime": 33.4344, + "eval_samples_per_second": 470.383, + "eval_steps_per_second": 14.715, + "step": 136500 + }, + { + "epoch": 19.376863023420867, + "grad_norm": 1.9103628396987915, + "learning_rate": 8.063122782114976e-05, + "loss": 0.03362273871898651, + "step": 136510 + }, + { + "epoch": 19.378282469836762, + "grad_norm": 0.08694620430469513, + "learning_rate": 8.062980837473387e-05, + "loss": 
0.021892617642879485, + "step": 136520 + }, + { + "epoch": 19.37970191625266, + "grad_norm": 6.578057765960693, + "learning_rate": 8.062838892831795e-05, + "loss": 0.026524189114570617, + "step": 136530 + }, + { + "epoch": 19.38112136266856, + "grad_norm": 0.4580860137939453, + "learning_rate": 8.062696948190206e-05, + "loss": 0.018569007515907288, + "step": 136540 + }, + { + "epoch": 19.382540809084457, + "grad_norm": 7.30665397644043, + "learning_rate": 8.062555003548616e-05, + "loss": 0.03951026201248169, + "step": 136550 + }, + { + "epoch": 19.383960255500355, + "grad_norm": 6.436558246612549, + "learning_rate": 8.062413058907027e-05, + "loss": 0.02224210202693939, + "step": 136560 + }, + { + "epoch": 19.385379701916253, + "grad_norm": 0.022254034876823425, + "learning_rate": 8.062271114265437e-05, + "loss": 0.06707167029380798, + "step": 136570 + }, + { + "epoch": 19.386799148332152, + "grad_norm": 1.2745411396026611, + "learning_rate": 8.062129169623847e-05, + "loss": 0.022767841815948486, + "step": 136580 + }, + { + "epoch": 19.388218594748047, + "grad_norm": 0.22142945230007172, + "learning_rate": 8.061987224982258e-05, + "loss": 0.06356111168861389, + "step": 136590 + }, + { + "epoch": 19.389638041163945, + "grad_norm": 0.017324067652225494, + "learning_rate": 8.061845280340667e-05, + "loss": 0.008104667067527771, + "step": 136600 + }, + { + "epoch": 19.391057487579843, + "grad_norm": 1.24216628074646, + "learning_rate": 8.061703335699078e-05, + "loss": 0.01950208395719528, + "step": 136610 + }, + { + "epoch": 19.39247693399574, + "grad_norm": 1.775840401649475, + "learning_rate": 8.061561391057488e-05, + "loss": 0.005649669468402863, + "step": 136620 + }, + { + "epoch": 19.39389638041164, + "grad_norm": 0.4429466128349304, + "learning_rate": 8.061419446415898e-05, + "loss": 0.0057801961898803714, + "step": 136630 + }, + { + "epoch": 19.395315826827538, + "grad_norm": 0.035086508840322495, + "learning_rate": 8.061277501774308e-05, + "loss": 
0.010500229895114899, + "step": 136640 + }, + { + "epoch": 19.396735273243436, + "grad_norm": 0.6924319863319397, + "learning_rate": 8.061135557132719e-05, + "loss": 0.004986953735351562, + "step": 136650 + }, + { + "epoch": 19.39815471965933, + "grad_norm": 1.0726513862609863, + "learning_rate": 8.060993612491129e-05, + "loss": 0.040648224949836734, + "step": 136660 + }, + { + "epoch": 19.39957416607523, + "grad_norm": 8.95368766784668, + "learning_rate": 8.06085166784954e-05, + "loss": 0.02026577889919281, + "step": 136670 + }, + { + "epoch": 19.400993612491128, + "grad_norm": 0.01722809486091137, + "learning_rate": 8.06070972320795e-05, + "loss": 0.01899726241827011, + "step": 136680 + }, + { + "epoch": 19.402413058907026, + "grad_norm": 9.28128719329834, + "learning_rate": 8.060567778566359e-05, + "loss": 0.016845521330833436, + "step": 136690 + }, + { + "epoch": 19.403832505322924, + "grad_norm": 6.689579486846924, + "learning_rate": 8.06042583392477e-05, + "loss": 0.03843151926994324, + "step": 136700 + }, + { + "epoch": 19.405251951738823, + "grad_norm": 0.0509391613304615, + "learning_rate": 8.06028388928318e-05, + "loss": 0.008636415004730225, + "step": 136710 + }, + { + "epoch": 19.40667139815472, + "grad_norm": 0.10041744261980057, + "learning_rate": 8.060141944641591e-05, + "loss": 0.03410317003726959, + "step": 136720 + }, + { + "epoch": 19.408090844570616, + "grad_norm": 1.3901797533035278, + "learning_rate": 8.060000000000001e-05, + "loss": 0.011419706791639329, + "step": 136730 + }, + { + "epoch": 19.409510290986514, + "grad_norm": 0.08800628036260605, + "learning_rate": 8.05985805535841e-05, + "loss": 0.019385628402233124, + "step": 136740 + }, + { + "epoch": 19.410929737402412, + "grad_norm": 2.3375251293182373, + "learning_rate": 8.05971611071682e-05, + "loss": 0.040280142426490785, + "step": 136750 + }, + { + "epoch": 19.41234918381831, + "grad_norm": 1.1224855184555054, + "learning_rate": 8.059574166075231e-05, + "loss": 0.01825178861618042, + 
"step": 136760 + }, + { + "epoch": 19.41376863023421, + "grad_norm": 2.9055190086364746, + "learning_rate": 8.059432221433641e-05, + "loss": 0.03754159212112427, + "step": 136770 + }, + { + "epoch": 19.415188076650107, + "grad_norm": 11.777843475341797, + "learning_rate": 8.059290276792052e-05, + "loss": 0.05444482564926147, + "step": 136780 + }, + { + "epoch": 19.416607523066006, + "grad_norm": 0.005288857501000166, + "learning_rate": 8.059148332150462e-05, + "loss": 0.06572566032409669, + "step": 136790 + }, + { + "epoch": 19.4180269694819, + "grad_norm": 0.06092311069369316, + "learning_rate": 8.059006387508872e-05, + "loss": 0.039466175436973575, + "step": 136800 + }, + { + "epoch": 19.4194464158978, + "grad_norm": 0.3441387116909027, + "learning_rate": 8.058864442867283e-05, + "loss": 0.032243740558624265, + "step": 136810 + }, + { + "epoch": 19.420865862313697, + "grad_norm": 0.6930990219116211, + "learning_rate": 8.058722498225692e-05, + "loss": 0.005177357420325279, + "step": 136820 + }, + { + "epoch": 19.422285308729595, + "grad_norm": 7.753748416900635, + "learning_rate": 8.058580553584104e-05, + "loss": 0.0090839721262455, + "step": 136830 + }, + { + "epoch": 19.423704755145494, + "grad_norm": 0.012613370083272457, + "learning_rate": 8.058438608942512e-05, + "loss": 0.0094209223985672, + "step": 136840 + }, + { + "epoch": 19.425124201561392, + "grad_norm": 6.884276866912842, + "learning_rate": 8.058296664300923e-05, + "loss": 0.013114632666110992, + "step": 136850 + }, + { + "epoch": 19.42654364797729, + "grad_norm": 0.020718177780508995, + "learning_rate": 8.058154719659333e-05, + "loss": 0.006486819684505462, + "step": 136860 + }, + { + "epoch": 19.427963094393185, + "grad_norm": 3.612637996673584, + "learning_rate": 8.058012775017744e-05, + "loss": 0.022154442965984344, + "step": 136870 + }, + { + "epoch": 19.429382540809083, + "grad_norm": 5.871427536010742, + "learning_rate": 8.057870830376154e-05, + "loss": 0.04860754907131195, + "step": 136880 + 
}, + { + "epoch": 19.43080198722498, + "grad_norm": 1.2898229360580444, + "learning_rate": 8.057728885734563e-05, + "loss": 0.029851025342941283, + "step": 136890 + }, + { + "epoch": 19.43222143364088, + "grad_norm": 0.04951537400484085, + "learning_rate": 8.057586941092974e-05, + "loss": 0.006623544543981552, + "step": 136900 + }, + { + "epoch": 19.433640880056778, + "grad_norm": 0.006419728510081768, + "learning_rate": 8.057444996451384e-05, + "loss": 0.007638537138700485, + "step": 136910 + }, + { + "epoch": 19.435060326472676, + "grad_norm": 2.1527960300445557, + "learning_rate": 8.057303051809795e-05, + "loss": 0.004995374009013176, + "step": 136920 + }, + { + "epoch": 19.436479772888575, + "grad_norm": 0.4171311855316162, + "learning_rate": 8.057161107168205e-05, + "loss": 0.02046493738889694, + "step": 136930 + }, + { + "epoch": 19.43789921930447, + "grad_norm": 3.070953607559204, + "learning_rate": 8.057019162526615e-05, + "loss": 0.00913369506597519, + "step": 136940 + }, + { + "epoch": 19.439318665720368, + "grad_norm": 0.25687679648399353, + "learning_rate": 8.056877217885024e-05, + "loss": 0.005855118855834007, + "step": 136950 + }, + { + "epoch": 19.440738112136266, + "grad_norm": 0.04907587170600891, + "learning_rate": 8.056735273243436e-05, + "loss": 0.022059416770935057, + "step": 136960 + }, + { + "epoch": 19.442157558552164, + "grad_norm": 1.4365266561508179, + "learning_rate": 8.056593328601845e-05, + "loss": 0.025617489218711854, + "step": 136970 + }, + { + "epoch": 19.443577004968063, + "grad_norm": 0.06065221130847931, + "learning_rate": 8.056451383960256e-05, + "loss": 0.011717283725738525, + "step": 136980 + }, + { + "epoch": 19.44499645138396, + "grad_norm": 0.294193834066391, + "learning_rate": 8.056309439318666e-05, + "loss": 0.017100825905799866, + "step": 136990 + }, + { + "epoch": 19.44641589779986, + "grad_norm": 0.29871729016304016, + "learning_rate": 8.056167494677076e-05, + "loss": 0.022216755151748657, + "step": 137000 + }, + { + 
"epoch": 19.44641589779986, + "eval_accuracy": 0.9853118840211101, + "eval_loss": 0.056656353175640106, + "eval_runtime": 33.7296, + "eval_samples_per_second": 466.268, + "eval_steps_per_second": 14.587, + "step": 137000 + }, + { + "epoch": 19.447835344215754, + "grad_norm": 15.346325874328613, + "learning_rate": 8.056025550035487e-05, + "loss": 0.05268167853355408, + "step": 137010 + }, + { + "epoch": 19.449254790631652, + "grad_norm": 0.06122482940554619, + "learning_rate": 8.055883605393897e-05, + "loss": 0.001168125495314598, + "step": 137020 + }, + { + "epoch": 19.45067423704755, + "grad_norm": 0.06554862856864929, + "learning_rate": 8.055741660752308e-05, + "loss": 0.03985010087490082, + "step": 137030 + }, + { + "epoch": 19.45209368346345, + "grad_norm": 0.04417649656534195, + "learning_rate": 8.055599716110718e-05, + "loss": 0.0025506459176540376, + "step": 137040 + }, + { + "epoch": 19.453513129879347, + "grad_norm": 0.17910021543502808, + "learning_rate": 8.055457771469127e-05, + "loss": 0.011689558625221252, + "step": 137050 + }, + { + "epoch": 19.454932576295246, + "grad_norm": 7.048887252807617, + "learning_rate": 8.055315826827537e-05, + "loss": 0.011845612525939941, + "step": 137060 + }, + { + "epoch": 19.456352022711144, + "grad_norm": 0.5460303425788879, + "learning_rate": 8.055188076650107e-05, + "loss": 0.04180977344512939, + "step": 137070 + }, + { + "epoch": 19.45777146912704, + "grad_norm": 0.05867021530866623, + "learning_rate": 8.055046132008517e-05, + "loss": 0.026071444153785706, + "step": 137080 + }, + { + "epoch": 19.459190915542937, + "grad_norm": 0.015164356678724289, + "learning_rate": 8.054904187366928e-05, + "loss": 0.00244579054415226, + "step": 137090 + }, + { + "epoch": 19.460610361958835, + "grad_norm": 1.2402942180633545, + "learning_rate": 8.054762242725337e-05, + "loss": 0.013297773897647858, + "step": 137100 + }, + { + "epoch": 19.462029808374734, + "grad_norm": 0.3962368369102478, + "learning_rate": 8.054620298083749e-05, + 
"loss": 0.0031079955399036406, + "step": 137110 + }, + { + "epoch": 19.463449254790632, + "grad_norm": 0.12435641884803772, + "learning_rate": 8.054478353442157e-05, + "loss": 0.009549376368522645, + "step": 137120 + }, + { + "epoch": 19.46486870120653, + "grad_norm": 0.3245566487312317, + "learning_rate": 8.054336408800568e-05, + "loss": 0.025057733058929443, + "step": 137130 + }, + { + "epoch": 19.46628814762243, + "grad_norm": 0.7982626557350159, + "learning_rate": 8.054194464158978e-05, + "loss": 0.022643449902534484, + "step": 137140 + }, + { + "epoch": 19.467707594038323, + "grad_norm": 0.22677786648273468, + "learning_rate": 8.054052519517389e-05, + "loss": 0.019562676548957825, + "step": 137150 + }, + { + "epoch": 19.46912704045422, + "grad_norm": 0.8642492294311523, + "learning_rate": 8.0539105748758e-05, + "loss": 0.0022503107786178587, + "step": 137160 + }, + { + "epoch": 19.47054648687012, + "grad_norm": 0.018497856333851814, + "learning_rate": 8.053768630234208e-05, + "loss": 0.015945518016815187, + "step": 137170 + }, + { + "epoch": 19.471965933286018, + "grad_norm": 0.13687346875667572, + "learning_rate": 8.05362668559262e-05, + "loss": 0.02993578016757965, + "step": 137180 + }, + { + "epoch": 19.473385379701917, + "grad_norm": 0.4365893304347992, + "learning_rate": 8.053484740951029e-05, + "loss": 0.013093468546867371, + "step": 137190 + }, + { + "epoch": 19.474804826117815, + "grad_norm": 0.4910050332546234, + "learning_rate": 8.05334279630944e-05, + "loss": 0.030686333775520325, + "step": 137200 + }, + { + "epoch": 19.476224272533713, + "grad_norm": 0.07861913740634918, + "learning_rate": 8.05320085166785e-05, + "loss": 0.036781036853790285, + "step": 137210 + }, + { + "epoch": 19.477643718949608, + "grad_norm": 0.2435026317834854, + "learning_rate": 8.05305890702626e-05, + "loss": 0.008760906010866164, + "step": 137220 + }, + { + "epoch": 19.479063165365506, + "grad_norm": 0.11083705723285675, + "learning_rate": 8.05291696238467e-05, + "loss": 
0.025510281324386597, + "step": 137230 + }, + { + "epoch": 19.480482611781405, + "grad_norm": 6.562149524688721, + "learning_rate": 8.05277501774308e-05, + "loss": 0.013110196590423584, + "step": 137240 + }, + { + "epoch": 19.481902058197303, + "grad_norm": 0.015752902254462242, + "learning_rate": 8.052633073101492e-05, + "loss": 0.004713873565196991, + "step": 137250 + }, + { + "epoch": 19.4833215046132, + "grad_norm": 0.4762270152568817, + "learning_rate": 8.052491128459901e-05, + "loss": 0.0068397440016269686, + "step": 137260 + }, + { + "epoch": 19.4847409510291, + "grad_norm": 1.1166408061981201, + "learning_rate": 8.052349183818311e-05, + "loss": 0.006467262655496598, + "step": 137270 + }, + { + "epoch": 19.486160397444998, + "grad_norm": 2.362943649291992, + "learning_rate": 8.052207239176721e-05, + "loss": 0.0054939169436693195, + "step": 137280 + }, + { + "epoch": 19.487579843860892, + "grad_norm": 6.425047874450684, + "learning_rate": 8.052065294535132e-05, + "loss": 0.05379377603530884, + "step": 137290 + }, + { + "epoch": 19.48899929027679, + "grad_norm": 0.8402780294418335, + "learning_rate": 8.051923349893542e-05, + "loss": 0.002586003392934799, + "step": 137300 + }, + { + "epoch": 19.49041873669269, + "grad_norm": 2.7678849697113037, + "learning_rate": 8.051795599716112e-05, + "loss": 0.0818613350391388, + "step": 137310 + }, + { + "epoch": 19.491838183108587, + "grad_norm": 0.03431839123368263, + "learning_rate": 8.051653655074521e-05, + "loss": 0.013517959415912629, + "step": 137320 + }, + { + "epoch": 19.493257629524486, + "grad_norm": 0.013028129935264587, + "learning_rate": 8.051511710432932e-05, + "loss": 0.03265658915042877, + "step": 137330 + }, + { + "epoch": 19.494677075940384, + "grad_norm": 2.9575202465057373, + "learning_rate": 8.051369765791342e-05, + "loss": 0.015177388489246369, + "step": 137340 + }, + { + "epoch": 19.496096522356282, + "grad_norm": 0.03283777832984924, + "learning_rate": 8.051227821149752e-05, + "loss": 
0.034216710925102235, + "step": 137350 + }, + { + "epoch": 19.497515968772177, + "grad_norm": 0.10726157575845718, + "learning_rate": 8.051085876508162e-05, + "loss": 0.013336297869682313, + "step": 137360 + }, + { + "epoch": 19.498935415188075, + "grad_norm": 0.10614597797393799, + "learning_rate": 8.050943931866573e-05, + "loss": 0.0020912285894155503, + "step": 137370 + }, + { + "epoch": 19.500354861603974, + "grad_norm": 0.15989543497562408, + "learning_rate": 8.050801987224982e-05, + "loss": 0.005193191021680832, + "step": 137380 + }, + { + "epoch": 19.501774308019872, + "grad_norm": 0.03939535468816757, + "learning_rate": 8.050660042583393e-05, + "loss": 0.001727219671010971, + "step": 137390 + }, + { + "epoch": 19.50319375443577, + "grad_norm": 0.04129151254892349, + "learning_rate": 8.050518097941803e-05, + "loss": 0.009687118232250214, + "step": 137400 + }, + { + "epoch": 19.50461320085167, + "grad_norm": 9.898896217346191, + "learning_rate": 8.050376153300213e-05, + "loss": 0.02417587786912918, + "step": 137410 + }, + { + "epoch": 19.506032647267567, + "grad_norm": 0.6055755019187927, + "learning_rate": 8.050234208658624e-05, + "loss": 0.012402527779340745, + "step": 137420 + }, + { + "epoch": 19.50745209368346, + "grad_norm": 0.5404500365257263, + "learning_rate": 8.050092264017034e-05, + "loss": 0.01648992598056793, + "step": 137430 + }, + { + "epoch": 19.50887154009936, + "grad_norm": 4.68674898147583, + "learning_rate": 8.049950319375445e-05, + "loss": 0.028700920939445495, + "step": 137440 + }, + { + "epoch": 19.51029098651526, + "grad_norm": 6.661839485168457, + "learning_rate": 8.049808374733853e-05, + "loss": 0.023427054286003113, + "step": 137450 + }, + { + "epoch": 19.511710432931157, + "grad_norm": 0.07466240972280502, + "learning_rate": 8.049666430092264e-05, + "loss": 0.015752318501472472, + "step": 137460 + }, + { + "epoch": 19.513129879347055, + "grad_norm": 0.1380309760570526, + "learning_rate": 8.049524485450674e-05, + "loss": 
0.009265802055597305, + "step": 137470 + }, + { + "epoch": 19.514549325762953, + "grad_norm": 0.3679251968860626, + "learning_rate": 8.049382540809085e-05, + "loss": 0.002256740629673004, + "step": 137480 + }, + { + "epoch": 19.51596877217885, + "grad_norm": 0.18063372373580933, + "learning_rate": 8.049240596167495e-05, + "loss": 0.0041745990514755246, + "step": 137490 + }, + { + "epoch": 19.517388218594746, + "grad_norm": 8.763851165771484, + "learning_rate": 8.049098651525905e-05, + "loss": 0.0062512621283531185, + "step": 137500 + }, + { + "epoch": 19.517388218594746, + "eval_accuracy": 0.988046035480384, + "eval_loss": 0.05227066949009895, + "eval_runtime": 34.1858, + "eval_samples_per_second": 460.045, + "eval_steps_per_second": 14.392, + "step": 137500 + }, + { + "epoch": 19.518807665010645, + "grad_norm": 4.479283809661865, + "learning_rate": 8.048956706884316e-05, + "loss": 0.007906591892242432, + "step": 137510 + }, + { + "epoch": 19.520227111426543, + "grad_norm": 4.453495025634766, + "learning_rate": 8.048814762242725e-05, + "loss": 0.007586041092872619, + "step": 137520 + }, + { + "epoch": 19.52164655784244, + "grad_norm": 2.1542985439300537, + "learning_rate": 8.048672817601137e-05, + "loss": 0.01741510033607483, + "step": 137530 + }, + { + "epoch": 19.52306600425834, + "grad_norm": 0.005334902089089155, + "learning_rate": 8.048530872959546e-05, + "loss": 0.02001422792673111, + "step": 137540 + }, + { + "epoch": 19.524485450674238, + "grad_norm": 21.645727157592773, + "learning_rate": 8.048388928317956e-05, + "loss": 0.029499968886375426, + "step": 137550 + }, + { + "epoch": 19.525904897090136, + "grad_norm": 7.598186492919922, + "learning_rate": 8.048246983676366e-05, + "loss": 0.077846360206604, + "step": 137560 + }, + { + "epoch": 19.52732434350603, + "grad_norm": 0.08335864543914795, + "learning_rate": 8.048105039034777e-05, + "loss": 0.0755756139755249, + "step": 137570 + }, + { + "epoch": 19.52874378992193, + "grad_norm": 11.596985816955566, + 
"learning_rate": 8.047963094393187e-05, + "loss": 0.04176556766033172, + "step": 137580 + }, + { + "epoch": 19.530163236337827, + "grad_norm": 0.6123652458190918, + "learning_rate": 8.047821149751598e-05, + "loss": 0.005750620737671852, + "step": 137590 + }, + { + "epoch": 19.531582682753726, + "grad_norm": 0.0034010582603514194, + "learning_rate": 8.047679205110007e-05, + "loss": 0.002270437404513359, + "step": 137600 + }, + { + "epoch": 19.533002129169624, + "grad_norm": 0.09473180025815964, + "learning_rate": 8.047537260468417e-05, + "loss": 0.006962215900421143, + "step": 137610 + }, + { + "epoch": 19.534421575585522, + "grad_norm": 1.2294211387634277, + "learning_rate": 8.047395315826828e-05, + "loss": 0.023443740606307984, + "step": 137620 + }, + { + "epoch": 19.53584102200142, + "grad_norm": 0.6309512853622437, + "learning_rate": 8.047253371185238e-05, + "loss": 0.03320013284683228, + "step": 137630 + }, + { + "epoch": 19.537260468417315, + "grad_norm": 2.6269350051879883, + "learning_rate": 8.047111426543649e-05, + "loss": 0.022580428421497344, + "step": 137640 + }, + { + "epoch": 19.538679914833214, + "grad_norm": 14.174617767333984, + "learning_rate": 8.046969481902059e-05, + "loss": 0.031341654062271115, + "step": 137650 + }, + { + "epoch": 19.540099361249112, + "grad_norm": 0.0707431212067604, + "learning_rate": 8.046827537260469e-05, + "loss": 0.0052914883941411976, + "step": 137660 + }, + { + "epoch": 19.54151880766501, + "grad_norm": 6.212255954742432, + "learning_rate": 8.046685592618878e-05, + "loss": 0.008136822283267975, + "step": 137670 + }, + { + "epoch": 19.54293825408091, + "grad_norm": 0.12009210139513016, + "learning_rate": 8.04654364797729e-05, + "loss": 0.02860853374004364, + "step": 137680 + }, + { + "epoch": 19.544357700496807, + "grad_norm": 0.17526400089263916, + "learning_rate": 8.046401703335699e-05, + "loss": 0.0434247076511383, + "step": 137690 + }, + { + "epoch": 19.545777146912705, + "grad_norm": 0.8788767457008362, + 
"learning_rate": 8.04625975869411e-05, + "loss": 0.03675092458724975, + "step": 137700 + }, + { + "epoch": 19.5471965933286, + "grad_norm": 3.4288830757141113, + "learning_rate": 8.04611781405252e-05, + "loss": 0.022850652039051057, + "step": 137710 + }, + { + "epoch": 19.5486160397445, + "grad_norm": 12.267441749572754, + "learning_rate": 8.04597586941093e-05, + "loss": 0.04059075117111206, + "step": 137720 + }, + { + "epoch": 19.550035486160397, + "grad_norm": 6.900718688964844, + "learning_rate": 8.045833924769341e-05, + "loss": 0.0090223990380764, + "step": 137730 + }, + { + "epoch": 19.551454932576295, + "grad_norm": 0.49269208312034607, + "learning_rate": 8.04569198012775e-05, + "loss": 0.05740405917167664, + "step": 137740 + }, + { + "epoch": 19.552874378992193, + "grad_norm": 15.897799491882324, + "learning_rate": 8.045550035486162e-05, + "loss": 0.030545425415039063, + "step": 137750 + }, + { + "epoch": 19.55429382540809, + "grad_norm": 0.03522428870201111, + "learning_rate": 8.04540809084457e-05, + "loss": 0.008827247470617295, + "step": 137760 + }, + { + "epoch": 19.55571327182399, + "grad_norm": 0.38180023431777954, + "learning_rate": 8.045266146202981e-05, + "loss": 0.0034719817340373993, + "step": 137770 + }, + { + "epoch": 19.557132718239885, + "grad_norm": 0.09027215838432312, + "learning_rate": 8.045124201561391e-05, + "loss": 0.012767669558525086, + "step": 137780 + }, + { + "epoch": 19.558552164655783, + "grad_norm": 14.821966171264648, + "learning_rate": 8.044982256919802e-05, + "loss": 0.024404963850975035, + "step": 137790 + }, + { + "epoch": 19.55997161107168, + "grad_norm": 0.0908409133553505, + "learning_rate": 8.044840312278212e-05, + "loss": 0.004921406880021096, + "step": 137800 + }, + { + "epoch": 19.56139105748758, + "grad_norm": 7.770122051239014, + "learning_rate": 8.044698367636621e-05, + "loss": 0.012073308229446411, + "step": 137810 + }, + { + "epoch": 19.562810503903478, + "grad_norm": 11.927591323852539, + "learning_rate": 
8.044556422995033e-05, + "loss": 0.03332340717315674, + "step": 137820 + }, + { + "epoch": 19.564229950319376, + "grad_norm": 3.9551355838775635, + "learning_rate": 8.044414478353442e-05, + "loss": 0.02835809588432312, + "step": 137830 + }, + { + "epoch": 19.565649396735274, + "grad_norm": 0.30653974413871765, + "learning_rate": 8.044272533711853e-05, + "loss": 0.014580333232879638, + "step": 137840 + }, + { + "epoch": 19.56706884315117, + "grad_norm": 0.9868557453155518, + "learning_rate": 8.044130589070263e-05, + "loss": 0.029064875841140748, + "step": 137850 + }, + { + "epoch": 19.568488289567068, + "grad_norm": 8.199562072753906, + "learning_rate": 8.043988644428673e-05, + "loss": 0.04607329964637756, + "step": 137860 + }, + { + "epoch": 19.569907735982966, + "grad_norm": 0.038842763751745224, + "learning_rate": 8.043846699787083e-05, + "loss": 0.020066358149051666, + "step": 137870 + }, + { + "epoch": 19.571327182398864, + "grad_norm": 0.06427346915006638, + "learning_rate": 8.043704755145494e-05, + "loss": 0.020113299787044524, + "step": 137880 + }, + { + "epoch": 19.572746628814762, + "grad_norm": 0.018326854333281517, + "learning_rate": 8.043562810503903e-05, + "loss": 0.013383975625038147, + "step": 137890 + }, + { + "epoch": 19.57416607523066, + "grad_norm": 16.43416976928711, + "learning_rate": 8.043420865862314e-05, + "loss": 0.040350151062011716, + "step": 137900 + }, + { + "epoch": 19.57558552164656, + "grad_norm": 0.596812903881073, + "learning_rate": 8.043278921220724e-05, + "loss": 0.004540695995092392, + "step": 137910 + }, + { + "epoch": 19.577004968062454, + "grad_norm": 0.22771741449832916, + "learning_rate": 8.043136976579134e-05, + "loss": 0.005516242608428002, + "step": 137920 + }, + { + "epoch": 19.578424414478352, + "grad_norm": 0.12957964837551117, + "learning_rate": 8.042995031937545e-05, + "loss": 0.002573397010564804, + "step": 137930 + }, + { + "epoch": 19.57984386089425, + "grad_norm": 0.14717452228069305, + "learning_rate": 
8.042853087295955e-05, + "loss": 0.002750430628657341, + "step": 137940 + }, + { + "epoch": 19.58126330731015, + "grad_norm": 0.2578049302101135, + "learning_rate": 8.042711142654366e-05, + "loss": 0.015633463859558105, + "step": 137950 + }, + { + "epoch": 19.582682753726047, + "grad_norm": 0.0091329924762249, + "learning_rate": 8.042569198012774e-05, + "loss": 0.01024954691529274, + "step": 137960 + }, + { + "epoch": 19.584102200141945, + "grad_norm": 9.630097389221191, + "learning_rate": 8.042427253371185e-05, + "loss": 0.017204731702804565, + "step": 137970 + }, + { + "epoch": 19.585521646557844, + "grad_norm": 0.018298881128430367, + "learning_rate": 8.042285308729595e-05, + "loss": 0.0032829798758029936, + "step": 137980 + }, + { + "epoch": 19.58694109297374, + "grad_norm": 3.565556764602661, + "learning_rate": 8.042143364088006e-05, + "loss": 0.01979718804359436, + "step": 137990 + }, + { + "epoch": 19.588360539389637, + "grad_norm": 3.9285240173339844, + "learning_rate": 8.042001419446417e-05, + "loss": 0.023946885764598847, + "step": 138000 + }, + { + "epoch": 19.588360539389637, + "eval_accuracy": 0.9872830164684937, + "eval_loss": 0.050660859793424606, + "eval_runtime": 34.0231, + "eval_samples_per_second": 462.245, + "eval_steps_per_second": 14.461, + "step": 138000 + }, + { + "epoch": 19.589779985805535, + "grad_norm": 7.529336452484131, + "learning_rate": 8.041859474804827e-05, + "loss": 0.0438548356294632, + "step": 138010 + }, + { + "epoch": 19.591199432221433, + "grad_norm": 0.0073790960013866425, + "learning_rate": 8.041717530163237e-05, + "loss": 0.029686707258224487, + "step": 138020 + }, + { + "epoch": 19.59261887863733, + "grad_norm": 2.1580491065979004, + "learning_rate": 8.041575585521646e-05, + "loss": 0.022687962651252745, + "step": 138030 + }, + { + "epoch": 19.59403832505323, + "grad_norm": 6.535014629364014, + "learning_rate": 8.041433640880058e-05, + "loss": 0.008999032527208328, + "step": 138040 + }, + { + "epoch": 19.59545777146913, + 
"grad_norm": 3.3322510719299316, + "learning_rate": 8.041291696238467e-05, + "loss": 0.017945469915866853, + "step": 138050 + }, + { + "epoch": 19.596877217885023, + "grad_norm": 0.1896844357252121, + "learning_rate": 8.041149751596878e-05, + "loss": 0.017649631202220916, + "step": 138060 + }, + { + "epoch": 19.59829666430092, + "grad_norm": 6.295166015625, + "learning_rate": 8.041007806955287e-05, + "loss": 0.04750989973545074, + "step": 138070 + }, + { + "epoch": 19.59971611071682, + "grad_norm": 0.003022885648533702, + "learning_rate": 8.040865862313698e-05, + "loss": 0.01910801827907562, + "step": 138080 + }, + { + "epoch": 19.601135557132718, + "grad_norm": 14.71426773071289, + "learning_rate": 8.040723917672109e-05, + "loss": 0.03957200050354004, + "step": 138090 + }, + { + "epoch": 19.602555003548616, + "grad_norm": 0.1794685423374176, + "learning_rate": 8.040581973030519e-05, + "loss": 0.008308248966932297, + "step": 138100 + }, + { + "epoch": 19.603974449964515, + "grad_norm": 8.984095573425293, + "learning_rate": 8.04044002838893e-05, + "loss": 0.046796905994415286, + "step": 138110 + }, + { + "epoch": 19.605393896380413, + "grad_norm": 9.828641891479492, + "learning_rate": 8.040298083747338e-05, + "loss": 0.05411611199378967, + "step": 138120 + }, + { + "epoch": 19.606813342796308, + "grad_norm": 0.029189540073275566, + "learning_rate": 8.040156139105749e-05, + "loss": 0.04993451535701752, + "step": 138130 + }, + { + "epoch": 19.608232789212206, + "grad_norm": 1.7016907930374146, + "learning_rate": 8.040014194464159e-05, + "loss": 0.015500412881374359, + "step": 138140 + }, + { + "epoch": 19.609652235628104, + "grad_norm": 0.015035353600978851, + "learning_rate": 8.03987224982257e-05, + "loss": 0.019698965549468993, + "step": 138150 + }, + { + "epoch": 19.611071682044003, + "grad_norm": 12.345022201538086, + "learning_rate": 8.03973030518098e-05, + "loss": 0.03160671889781952, + "step": 138160 + }, + { + "epoch": 19.6124911284599, + "grad_norm": 
0.5339881777763367, + "learning_rate": 8.03958836053939e-05, + "loss": 0.007254537940025329, + "step": 138170 + }, + { + "epoch": 19.6139105748758, + "grad_norm": 0.9007466435432434, + "learning_rate": 8.039446415897801e-05, + "loss": 0.036528339982032774, + "step": 138180 + }, + { + "epoch": 19.615330021291697, + "grad_norm": 1.618058443069458, + "learning_rate": 8.03930447125621e-05, + "loss": 0.004342170059680938, + "step": 138190 + }, + { + "epoch": 19.616749467707596, + "grad_norm": 0.0197339728474617, + "learning_rate": 8.039162526614622e-05, + "loss": 0.055916589498519895, + "step": 138200 + }, + { + "epoch": 19.61816891412349, + "grad_norm": 0.10298982262611389, + "learning_rate": 8.039020581973031e-05, + "loss": 0.05364638566970825, + "step": 138210 + }, + { + "epoch": 19.61958836053939, + "grad_norm": 1.0998340845108032, + "learning_rate": 8.038878637331441e-05, + "loss": 0.03447889089584351, + "step": 138220 + }, + { + "epoch": 19.621007806955287, + "grad_norm": 1.7219066619873047, + "learning_rate": 8.038736692689851e-05, + "loss": 0.022036303579807282, + "step": 138230 + }, + { + "epoch": 19.622427253371185, + "grad_norm": 0.3388693928718567, + "learning_rate": 8.038594748048262e-05, + "loss": 0.030643409490585326, + "step": 138240 + }, + { + "epoch": 19.623846699787084, + "grad_norm": 0.4383505582809448, + "learning_rate": 8.038452803406672e-05, + "loss": 0.0045415710657835005, + "step": 138250 + }, + { + "epoch": 19.625266146202982, + "grad_norm": 2.166379451751709, + "learning_rate": 8.038310858765083e-05, + "loss": 0.022049549221992492, + "step": 138260 + }, + { + "epoch": 19.62668559261888, + "grad_norm": 11.74815845489502, + "learning_rate": 8.038168914123491e-05, + "loss": 0.015070411562919616, + "step": 138270 + }, + { + "epoch": 19.628105039034775, + "grad_norm": 13.44038200378418, + "learning_rate": 8.038026969481902e-05, + "loss": 0.033355483412742616, + "step": 138280 + }, + { + "epoch": 19.629524485450673, + "grad_norm": 
3.0908122062683105, + "learning_rate": 8.037885024840313e-05, + "loss": 0.018821220099925994, + "step": 138290 + }, + { + "epoch": 19.63094393186657, + "grad_norm": 0.18924617767333984, + "learning_rate": 8.037743080198723e-05, + "loss": 0.004630044847726822, + "step": 138300 + }, + { + "epoch": 19.63236337828247, + "grad_norm": 0.19256950914859772, + "learning_rate": 8.037601135557134e-05, + "loss": 0.010572195053100586, + "step": 138310 + }, + { + "epoch": 19.63378282469837, + "grad_norm": 0.09992443770170212, + "learning_rate": 8.037459190915542e-05, + "loss": 0.022526408731937408, + "step": 138320 + }, + { + "epoch": 19.635202271114267, + "grad_norm": 0.23397351801395416, + "learning_rate": 8.037317246273954e-05, + "loss": 0.0019818637520074844, + "step": 138330 + }, + { + "epoch": 19.636621717530165, + "grad_norm": 0.0030143605545163155, + "learning_rate": 8.037175301632363e-05, + "loss": 0.009634774923324586, + "step": 138340 + }, + { + "epoch": 19.63804116394606, + "grad_norm": 2.1461405754089355, + "learning_rate": 8.037033356990774e-05, + "loss": 0.015274564921855926, + "step": 138350 + }, + { + "epoch": 19.639460610361958, + "grad_norm": 0.09206445515155792, + "learning_rate": 8.036891412349184e-05, + "loss": 0.012372268736362458, + "step": 138360 + }, + { + "epoch": 19.640880056777856, + "grad_norm": 0.015879858285188675, + "learning_rate": 8.036749467707595e-05, + "loss": 0.028729519248008727, + "step": 138370 + }, + { + "epoch": 19.642299503193755, + "grad_norm": 0.01536844577640295, + "learning_rate": 8.036607523066005e-05, + "loss": 0.06816805005073548, + "step": 138380 + }, + { + "epoch": 19.643718949609653, + "grad_norm": 0.4715758264064789, + "learning_rate": 8.036465578424415e-05, + "loss": 0.03528289496898651, + "step": 138390 + }, + { + "epoch": 19.64513839602555, + "grad_norm": 5.921476364135742, + "learning_rate": 8.036323633782826e-05, + "loss": 0.017167089879512785, + "step": 138400 + }, + { + "epoch": 19.64655784244145, + "grad_norm": 
0.18063852190971375, + "learning_rate": 8.036181689141235e-05, + "loss": 0.03132513463497162, + "step": 138410 + }, + { + "epoch": 19.647977288857344, + "grad_norm": 2.3100228309631348, + "learning_rate": 8.036039744499647e-05, + "loss": 0.041241294145584105, + "step": 138420 + }, + { + "epoch": 19.649396735273243, + "grad_norm": 0.04371634125709534, + "learning_rate": 8.035897799858055e-05, + "loss": 0.042258650064468384, + "step": 138430 + }, + { + "epoch": 19.65081618168914, + "grad_norm": 4.336278438568115, + "learning_rate": 8.035755855216466e-05, + "loss": 0.033323565125465394, + "step": 138440 + }, + { + "epoch": 19.65223562810504, + "grad_norm": 0.19271036982536316, + "learning_rate": 8.035613910574876e-05, + "loss": 0.03666624128818512, + "step": 138450 + }, + { + "epoch": 19.653655074520938, + "grad_norm": 0.12941347062587738, + "learning_rate": 8.035471965933287e-05, + "loss": 0.00817318558692932, + "step": 138460 + }, + { + "epoch": 19.655074520936836, + "grad_norm": 1.3784598112106323, + "learning_rate": 8.035330021291697e-05, + "loss": 0.00503128431737423, + "step": 138470 + }, + { + "epoch": 19.656493967352734, + "grad_norm": 0.35918229818344116, + "learning_rate": 8.035188076650106e-05, + "loss": 0.011499008536338806, + "step": 138480 + }, + { + "epoch": 19.65791341376863, + "grad_norm": 1.7034169435501099, + "learning_rate": 8.035046132008517e-05, + "loss": 0.01096985787153244, + "step": 138490 + }, + { + "epoch": 19.659332860184527, + "grad_norm": 8.493868827819824, + "learning_rate": 8.034904187366927e-05, + "loss": 0.042811107635498044, + "step": 138500 + }, + { + "epoch": 19.659332860184527, + "eval_accuracy": 0.986011318115343, + "eval_loss": 0.051940690726041794, + "eval_runtime": 32.9397, + "eval_samples_per_second": 477.449, + "eval_steps_per_second": 14.936, + "step": 138500 + }, + { + "epoch": 19.660752306600425, + "grad_norm": 11.311059951782227, + "learning_rate": 8.034762242725338e-05, + "loss": 0.020223698019981383, + "step": 138510 + 
}, + { + "epoch": 19.662171753016324, + "grad_norm": 0.9719066619873047, + "learning_rate": 8.034620298083748e-05, + "loss": 0.03731773793697357, + "step": 138520 + }, + { + "epoch": 19.663591199432222, + "grad_norm": 0.04316421225667, + "learning_rate": 8.034478353442158e-05, + "loss": 0.006004315614700317, + "step": 138530 + }, + { + "epoch": 19.66501064584812, + "grad_norm": 0.058855392038822174, + "learning_rate": 8.034336408800568e-05, + "loss": 0.017175130546092987, + "step": 138540 + }, + { + "epoch": 19.66643009226402, + "grad_norm": 0.013701051473617554, + "learning_rate": 8.034194464158979e-05, + "loss": 0.05722183585166931, + "step": 138550 + }, + { + "epoch": 19.667849538679913, + "grad_norm": 0.054416071623563766, + "learning_rate": 8.034052519517388e-05, + "loss": 0.03281792104244232, + "step": 138560 + }, + { + "epoch": 19.669268985095812, + "grad_norm": 0.026844222098588943, + "learning_rate": 8.0339105748758e-05, + "loss": 0.03597028255462646, + "step": 138570 + }, + { + "epoch": 19.67068843151171, + "grad_norm": 0.2913174331188202, + "learning_rate": 8.033768630234209e-05, + "loss": 0.07092903852462769, + "step": 138580 + }, + { + "epoch": 19.67210787792761, + "grad_norm": 0.08362773805856705, + "learning_rate": 8.033626685592619e-05, + "loss": 0.011146743595600129, + "step": 138590 + }, + { + "epoch": 19.673527324343507, + "grad_norm": 5.899541854858398, + "learning_rate": 8.03348474095103e-05, + "loss": 0.01189286783337593, + "step": 138600 + }, + { + "epoch": 19.674946770759405, + "grad_norm": 2.3376224040985107, + "learning_rate": 8.03334279630944e-05, + "loss": 0.004916764795780182, + "step": 138610 + }, + { + "epoch": 19.676366217175303, + "grad_norm": 1.4671872854232788, + "learning_rate": 8.033200851667851e-05, + "loss": 0.015109889209270477, + "step": 138620 + }, + { + "epoch": 19.677785663591198, + "grad_norm": 0.12707369029521942, + "learning_rate": 8.033058907026259e-05, + "loss": 0.0031056158244609834, + "step": 138630 + }, + { + 
"epoch": 19.679205110007096, + "grad_norm": 0.43206775188446045, + "learning_rate": 8.03291696238467e-05, + "loss": 0.013545922935009003, + "step": 138640 + }, + { + "epoch": 19.680624556422995, + "grad_norm": 0.06616683304309845, + "learning_rate": 8.03277501774308e-05, + "loss": 0.01973980963230133, + "step": 138650 + }, + { + "epoch": 19.682044002838893, + "grad_norm": 0.08876846730709076, + "learning_rate": 8.032633073101491e-05, + "loss": 0.030627089738845825, + "step": 138660 + }, + { + "epoch": 19.68346344925479, + "grad_norm": 0.38616421818733215, + "learning_rate": 8.032491128459901e-05, + "loss": 0.01867861896753311, + "step": 138670 + }, + { + "epoch": 19.68488289567069, + "grad_norm": 12.174175262451172, + "learning_rate": 8.03234918381831e-05, + "loss": 0.04813571572303772, + "step": 138680 + }, + { + "epoch": 19.686302342086588, + "grad_norm": 0.02565019391477108, + "learning_rate": 8.032207239176722e-05, + "loss": 0.006767947971820831, + "step": 138690 + }, + { + "epoch": 19.687721788502483, + "grad_norm": 4.341635227203369, + "learning_rate": 8.032065294535131e-05, + "loss": 0.017228135466575624, + "step": 138700 + }, + { + "epoch": 19.68914123491838, + "grad_norm": 6.133458614349365, + "learning_rate": 8.031923349893543e-05, + "loss": 0.011636064946651458, + "step": 138710 + }, + { + "epoch": 19.69056068133428, + "grad_norm": 3.0059027671813965, + "learning_rate": 8.031781405251952e-05, + "loss": 0.012886139750480651, + "step": 138720 + }, + { + "epoch": 19.691980127750178, + "grad_norm": 2.7686121463775635, + "learning_rate": 8.031639460610363e-05, + "loss": 0.032272160053253174, + "step": 138730 + }, + { + "epoch": 19.693399574166076, + "grad_norm": 0.11860307306051254, + "learning_rate": 8.031497515968772e-05, + "loss": 0.02647608518600464, + "step": 138740 + }, + { + "epoch": 19.694819020581974, + "grad_norm": 0.8833091855049133, + "learning_rate": 8.031355571327183e-05, + "loss": 0.0020510002970695494, + "step": 138750 + }, + { + "epoch": 
19.696238466997873, + "grad_norm": 0.8521214127540588, + "learning_rate": 8.031213626685593e-05, + "loss": 0.009417210519313813, + "step": 138760 + }, + { + "epoch": 19.697657913413767, + "grad_norm": 4.539623260498047, + "learning_rate": 8.031071682044004e-05, + "loss": 0.00844470113515854, + "step": 138770 + }, + { + "epoch": 19.699077359829666, + "grad_norm": 0.4428519010543823, + "learning_rate": 8.030929737402413e-05, + "loss": 0.06552926301956177, + "step": 138780 + }, + { + "epoch": 19.700496806245564, + "grad_norm": 0.4736507534980774, + "learning_rate": 8.030787792760823e-05, + "loss": 0.028255379199981688, + "step": 138790 + }, + { + "epoch": 19.701916252661462, + "grad_norm": 0.004549229517579079, + "learning_rate": 8.030645848119234e-05, + "loss": 0.04177501499652862, + "step": 138800 + }, + { + "epoch": 19.70333569907736, + "grad_norm": 0.1801537275314331, + "learning_rate": 8.030503903477644e-05, + "loss": 0.02348633110523224, + "step": 138810 + }, + { + "epoch": 19.70475514549326, + "grad_norm": 10.447527885437012, + "learning_rate": 8.030361958836055e-05, + "loss": 0.029440933465957643, + "step": 138820 + }, + { + "epoch": 19.706174591909157, + "grad_norm": 14.18250846862793, + "learning_rate": 8.030220014194465e-05, + "loss": 0.028277039527893066, + "step": 138830 + }, + { + "epoch": 19.707594038325052, + "grad_norm": 0.3381829261779785, + "learning_rate": 8.030078069552875e-05, + "loss": 0.05422346591949463, + "step": 138840 + }, + { + "epoch": 19.70901348474095, + "grad_norm": 4.960470676422119, + "learning_rate": 8.029936124911284e-05, + "loss": 0.057622271776199344, + "step": 138850 + }, + { + "epoch": 19.71043293115685, + "grad_norm": 0.07115144282579422, + "learning_rate": 8.029794180269695e-05, + "loss": 0.035918551683425906, + "step": 138860 + }, + { + "epoch": 19.711852377572747, + "grad_norm": 2.7241153717041016, + "learning_rate": 8.029652235628105e-05, + "loss": 0.030051085352897643, + "step": 138870 + }, + { + "epoch": 
19.713271823988645, + "grad_norm": 0.11607307940721512, + "learning_rate": 8.029510290986516e-05, + "loss": 0.022767902910709382, + "step": 138880 + }, + { + "epoch": 19.714691270404543, + "grad_norm": 0.5954616665840149, + "learning_rate": 8.029368346344926e-05, + "loss": 0.024277564883232117, + "step": 138890 + }, + { + "epoch": 19.71611071682044, + "grad_norm": 0.011503017507493496, + "learning_rate": 8.029226401703336e-05, + "loss": 0.022690418362617492, + "step": 138900 + }, + { + "epoch": 19.717530163236336, + "grad_norm": 0.4360412061214447, + "learning_rate": 8.029084457061747e-05, + "loss": 0.014873498678207397, + "step": 138910 + }, + { + "epoch": 19.718949609652235, + "grad_norm": 10.609220504760742, + "learning_rate": 8.028942512420157e-05, + "loss": 0.02726553976535797, + "step": 138920 + }, + { + "epoch": 19.720369056068133, + "grad_norm": 10.142011642456055, + "learning_rate": 8.028800567778568e-05, + "loss": 0.032623404264450075, + "step": 138930 + }, + { + "epoch": 19.72178850248403, + "grad_norm": 0.6974702477455139, + "learning_rate": 8.028658623136976e-05, + "loss": 0.003662829473614693, + "step": 138940 + }, + { + "epoch": 19.72320794889993, + "grad_norm": 2.2197811603546143, + "learning_rate": 8.028516678495387e-05, + "loss": 0.02944878935813904, + "step": 138950 + }, + { + "epoch": 19.724627395315828, + "grad_norm": 0.06566993147134781, + "learning_rate": 8.028374733853797e-05, + "loss": 0.006655294448137283, + "step": 138960 + }, + { + "epoch": 19.726046841731726, + "grad_norm": 0.04445694759488106, + "learning_rate": 8.028232789212208e-05, + "loss": 0.0028432216495275497, + "step": 138970 + }, + { + "epoch": 19.72746628814762, + "grad_norm": 0.1260414719581604, + "learning_rate": 8.028090844570618e-05, + "loss": 0.005070878565311432, + "step": 138980 + }, + { + "epoch": 19.72888573456352, + "grad_norm": 0.020465118810534477, + "learning_rate": 8.027948899929027e-05, + "loss": 0.019085513055324556, + "step": 138990 + }, + { + "epoch": 
19.730305180979418, + "grad_norm": 8.512038230895996, + "learning_rate": 8.027806955287438e-05, + "loss": 0.04188350141048432, + "step": 139000 + }, + { + "epoch": 19.730305180979418, + "eval_accuracy": 0.9787626375023845, + "eval_loss": 0.07654014974832535, + "eval_runtime": 33.0569, + "eval_samples_per_second": 475.756, + "eval_steps_per_second": 14.883, + "step": 139000 + }, + { + "epoch": 19.731724627395316, + "grad_norm": 0.19638408720493317, + "learning_rate": 8.027665010645848e-05, + "loss": 0.09535208940505982, + "step": 139010 + }, + { + "epoch": 19.733144073811214, + "grad_norm": 10.96106243133545, + "learning_rate": 8.027523066004259e-05, + "loss": 0.017398083209991456, + "step": 139020 + }, + { + "epoch": 19.734563520227113, + "grad_norm": 4.2082133293151855, + "learning_rate": 8.027381121362669e-05, + "loss": 0.006317382305860519, + "step": 139030 + }, + { + "epoch": 19.73598296664301, + "grad_norm": 4.548252582550049, + "learning_rate": 8.027239176721079e-05, + "loss": 0.02355315685272217, + "step": 139040 + }, + { + "epoch": 19.737402413058906, + "grad_norm": 3.755049228668213, + "learning_rate": 8.027097232079489e-05, + "loss": 0.018561244010925293, + "step": 139050 + }, + { + "epoch": 19.738821859474804, + "grad_norm": 17.563447952270508, + "learning_rate": 8.0269552874379e-05, + "loss": 0.00837668552994728, + "step": 139060 + }, + { + "epoch": 19.740241305890702, + "grad_norm": 0.0856865718960762, + "learning_rate": 8.02681334279631e-05, + "loss": 0.01875850260257721, + "step": 139070 + }, + { + "epoch": 19.7416607523066, + "grad_norm": 1.0457096099853516, + "learning_rate": 8.02667139815472e-05, + "loss": 0.006809623539447784, + "step": 139080 + }, + { + "epoch": 19.7430801987225, + "grad_norm": 0.6450104117393494, + "learning_rate": 8.02652945351313e-05, + "loss": 0.010183577239513398, + "step": 139090 + }, + { + "epoch": 19.744499645138397, + "grad_norm": 1.8897494077682495, + "learning_rate": 8.02638750887154e-05, + "loss": 
0.016523340344429018, + "step": 139100 + }, + { + "epoch": 19.745919091554295, + "grad_norm": 7.488572120666504, + "learning_rate": 8.026245564229951e-05, + "loss": 0.016927893459796905, + "step": 139110 + }, + { + "epoch": 19.74733853797019, + "grad_norm": 0.09959287941455841, + "learning_rate": 8.026103619588361e-05, + "loss": 0.004210712760686875, + "step": 139120 + }, + { + "epoch": 19.74875798438609, + "grad_norm": 0.032705750316381454, + "learning_rate": 8.025961674946772e-05, + "loss": 0.005322665721178055, + "step": 139130 + }, + { + "epoch": 19.750177430801987, + "grad_norm": 1.9483076333999634, + "learning_rate": 8.025819730305182e-05, + "loss": 0.013049685955047607, + "step": 139140 + }, + { + "epoch": 19.751596877217885, + "grad_norm": 0.4516810476779938, + "learning_rate": 8.025677785663591e-05, + "loss": 0.010224513709545135, + "step": 139150 + }, + { + "epoch": 19.753016323633783, + "grad_norm": 0.03128146752715111, + "learning_rate": 8.025535841022001e-05, + "loss": 0.04900606870651245, + "step": 139160 + }, + { + "epoch": 19.75443577004968, + "grad_norm": 1.1649566888809204, + "learning_rate": 8.025393896380412e-05, + "loss": 0.01893424540758133, + "step": 139170 + }, + { + "epoch": 19.75585521646558, + "grad_norm": 0.4992629587650299, + "learning_rate": 8.025251951738822e-05, + "loss": 0.029203197360038756, + "step": 139180 + }, + { + "epoch": 19.757274662881475, + "grad_norm": 7.769445896148682, + "learning_rate": 8.025110007097233e-05, + "loss": 0.030580288171768187, + "step": 139190 + }, + { + "epoch": 19.758694109297373, + "grad_norm": 0.7296946048736572, + "learning_rate": 8.024968062455643e-05, + "loss": 0.03568483293056488, + "step": 139200 + }, + { + "epoch": 19.76011355571327, + "grad_norm": 0.1032835841178894, + "learning_rate": 8.024826117814052e-05, + "loss": 0.029664459824562072, + "step": 139210 + }, + { + "epoch": 19.76153300212917, + "grad_norm": 0.4114496409893036, + "learning_rate": 8.024684173172464e-05, + "loss": 
0.05616902112960816, + "step": 139220 + }, + { + "epoch": 19.762952448545068, + "grad_norm": 0.25582075119018555, + "learning_rate": 8.024542228530873e-05, + "loss": 0.03344759047031402, + "step": 139230 + }, + { + "epoch": 19.764371894960966, + "grad_norm": 8.181188583374023, + "learning_rate": 8.024400283889284e-05, + "loss": 0.04690394699573517, + "step": 139240 + }, + { + "epoch": 19.765791341376865, + "grad_norm": 0.21906448900699615, + "learning_rate": 8.024258339247693e-05, + "loss": 0.011242108047008514, + "step": 139250 + }, + { + "epoch": 19.76721078779276, + "grad_norm": 1.6838641166687012, + "learning_rate": 8.024116394606104e-05, + "loss": 0.005934418737888336, + "step": 139260 + }, + { + "epoch": 19.768630234208658, + "grad_norm": 0.09939505159854889, + "learning_rate": 8.023974449964514e-05, + "loss": 0.0072611324489116665, + "step": 139270 + }, + { + "epoch": 19.770049680624556, + "grad_norm": 10.888344764709473, + "learning_rate": 8.023832505322925e-05, + "loss": 0.03970246315002442, + "step": 139280 + }, + { + "epoch": 19.771469127040454, + "grad_norm": 0.010829064063727856, + "learning_rate": 8.023690560681334e-05, + "loss": 0.006960421800613403, + "step": 139290 + }, + { + "epoch": 19.772888573456353, + "grad_norm": 8.759506225585938, + "learning_rate": 8.023548616039744e-05, + "loss": 0.02476315051317215, + "step": 139300 + }, + { + "epoch": 19.77430801987225, + "grad_norm": 0.019028963521122932, + "learning_rate": 8.023406671398155e-05, + "loss": 0.08776750564575195, + "step": 139310 + }, + { + "epoch": 19.77572746628815, + "grad_norm": 3.19905948638916, + "learning_rate": 8.023264726756565e-05, + "loss": 0.022287966310977937, + "step": 139320 + }, + { + "epoch": 19.777146912704044, + "grad_norm": 0.04139919579029083, + "learning_rate": 8.023122782114976e-05, + "loss": 0.0035886283963918685, + "step": 139330 + }, + { + "epoch": 19.778566359119942, + "grad_norm": 0.14054623246192932, + "learning_rate": 8.022980837473386e-05, + "loss": 
0.017585280537605285, + "step": 139340 + }, + { + "epoch": 19.77998580553584, + "grad_norm": 1.868897557258606, + "learning_rate": 8.022838892831796e-05, + "loss": 0.012134405225515366, + "step": 139350 + }, + { + "epoch": 19.78140525195174, + "grad_norm": 0.010445545427501202, + "learning_rate": 8.022696948190205e-05, + "loss": 0.010198205709457397, + "step": 139360 + }, + { + "epoch": 19.782824698367637, + "grad_norm": 0.0983772873878479, + "learning_rate": 8.022555003548616e-05, + "loss": 0.03380552530288696, + "step": 139370 + }, + { + "epoch": 19.784244144783536, + "grad_norm": 0.01494747307151556, + "learning_rate": 8.022413058907026e-05, + "loss": 0.018215297162532805, + "step": 139380 + }, + { + "epoch": 19.785663591199434, + "grad_norm": 0.041560154408216476, + "learning_rate": 8.022271114265437e-05, + "loss": 0.014817462861537933, + "step": 139390 + }, + { + "epoch": 19.78708303761533, + "grad_norm": 0.37853920459747314, + "learning_rate": 8.022129169623847e-05, + "loss": 0.03316742777824402, + "step": 139400 + }, + { + "epoch": 19.788502484031227, + "grad_norm": 0.37998664379119873, + "learning_rate": 8.021987224982257e-05, + "loss": 0.003211744502186775, + "step": 139410 + }, + { + "epoch": 19.789921930447125, + "grad_norm": 1.9321067333221436, + "learning_rate": 8.021845280340668e-05, + "loss": 0.011175059527158738, + "step": 139420 + }, + { + "epoch": 19.791341376863024, + "grad_norm": 11.715725898742676, + "learning_rate": 8.021703335699078e-05, + "loss": 0.01863114982843399, + "step": 139430 + }, + { + "epoch": 19.792760823278922, + "grad_norm": 0.04460087791085243, + "learning_rate": 8.021561391057489e-05, + "loss": 0.013021458685398103, + "step": 139440 + }, + { + "epoch": 19.79418026969482, + "grad_norm": 0.5963287949562073, + "learning_rate": 8.021419446415898e-05, + "loss": 0.011012168228626251, + "step": 139450 + }, + { + "epoch": 19.79559971611072, + "grad_norm": 0.29790475964546204, + "learning_rate": 8.021277501774308e-05, + "loss": 
0.020087811350822448, + "step": 139460 + }, + { + "epoch": 19.797019162526613, + "grad_norm": 0.044911980628967285, + "learning_rate": 8.021135557132718e-05, + "loss": 0.019449099898338318, + "step": 139470 + }, + { + "epoch": 19.79843860894251, + "grad_norm": 1.8258098363876343, + "learning_rate": 8.020993612491129e-05, + "loss": 0.0049147456884384155, + "step": 139480 + }, + { + "epoch": 19.79985805535841, + "grad_norm": 1.326615810394287, + "learning_rate": 8.02085166784954e-05, + "loss": 0.03728066086769104, + "step": 139490 + }, + { + "epoch": 19.801277501774308, + "grad_norm": 4.393553256988525, + "learning_rate": 8.02070972320795e-05, + "loss": 0.01763288676738739, + "step": 139500 + }, + { + "epoch": 19.801277501774308, + "eval_accuracy": 0.9856298086093979, + "eval_loss": 0.0544835589826107, + "eval_runtime": 33.1607, + "eval_samples_per_second": 474.267, + "eval_steps_per_second": 14.837, + "step": 139500 + }, + { + "epoch": 19.802696948190206, + "grad_norm": 0.11989425122737885, + "learning_rate": 8.02056777856636e-05, + "loss": 0.01877267360687256, + "step": 139510 + }, + { + "epoch": 19.804116394606105, + "grad_norm": 0.060171280056238174, + "learning_rate": 8.020425833924769e-05, + "loss": 0.00610172413289547, + "step": 139520 + }, + { + "epoch": 19.805535841022003, + "grad_norm": 0.6509189605712891, + "learning_rate": 8.02028388928318e-05, + "loss": 0.021013137698173524, + "step": 139530 + }, + { + "epoch": 19.806955287437898, + "grad_norm": 0.0794127956032753, + "learning_rate": 8.02014194464159e-05, + "loss": 0.011158202588558198, + "step": 139540 + }, + { + "epoch": 19.808374733853796, + "grad_norm": 0.05593372881412506, + "learning_rate": 8.020000000000001e-05, + "loss": 0.02410032004117966, + "step": 139550 + }, + { + "epoch": 19.809794180269694, + "grad_norm": 2.906390905380249, + "learning_rate": 8.01985805535841e-05, + "loss": 0.010091563314199447, + "step": 139560 + }, + { + "epoch": 19.811213626685593, + "grad_norm": 0.03298679366707802, + 
"learning_rate": 8.01971611071682e-05, + "loss": 0.01290801763534546, + "step": 139570 + }, + { + "epoch": 19.81263307310149, + "grad_norm": 0.02637336403131485, + "learning_rate": 8.019574166075232e-05, + "loss": 0.021950289607048035, + "step": 139580 + }, + { + "epoch": 19.81405251951739, + "grad_norm": 0.021645430475473404, + "learning_rate": 8.019432221433641e-05, + "loss": 0.02902255952358246, + "step": 139590 + }, + { + "epoch": 19.815471965933288, + "grad_norm": 0.014755524694919586, + "learning_rate": 8.019290276792053e-05, + "loss": 0.004876154288649559, + "step": 139600 + }, + { + "epoch": 19.816891412349182, + "grad_norm": 0.19308185577392578, + "learning_rate": 8.019148332150461e-05, + "loss": 0.002289852499961853, + "step": 139610 + }, + { + "epoch": 19.81831085876508, + "grad_norm": 3.7021358013153076, + "learning_rate": 8.019006387508872e-05, + "loss": 0.021032847464084625, + "step": 139620 + }, + { + "epoch": 19.81973030518098, + "grad_norm": 9.473984718322754, + "learning_rate": 8.018864442867282e-05, + "loss": 0.008704672008752823, + "step": 139630 + }, + { + "epoch": 19.821149751596877, + "grad_norm": 0.630850613117218, + "learning_rate": 8.018722498225693e-05, + "loss": 0.016734354197978973, + "step": 139640 + }, + { + "epoch": 19.822569198012776, + "grad_norm": 0.07666724175214767, + "learning_rate": 8.018580553584103e-05, + "loss": 0.004861601814627648, + "step": 139650 + }, + { + "epoch": 19.823988644428674, + "grad_norm": 4.326195240020752, + "learning_rate": 8.018438608942512e-05, + "loss": 0.00682557076215744, + "step": 139660 + }, + { + "epoch": 19.825408090844572, + "grad_norm": 0.04820561781525612, + "learning_rate": 8.018296664300923e-05, + "loss": 0.007097174227237701, + "step": 139670 + }, + { + "epoch": 19.826827537260467, + "grad_norm": 12.860306739807129, + "learning_rate": 8.018154719659333e-05, + "loss": 0.02446320354938507, + "step": 139680 + }, + { + "epoch": 19.828246983676365, + "grad_norm": 0.4204403758049011, + 
"learning_rate": 8.018012775017744e-05, + "loss": 0.017366963624954223, + "step": 139690 + }, + { + "epoch": 19.829666430092264, + "grad_norm": 0.0063424259424209595, + "learning_rate": 8.017870830376154e-05, + "loss": 0.026230162382125853, + "step": 139700 + }, + { + "epoch": 19.831085876508162, + "grad_norm": 1.6799793243408203, + "learning_rate": 8.017728885734564e-05, + "loss": 0.004461310058832169, + "step": 139710 + }, + { + "epoch": 19.83250532292406, + "grad_norm": 4.775684833526611, + "learning_rate": 8.017586941092973e-05, + "loss": 0.011769990622997283, + "step": 139720 + }, + { + "epoch": 19.83392476933996, + "grad_norm": 0.06816478073596954, + "learning_rate": 8.017444996451385e-05, + "loss": 0.01424238234758377, + "step": 139730 + }, + { + "epoch": 19.835344215755857, + "grad_norm": 3.2718963623046875, + "learning_rate": 8.017303051809794e-05, + "loss": 0.0027877304702997206, + "step": 139740 + }, + { + "epoch": 19.83676366217175, + "grad_norm": 10.610093116760254, + "learning_rate": 8.017161107168205e-05, + "loss": 0.01973050832748413, + "step": 139750 + }, + { + "epoch": 19.83818310858765, + "grad_norm": 13.073330879211426, + "learning_rate": 8.017019162526615e-05, + "loss": 0.028023535013198854, + "step": 139760 + }, + { + "epoch": 19.839602555003548, + "grad_norm": 17.038087844848633, + "learning_rate": 8.016877217885025e-05, + "loss": 0.03127200305461884, + "step": 139770 + }, + { + "epoch": 19.841022001419446, + "grad_norm": 11.913118362426758, + "learning_rate": 8.016735273243436e-05, + "loss": 0.016964422166347505, + "step": 139780 + }, + { + "epoch": 19.842441447835345, + "grad_norm": 0.44638150930404663, + "learning_rate": 8.016593328601846e-05, + "loss": 0.01624404489994049, + "step": 139790 + }, + { + "epoch": 19.843860894251243, + "grad_norm": 0.12529753148555756, + "learning_rate": 8.016451383960257e-05, + "loss": 0.05540893077850342, + "step": 139800 + }, + { + "epoch": 19.84528034066714, + "grad_norm": 0.14248676598072052, + 
"learning_rate": 8.016309439318667e-05, + "loss": 0.011490736156702042, + "step": 139810 + }, + { + "epoch": 19.846699787083036, + "grad_norm": 3.391721725463867, + "learning_rate": 8.016167494677076e-05, + "loss": 0.013984756171703338, + "step": 139820 + }, + { + "epoch": 19.848119233498934, + "grad_norm": 2.265918731689453, + "learning_rate": 8.016025550035486e-05, + "loss": 0.03134198486804962, + "step": 139830 + }, + { + "epoch": 19.849538679914833, + "grad_norm": 0.15010884404182434, + "learning_rate": 8.015883605393897e-05, + "loss": 0.0080152228474617, + "step": 139840 + }, + { + "epoch": 19.85095812633073, + "grad_norm": 0.5058720707893372, + "learning_rate": 8.015741660752307e-05, + "loss": 0.0023042641580104826, + "step": 139850 + }, + { + "epoch": 19.85237757274663, + "grad_norm": 0.3317244052886963, + "learning_rate": 8.015599716110718e-05, + "loss": 0.0036775771528482436, + "step": 139860 + }, + { + "epoch": 19.853797019162528, + "grad_norm": 5.731398582458496, + "learning_rate": 8.015457771469128e-05, + "loss": 0.0798199474811554, + "step": 139870 + }, + { + "epoch": 19.855216465578426, + "grad_norm": 1.4253054857254028, + "learning_rate": 8.015315826827537e-05, + "loss": 0.04291553795337677, + "step": 139880 + }, + { + "epoch": 19.85663591199432, + "grad_norm": 3.632465124130249, + "learning_rate": 8.015173882185948e-05, + "loss": 0.060555887222290036, + "step": 139890 + }, + { + "epoch": 19.85805535841022, + "grad_norm": 0.17125140130519867, + "learning_rate": 8.015031937544358e-05, + "loss": 0.04922077655792236, + "step": 139900 + }, + { + "epoch": 19.859474804826117, + "grad_norm": 3.8708529472351074, + "learning_rate": 8.014889992902769e-05, + "loss": 0.043092742562294006, + "step": 139910 + }, + { + "epoch": 19.860894251242016, + "grad_norm": 0.012349608354270458, + "learning_rate": 8.014748048261178e-05, + "loss": 0.004922491312026977, + "step": 139920 + }, + { + "epoch": 19.862313697657914, + "grad_norm": 0.11534589529037476, + 
"learning_rate": 8.014606103619589e-05, + "loss": 0.047697001695632936, + "step": 139930 + }, + { + "epoch": 19.863733144073812, + "grad_norm": 0.23999154567718506, + "learning_rate": 8.014464158977999e-05, + "loss": 0.019572360813617705, + "step": 139940 + }, + { + "epoch": 19.86515259048971, + "grad_norm": 10.510241508483887, + "learning_rate": 8.01432221433641e-05, + "loss": 0.019799953699111937, + "step": 139950 + }, + { + "epoch": 19.866572036905605, + "grad_norm": 0.5493953227996826, + "learning_rate": 8.01418026969482e-05, + "loss": 0.0007798772305250168, + "step": 139960 + }, + { + "epoch": 19.867991483321504, + "grad_norm": 9.87894058227539, + "learning_rate": 8.014038325053229e-05, + "loss": 0.027184116840362548, + "step": 139970 + }, + { + "epoch": 19.869410929737402, + "grad_norm": 2.0167622566223145, + "learning_rate": 8.01389638041164e-05, + "loss": 0.031891757249832155, + "step": 139980 + }, + { + "epoch": 19.8708303761533, + "grad_norm": 4.695428848266602, + "learning_rate": 8.01375443577005e-05, + "loss": 0.022933872044086458, + "step": 139990 + }, + { + "epoch": 19.8722498225692, + "grad_norm": 0.11179798096418381, + "learning_rate": 8.013612491128461e-05, + "loss": 0.03894450962543487, + "step": 140000 + }, + { + "epoch": 19.8722498225692, + "eval_accuracy": 0.9813060342086857, + "eval_loss": 0.07770732045173645, + "eval_runtime": 32.8225, + "eval_samples_per_second": 479.152, + "eval_steps_per_second": 14.99, + "step": 140000 + }, + { + "epoch": 19.873669268985097, + "grad_norm": 0.02420716919004917, + "learning_rate": 8.013470546486871e-05, + "loss": 0.05574930906295776, + "step": 140010 + }, + { + "epoch": 19.875088715400995, + "grad_norm": 0.048414770513772964, + "learning_rate": 8.01332860184528e-05, + "loss": 0.005164441466331482, + "step": 140020 + }, + { + "epoch": 19.87650816181689, + "grad_norm": 0.04100466147065163, + "learning_rate": 8.01318665720369e-05, + "loss": 0.044208526611328125, + "step": 140030 + }, + { + "epoch": 
19.87792760823279, + "grad_norm": 0.20554162561893463, + "learning_rate": 8.013044712562101e-05, + "loss": 0.010324726998805999, + "step": 140040 + }, + { + "epoch": 19.879347054648687, + "grad_norm": 0.28824618458747864, + "learning_rate": 8.012902767920511e-05, + "loss": 0.00433928295969963, + "step": 140050 + }, + { + "epoch": 19.880766501064585, + "grad_norm": 0.4394190311431885, + "learning_rate": 8.012760823278922e-05, + "loss": 0.016210195422172547, + "step": 140060 + }, + { + "epoch": 19.882185947480483, + "grad_norm": 0.5106674432754517, + "learning_rate": 8.012618878637332e-05, + "loss": 0.019017013907432555, + "step": 140070 + }, + { + "epoch": 19.88360539389638, + "grad_norm": 0.030603084713220596, + "learning_rate": 8.012476933995742e-05, + "loss": 0.02650708854198456, + "step": 140080 + }, + { + "epoch": 19.88502484031228, + "grad_norm": 0.5879043936729431, + "learning_rate": 8.012334989354153e-05, + "loss": 0.01849451959133148, + "step": 140090 + }, + { + "epoch": 19.886444286728175, + "grad_norm": 0.05266867205500603, + "learning_rate": 8.012193044712562e-05, + "loss": 0.02872341573238373, + "step": 140100 + }, + { + "epoch": 19.887863733144073, + "grad_norm": 0.01276461873203516, + "learning_rate": 8.012051100070974e-05, + "loss": 0.012407783418893814, + "step": 140110 + }, + { + "epoch": 19.88928317955997, + "grad_norm": 0.1270163506269455, + "learning_rate": 8.011909155429383e-05, + "loss": 0.00402568094432354, + "step": 140120 + }, + { + "epoch": 19.89070262597587, + "grad_norm": 0.021485812962055206, + "learning_rate": 8.011767210787793e-05, + "loss": 0.01000293791294098, + "step": 140130 + }, + { + "epoch": 19.892122072391768, + "grad_norm": 0.07036632299423218, + "learning_rate": 8.011625266146203e-05, + "loss": 0.005553951486945152, + "step": 140140 + }, + { + "epoch": 19.893541518807666, + "grad_norm": 0.3364790976047516, + "learning_rate": 8.011483321504614e-05, + "loss": 0.005696587264537811, + "step": 140150 + }, + { + "epoch": 
19.894960965223564, + "grad_norm": 8.817188262939453, + "learning_rate": 8.011341376863024e-05, + "loss": 0.03514130413532257, + "step": 140160 + }, + { + "epoch": 19.89638041163946, + "grad_norm": 7.810001373291016, + "learning_rate": 8.011199432221435e-05, + "loss": 0.027446284890174866, + "step": 140170 + }, + { + "epoch": 19.897799858055357, + "grad_norm": 7.393718242645264, + "learning_rate": 8.011057487579844e-05, + "loss": 0.04557895958423615, + "step": 140180 + }, + { + "epoch": 19.899219304471256, + "grad_norm": 4.390380859375, + "learning_rate": 8.010915542938254e-05, + "loss": 0.013040535151958466, + "step": 140190 + }, + { + "epoch": 19.900638750887154, + "grad_norm": 0.17087222635746002, + "learning_rate": 8.010773598296665e-05, + "loss": 0.052542030811309814, + "step": 140200 + }, + { + "epoch": 19.902058197303052, + "grad_norm": 0.10569733381271362, + "learning_rate": 8.010631653655075e-05, + "loss": 0.00950702428817749, + "step": 140210 + }, + { + "epoch": 19.90347764371895, + "grad_norm": 0.4821464717388153, + "learning_rate": 8.010489709013486e-05, + "loss": 0.006985366344451904, + "step": 140220 + }, + { + "epoch": 19.90489709013485, + "grad_norm": 1.4368162155151367, + "learning_rate": 8.010347764371894e-05, + "loss": 0.048772335052490234, + "step": 140230 + }, + { + "epoch": 19.906316536550744, + "grad_norm": 6.197601318359375, + "learning_rate": 8.010205819730306e-05, + "loss": 0.04889421761035919, + "step": 140240 + }, + { + "epoch": 19.907735982966642, + "grad_norm": 3.209953546524048, + "learning_rate": 8.010063875088715e-05, + "loss": 0.005126167461276055, + "step": 140250 + }, + { + "epoch": 19.90915542938254, + "grad_norm": 0.1128145158290863, + "learning_rate": 8.009921930447126e-05, + "loss": 0.003893549740314484, + "step": 140260 + }, + { + "epoch": 19.91057487579844, + "grad_norm": 0.0572998970746994, + "learning_rate": 8.009779985805536e-05, + "loss": 0.022830471396446228, + "step": 140270 + }, + { + "epoch": 19.911994322214337, + 
"grad_norm": 0.1309823989868164, + "learning_rate": 8.009638041163946e-05, + "loss": 0.0027685169130563735, + "step": 140280 + }, + { + "epoch": 19.913413768630235, + "grad_norm": 0.9516026973724365, + "learning_rate": 8.009496096522357e-05, + "loss": 0.020607098937034607, + "step": 140290 + }, + { + "epoch": 19.914833215046134, + "grad_norm": 0.10789740830659866, + "learning_rate": 8.009354151880767e-05, + "loss": 0.007872572541236878, + "step": 140300 + }, + { + "epoch": 19.91625266146203, + "grad_norm": 0.21346351504325867, + "learning_rate": 8.009212207239178e-05, + "loss": 0.024496954679489136, + "step": 140310 + }, + { + "epoch": 19.917672107877927, + "grad_norm": 0.15668897330760956, + "learning_rate": 8.009070262597588e-05, + "loss": 0.006305134296417237, + "step": 140320 + }, + { + "epoch": 19.919091554293825, + "grad_norm": 14.24783706665039, + "learning_rate": 8.008928317955997e-05, + "loss": 0.03388771116733551, + "step": 140330 + }, + { + "epoch": 19.920511000709723, + "grad_norm": 0.8930047154426575, + "learning_rate": 8.008786373314407e-05, + "loss": 0.014166541397571564, + "step": 140340 + }, + { + "epoch": 19.92193044712562, + "grad_norm": 8.249771118164062, + "learning_rate": 8.008644428672818e-05, + "loss": 0.028839975595474243, + "step": 140350 + }, + { + "epoch": 19.92334989354152, + "grad_norm": 4.8222880363464355, + "learning_rate": 8.008502484031228e-05, + "loss": 0.043366655707359314, + "step": 140360 + }, + { + "epoch": 19.924769339957418, + "grad_norm": 2.358502149581909, + "learning_rate": 8.008360539389639e-05, + "loss": 0.05386812090873718, + "step": 140370 + }, + { + "epoch": 19.926188786373313, + "grad_norm": 1.2944918870925903, + "learning_rate": 8.008218594748049e-05, + "loss": 0.08023052811622619, + "step": 140380 + }, + { + "epoch": 19.92760823278921, + "grad_norm": 1.1863116025924683, + "learning_rate": 8.008076650106458e-05, + "loss": 0.07636402249336242, + "step": 140390 + }, + { + "epoch": 19.92902767920511, + "grad_norm": 
3.6277599334716797, + "learning_rate": 8.00793470546487e-05, + "loss": 0.018394359946250917, + "step": 140400 + }, + { + "epoch": 19.930447125621008, + "grad_norm": 5.336276054382324, + "learning_rate": 8.007792760823279e-05, + "loss": 0.052164393663406375, + "step": 140410 + }, + { + "epoch": 19.931866572036906, + "grad_norm": 0.47161370515823364, + "learning_rate": 8.00765081618169e-05, + "loss": 0.015585443377494812, + "step": 140420 + }, + { + "epoch": 19.933286018452804, + "grad_norm": 2.699232339859009, + "learning_rate": 8.007508871540099e-05, + "loss": 0.06586329936981201, + "step": 140430 + }, + { + "epoch": 19.934705464868703, + "grad_norm": 7.184756278991699, + "learning_rate": 8.00736692689851e-05, + "loss": 0.06342093348503113, + "step": 140440 + }, + { + "epoch": 19.936124911284598, + "grad_norm": 0.12634967267513275, + "learning_rate": 8.00722498225692e-05, + "loss": 0.008061951398849488, + "step": 140450 + }, + { + "epoch": 19.937544357700496, + "grad_norm": 6.7007269859313965, + "learning_rate": 8.00708303761533e-05, + "loss": 0.022827640175819397, + "step": 140460 + }, + { + "epoch": 19.938963804116394, + "grad_norm": 0.08524321019649506, + "learning_rate": 8.00694109297374e-05, + "loss": 0.021901366114616395, + "step": 140470 + }, + { + "epoch": 19.940383250532292, + "grad_norm": 4.822833061218262, + "learning_rate": 8.006799148332151e-05, + "loss": 0.018462111055850983, + "step": 140480 + }, + { + "epoch": 19.94180269694819, + "grad_norm": 1.6026697158813477, + "learning_rate": 8.006657203690561e-05, + "loss": 0.02011823356151581, + "step": 140490 + }, + { + "epoch": 19.94322214336409, + "grad_norm": 0.2379651665687561, + "learning_rate": 8.006515259048971e-05, + "loss": 0.015809541940689086, + "step": 140500 + }, + { + "epoch": 19.94322214336409, + "eval_accuracy": 0.9840401856679596, + "eval_loss": 0.05191691592335701, + "eval_runtime": 32.7801, + "eval_samples_per_second": 479.773, + "eval_steps_per_second": 15.009, + "step": 140500 + }, + { 
+ "epoch": 19.944641589779987, + "grad_norm": 0.455178827047348, + "learning_rate": 8.006373314407382e-05, + "loss": 0.04101063311100006, + "step": 140510 + }, + { + "epoch": 19.946061036195882, + "grad_norm": 1.712772011756897, + "learning_rate": 8.006231369765792e-05, + "loss": 0.02889895737171173, + "step": 140520 + }, + { + "epoch": 19.94748048261178, + "grad_norm": 0.1457989364862442, + "learning_rate": 8.006089425124203e-05, + "loss": 0.019174237549304963, + "step": 140530 + }, + { + "epoch": 19.94889992902768, + "grad_norm": 0.6908969879150391, + "learning_rate": 8.005947480482611e-05, + "loss": 0.015175694227218628, + "step": 140540 + }, + { + "epoch": 19.950319375443577, + "grad_norm": 0.05797145515680313, + "learning_rate": 8.005805535841022e-05, + "loss": 0.034023651480674745, + "step": 140550 + }, + { + "epoch": 19.951738821859475, + "grad_norm": 4.641683578491211, + "learning_rate": 8.005663591199432e-05, + "loss": 0.031184056401252748, + "step": 140560 + }, + { + "epoch": 19.953158268275374, + "grad_norm": 0.24377094209194183, + "learning_rate": 8.005521646557843e-05, + "loss": 0.020593273639678954, + "step": 140570 + }, + { + "epoch": 19.954577714691272, + "grad_norm": 0.2801949977874756, + "learning_rate": 8.005379701916253e-05, + "loss": 0.012578007578849793, + "step": 140580 + }, + { + "epoch": 19.955997161107167, + "grad_norm": 0.00720011442899704, + "learning_rate": 8.005237757274663e-05, + "loss": 0.05097510814666748, + "step": 140590 + }, + { + "epoch": 19.957416607523065, + "grad_norm": 3.5743913650512695, + "learning_rate": 8.005095812633074e-05, + "loss": 0.015194380283355713, + "step": 140600 + }, + { + "epoch": 19.958836053938963, + "grad_norm": 11.544880867004395, + "learning_rate": 8.004953867991483e-05, + "loss": 0.011013476550579071, + "step": 140610 + }, + { + "epoch": 19.96025550035486, + "grad_norm": 1.3924674987792969, + "learning_rate": 8.004811923349895e-05, + "loss": 0.02060462683439255, + "step": 140620 + }, + { + "epoch": 
19.96167494677076, + "grad_norm": 0.5600424408912659, + "learning_rate": 8.004669978708304e-05, + "loss": 0.0067936301231384276, + "step": 140630 + }, + { + "epoch": 19.96309439318666, + "grad_norm": 2.087996244430542, + "learning_rate": 8.004528034066714e-05, + "loss": 0.004950342327356338, + "step": 140640 + }, + { + "epoch": 19.964513839602557, + "grad_norm": 0.007394047453999519, + "learning_rate": 8.004386089425124e-05, + "loss": 0.005445841327309608, + "step": 140650 + }, + { + "epoch": 19.96593328601845, + "grad_norm": 7.113765716552734, + "learning_rate": 8.004244144783535e-05, + "loss": 0.008016441762447358, + "step": 140660 + }, + { + "epoch": 19.96735273243435, + "grad_norm": 0.10857373476028442, + "learning_rate": 8.004102200141945e-05, + "loss": 0.003368879854679108, + "step": 140670 + }, + { + "epoch": 19.968772178850248, + "grad_norm": 0.370307981967926, + "learning_rate": 8.003960255500356e-05, + "loss": 0.014521026611328125, + "step": 140680 + }, + { + "epoch": 19.970191625266146, + "grad_norm": 0.025977713987231255, + "learning_rate": 8.003818310858765e-05, + "loss": 0.014565077424049378, + "step": 140690 + }, + { + "epoch": 19.971611071682045, + "grad_norm": 0.11662080883979797, + "learning_rate": 8.003676366217175e-05, + "loss": 0.01800130307674408, + "step": 140700 + }, + { + "epoch": 19.973030518097943, + "grad_norm": 5.711389541625977, + "learning_rate": 8.003534421575586e-05, + "loss": 0.0410052627325058, + "step": 140710 + }, + { + "epoch": 19.97444996451384, + "grad_norm": 0.013483582064509392, + "learning_rate": 8.003392476933996e-05, + "loss": 0.009572294354438782, + "step": 140720 + }, + { + "epoch": 19.975869410929736, + "grad_norm": 17.010944366455078, + "learning_rate": 8.003250532292407e-05, + "loss": 0.02607950270175934, + "step": 140730 + }, + { + "epoch": 19.977288857345634, + "grad_norm": 0.10605667531490326, + "learning_rate": 8.003108587650815e-05, + "loss": 0.007246570289134979, + "step": 140740 + }, + { + "epoch": 
19.978708303761533, + "grad_norm": 7.407968997955322, + "learning_rate": 8.002966643009227e-05, + "loss": 0.011455393582582473, + "step": 140750 + }, + { + "epoch": 19.98012775017743, + "grad_norm": 0.04318075627088547, + "learning_rate": 8.002824698367636e-05, + "loss": 0.014967672526836395, + "step": 140760 + }, + { + "epoch": 19.98154719659333, + "grad_norm": 0.34878456592559814, + "learning_rate": 8.002682753726047e-05, + "loss": 0.017500746250152587, + "step": 140770 + }, + { + "epoch": 19.982966643009227, + "grad_norm": 4.813690662384033, + "learning_rate": 8.002540809084457e-05, + "loss": 0.010691556334495544, + "step": 140780 + }, + { + "epoch": 19.984386089425126, + "grad_norm": 0.8639642000198364, + "learning_rate": 8.002398864442867e-05, + "loss": 0.0032157417386770248, + "step": 140790 + }, + { + "epoch": 19.98580553584102, + "grad_norm": 0.014026980847120285, + "learning_rate": 8.002256919801278e-05, + "loss": 0.015287537872791291, + "step": 140800 + }, + { + "epoch": 19.98722498225692, + "grad_norm": 0.1736358255147934, + "learning_rate": 8.002114975159688e-05, + "loss": 0.049898722767829896, + "step": 140810 + }, + { + "epoch": 19.988644428672817, + "grad_norm": 0.01327612716704607, + "learning_rate": 8.001973030518099e-05, + "loss": 0.019483727216720582, + "step": 140820 + }, + { + "epoch": 19.990063875088715, + "grad_norm": 0.06661958247423172, + "learning_rate": 8.001831085876509e-05, + "loss": 0.006842435896396637, + "step": 140830 + }, + { + "epoch": 19.991483321504614, + "grad_norm": 0.008325074799358845, + "learning_rate": 8.00168914123492e-05, + "loss": 0.006324427574872971, + "step": 140840 + }, + { + "epoch": 19.992902767920512, + "grad_norm": 0.2477165162563324, + "learning_rate": 8.001547196593328e-05, + "loss": 0.02415194809436798, + "step": 140850 + }, + { + "epoch": 19.99432221433641, + "grad_norm": 0.03715141862630844, + "learning_rate": 8.001405251951739e-05, + "loss": 0.015509502589702606, + "step": 140860 + }, + { + "epoch": 
19.995741660752305, + "grad_norm": 0.06404056400060654, + "learning_rate": 8.001263307310149e-05, + "loss": 0.026072847843170165, + "step": 140870 + }, + { + "epoch": 19.997161107168203, + "grad_norm": 3.6626203060150146, + "learning_rate": 8.00112136266856e-05, + "loss": 0.016191501915454865, + "step": 140880 + }, + { + "epoch": 19.9985805535841, + "grad_norm": 4.256667137145996, + "learning_rate": 8.000979418026971e-05, + "loss": 0.00894407331943512, + "step": 140890 + }, + { + "epoch": 20.0, + "grad_norm": 0.0740240067243576, + "learning_rate": 8.00083747338538e-05, + "loss": 0.015609632432460784, + "step": 140900 + }, + { + "epoch": 20.0014194464159, + "grad_norm": 0.22740566730499268, + "learning_rate": 8.00069552874379e-05, + "loss": 0.041918623447418216, + "step": 140910 + }, + { + "epoch": 20.002838892831797, + "grad_norm": 0.4715220630168915, + "learning_rate": 8.0005535841022e-05, + "loss": 0.02898731529712677, + "step": 140920 + }, + { + "epoch": 20.004258339247695, + "grad_norm": 19.265684127807617, + "learning_rate": 8.000411639460611e-05, + "loss": 0.05128769874572754, + "step": 140930 + }, + { + "epoch": 20.00567778566359, + "grad_norm": 4.531553268432617, + "learning_rate": 8.000269694819021e-05, + "loss": 0.08501461148262024, + "step": 140940 + }, + { + "epoch": 20.007097232079488, + "grad_norm": 0.04022873565554619, + "learning_rate": 8.000127750177431e-05, + "loss": 0.03906202912330627, + "step": 140950 + }, + { + "epoch": 20.008516678495386, + "grad_norm": 0.02828882448375225, + "learning_rate": 7.99998580553584e-05, + "loss": 0.062254679203033444, + "step": 140960 + }, + { + "epoch": 20.009936124911285, + "grad_norm": 0.7951211333274841, + "learning_rate": 7.999843860894252e-05, + "loss": 0.012028510868549346, + "step": 140970 + }, + { + "epoch": 20.011355571327183, + "grad_norm": 0.1233096718788147, + "learning_rate": 7.999701916252663e-05, + "loss": 0.008423009514808654, + "step": 140980 + }, + { + "epoch": 20.01277501774308, + "grad_norm": 
0.4016595780849457, + "learning_rate": 7.999559971611072e-05, + "loss": 0.009395520389080047, + "step": 140990 + }, + { + "epoch": 20.01419446415898, + "grad_norm": 1.084055781364441, + "learning_rate": 7.999418026969482e-05, + "loss": 0.015512244403362274, + "step": 141000 + }, + { + "epoch": 20.01419446415898, + "eval_accuracy": 0.9862656577859732, + "eval_loss": 0.05205647274851799, + "eval_runtime": 33.1367, + "eval_samples_per_second": 474.609, + "eval_steps_per_second": 14.848, + "step": 141000 + }, + { + "epoch": 20.015613910574874, + "grad_norm": 0.015475750900804996, + "learning_rate": 7.999276082327892e-05, + "loss": 0.028347843885421754, + "step": 141010 + }, + { + "epoch": 20.017033356990773, + "grad_norm": 6.349401950836182, + "learning_rate": 7.999134137686303e-05, + "loss": 0.031047925353050232, + "step": 141020 + }, + { + "epoch": 20.01845280340667, + "grad_norm": 5.873929977416992, + "learning_rate": 7.998992193044713e-05, + "loss": 0.00811959058046341, + "step": 141030 + }, + { + "epoch": 20.01987224982257, + "grad_norm": 0.0425172820687294, + "learning_rate": 7.998850248403124e-05, + "loss": 0.004664409905672073, + "step": 141040 + }, + { + "epoch": 20.021291696238467, + "grad_norm": 0.10378196835517883, + "learning_rate": 7.998708303761532e-05, + "loss": 0.016263917088508606, + "step": 141050 + }, + { + "epoch": 20.022711142654366, + "grad_norm": 0.8236654996871948, + "learning_rate": 7.998566359119943e-05, + "loss": 0.011845842748880387, + "step": 141060 + }, + { + "epoch": 20.024130589070264, + "grad_norm": 5.560585021972656, + "learning_rate": 7.998424414478354e-05, + "loss": 0.008132990449666977, + "step": 141070 + }, + { + "epoch": 20.02555003548616, + "grad_norm": 0.8124029636383057, + "learning_rate": 7.998282469836764e-05, + "loss": 0.008311792463064193, + "step": 141080 + }, + { + "epoch": 20.026969481902057, + "grad_norm": 0.23469218611717224, + "learning_rate": 7.998140525195175e-05, + "loss": 0.001304752752184868, + "step": 141090 + 
}, + { + "epoch": 20.028388928317955, + "grad_norm": 0.8229203820228577, + "learning_rate": 7.997998580553584e-05, + "loss": 0.0024736978113651274, + "step": 141100 + }, + { + "epoch": 20.029808374733854, + "grad_norm": 0.11447346210479736, + "learning_rate": 7.997856635911995e-05, + "loss": 0.04228949248790741, + "step": 141110 + }, + { + "epoch": 20.031227821149752, + "grad_norm": 0.06373047083616257, + "learning_rate": 7.997714691270404e-05, + "loss": 0.009620143473148346, + "step": 141120 + }, + { + "epoch": 20.03264726756565, + "grad_norm": 8.882691383361816, + "learning_rate": 7.997572746628816e-05, + "loss": 0.05514065623283386, + "step": 141130 + }, + { + "epoch": 20.03406671398155, + "grad_norm": 0.018412787467241287, + "learning_rate": 7.997430801987225e-05, + "loss": 0.011524337530136108, + "step": 141140 + }, + { + "epoch": 20.035486160397443, + "grad_norm": 0.012451624497771263, + "learning_rate": 7.997288857345635e-05, + "loss": 0.00642521008849144, + "step": 141150 + }, + { + "epoch": 20.03690560681334, + "grad_norm": 2.8071608543395996, + "learning_rate": 7.997146912704046e-05, + "loss": 0.014402301609516143, + "step": 141160 + }, + { + "epoch": 20.03832505322924, + "grad_norm": 4.811960697174072, + "learning_rate": 7.997004968062456e-05, + "loss": 0.010839483886957168, + "step": 141170 + }, + { + "epoch": 20.03974449964514, + "grad_norm": 0.11822403967380524, + "learning_rate": 7.996863023420867e-05, + "loss": 0.017814382910728455, + "step": 141180 + }, + { + "epoch": 20.041163946061037, + "grad_norm": 0.0183568075299263, + "learning_rate": 7.996721078779277e-05, + "loss": 0.041589167714118955, + "step": 141190 + }, + { + "epoch": 20.042583392476935, + "grad_norm": 0.5206476449966431, + "learning_rate": 7.996579134137688e-05, + "loss": 0.026238131523132324, + "step": 141200 + }, + { + "epoch": 20.044002838892833, + "grad_norm": 1.3223717212677002, + "learning_rate": 7.996437189496096e-05, + "loss": 0.004569841176271438, + "step": 141210 + }, + { + 
"epoch": 20.045422285308728, + "grad_norm": 0.18440695106983185, + "learning_rate": 7.996295244854507e-05, + "loss": 0.013369666039943695, + "step": 141220 + }, + { + "epoch": 20.046841731724626, + "grad_norm": 0.3286435306072235, + "learning_rate": 7.996153300212917e-05, + "loss": 0.035613265633583066, + "step": 141230 + }, + { + "epoch": 20.048261178140525, + "grad_norm": 3.6914515495300293, + "learning_rate": 7.996011355571328e-05, + "loss": 0.0053454674780368805, + "step": 141240 + }, + { + "epoch": 20.049680624556423, + "grad_norm": 1.9531716108322144, + "learning_rate": 7.995869410929738e-05, + "loss": 0.02335241883993149, + "step": 141250 + }, + { + "epoch": 20.05110007097232, + "grad_norm": 1.275848388671875, + "learning_rate": 7.995727466288148e-05, + "loss": 0.0027617398649454118, + "step": 141260 + }, + { + "epoch": 20.05251951738822, + "grad_norm": 0.027349628508090973, + "learning_rate": 7.995585521646559e-05, + "loss": 0.008489987254142762, + "step": 141270 + }, + { + "epoch": 20.053938963804118, + "grad_norm": 0.4223189949989319, + "learning_rate": 7.995443577004968e-05, + "loss": 0.015506541728973389, + "step": 141280 + }, + { + "epoch": 20.055358410220013, + "grad_norm": 0.24928779900074005, + "learning_rate": 7.99530163236338e-05, + "loss": 0.0367767333984375, + "step": 141290 + }, + { + "epoch": 20.05677785663591, + "grad_norm": 0.03538127243518829, + "learning_rate": 7.995159687721789e-05, + "loss": 0.005999685451388359, + "step": 141300 + }, + { + "epoch": 20.05819730305181, + "grad_norm": 0.048333484679460526, + "learning_rate": 7.995017743080199e-05, + "loss": 0.008087261021137238, + "step": 141310 + }, + { + "epoch": 20.059616749467708, + "grad_norm": 2.40273118019104, + "learning_rate": 7.994875798438609e-05, + "loss": 0.024593907594680785, + "step": 141320 + }, + { + "epoch": 20.061036195883606, + "grad_norm": 0.033067528158426285, + "learning_rate": 7.99473385379702e-05, + "loss": 0.03219816386699677, + "step": 141330 + }, + { + "epoch": 
20.062455642299504, + "grad_norm": 8.978111267089844, + "learning_rate": 7.994606103619588e-05, + "loss": 0.048678803443908694, + "step": 141340 + }, + { + "epoch": 20.063875088715402, + "grad_norm": 5.184823513031006, + "learning_rate": 7.994464158978e-05, + "loss": 0.01843302547931671, + "step": 141350 + }, + { + "epoch": 20.065294535131297, + "grad_norm": 0.6802540421485901, + "learning_rate": 7.994322214336409e-05, + "loss": 0.006856510043144226, + "step": 141360 + }, + { + "epoch": 20.066713981547196, + "grad_norm": 6.415282726287842, + "learning_rate": 7.99418026969482e-05, + "loss": 0.03161287009716034, + "step": 141370 + }, + { + "epoch": 20.068133427963094, + "grad_norm": 0.6133005619049072, + "learning_rate": 7.994038325053229e-05, + "loss": 0.003521961346268654, + "step": 141380 + }, + { + "epoch": 20.069552874378992, + "grad_norm": 0.5762439370155334, + "learning_rate": 7.99389638041164e-05, + "loss": 0.008870533108711243, + "step": 141390 + }, + { + "epoch": 20.07097232079489, + "grad_norm": 0.41646623611450195, + "learning_rate": 7.99375443577005e-05, + "loss": 0.011471281200647354, + "step": 141400 + }, + { + "epoch": 20.07239176721079, + "grad_norm": 1.1015546321868896, + "learning_rate": 7.99361249112846e-05, + "loss": 0.015262497961521149, + "step": 141410 + }, + { + "epoch": 20.073811213626687, + "grad_norm": 0.1063433587551117, + "learning_rate": 7.99347054648687e-05, + "loss": 0.007727238535881043, + "step": 141420 + }, + { + "epoch": 20.075230660042582, + "grad_norm": 0.02339489385485649, + "learning_rate": 7.99332860184528e-05, + "loss": 0.009775744378566742, + "step": 141430 + }, + { + "epoch": 20.07665010645848, + "grad_norm": 0.04368114471435547, + "learning_rate": 7.993186657203691e-05, + "loss": 0.008957654982805253, + "step": 141440 + }, + { + "epoch": 20.07806955287438, + "grad_norm": 0.021869715303182602, + "learning_rate": 7.993044712562101e-05, + "loss": 0.036583393812179565, + "step": 141450 + }, + { + "epoch": 20.079488999290277, 
+ "grad_norm": 0.16165511310100555, + "learning_rate": 7.992902767920512e-05, + "loss": 0.013248316943645477, + "step": 141460 + }, + { + "epoch": 20.080908445706175, + "grad_norm": 0.0021154037676751614, + "learning_rate": 7.992760823278922e-05, + "loss": 0.020379316806793214, + "step": 141470 + }, + { + "epoch": 20.082327892122073, + "grad_norm": 1.236534595489502, + "learning_rate": 7.992618878637331e-05, + "loss": 0.01821013242006302, + "step": 141480 + }, + { + "epoch": 20.08374733853797, + "grad_norm": 0.05037076026201248, + "learning_rate": 7.992476933995741e-05, + "loss": 0.008942946791648865, + "step": 141490 + }, + { + "epoch": 20.085166784953866, + "grad_norm": 20.41444206237793, + "learning_rate": 7.992334989354152e-05, + "loss": 0.041226530075073244, + "step": 141500 + }, + { + "epoch": 20.085166784953866, + "eval_accuracy": 0.9863292427036306, + "eval_loss": 0.048269789665937424, + "eval_runtime": 33.2933, + "eval_samples_per_second": 472.377, + "eval_steps_per_second": 14.778, + "step": 141500 + }, + { + "epoch": 20.086586231369765, + "grad_norm": 2.1929523944854736, + "learning_rate": 7.992193044712562e-05, + "loss": 0.05169554948806763, + "step": 141510 + }, + { + "epoch": 20.088005677785663, + "grad_norm": 0.07201051712036133, + "learning_rate": 7.992051100070973e-05, + "loss": 0.005569947138428688, + "step": 141520 + }, + { + "epoch": 20.08942512420156, + "grad_norm": 0.03781836852431297, + "learning_rate": 7.991909155429383e-05, + "loss": 0.012127821147441865, + "step": 141530 + }, + { + "epoch": 20.09084457061746, + "grad_norm": 0.1598953753709793, + "learning_rate": 7.991767210787793e-05, + "loss": 0.01272830069065094, + "step": 141540 + }, + { + "epoch": 20.092264017033358, + "grad_norm": 0.372420072555542, + "learning_rate": 7.991625266146204e-05, + "loss": 0.0067143462598323825, + "step": 141550 + }, + { + "epoch": 20.093683463449256, + "grad_norm": 0.2735583484172821, + "learning_rate": 7.991483321504613e-05, + "loss": 
0.019540132582187654, + "step": 141560 + }, + { + "epoch": 20.09510290986515, + "grad_norm": 0.10384074598550797, + "learning_rate": 7.991341376863024e-05, + "loss": 0.01544135957956314, + "step": 141570 + }, + { + "epoch": 20.09652235628105, + "grad_norm": 0.02889949269592762, + "learning_rate": 7.991199432221434e-05, + "loss": 0.03289896249771118, + "step": 141580 + }, + { + "epoch": 20.097941802696948, + "grad_norm": 0.0637548416852951, + "learning_rate": 7.991057487579844e-05, + "loss": 0.013479314744472504, + "step": 141590 + }, + { + "epoch": 20.099361249112846, + "grad_norm": 0.03864375129342079, + "learning_rate": 7.990915542938254e-05, + "loss": 0.011584682017564773, + "step": 141600 + }, + { + "epoch": 20.100780695528744, + "grad_norm": 0.07262269407510757, + "learning_rate": 7.990773598296665e-05, + "loss": 0.003744155168533325, + "step": 141610 + }, + { + "epoch": 20.102200141944643, + "grad_norm": 0.9087169170379639, + "learning_rate": 7.990631653655075e-05, + "loss": 0.008905504643917084, + "step": 141620 + }, + { + "epoch": 20.10361958836054, + "grad_norm": 0.4373335838317871, + "learning_rate": 7.990489709013486e-05, + "loss": 0.0007677737623453141, + "step": 141630 + }, + { + "epoch": 20.105039034776436, + "grad_norm": 0.01339707151055336, + "learning_rate": 7.990347764371895e-05, + "loss": 0.01338742971420288, + "step": 141640 + }, + { + "epoch": 20.106458481192334, + "grad_norm": 0.09526550024747849, + "learning_rate": 7.990205819730305e-05, + "loss": 0.0082273468375206, + "step": 141650 + }, + { + "epoch": 20.107877927608232, + "grad_norm": 0.15038612484931946, + "learning_rate": 7.990063875088716e-05, + "loss": 0.016612686216831207, + "step": 141660 + }, + { + "epoch": 20.10929737402413, + "grad_norm": 0.11727338284254074, + "learning_rate": 7.989921930447126e-05, + "loss": 0.002484797686338425, + "step": 141670 + }, + { + "epoch": 20.11071682044003, + "grad_norm": 1.2446929216384888, + "learning_rate": 7.989779985805537e-05, + "loss": 
0.010968457162380218, + "step": 141680 + }, + { + "epoch": 20.112136266855927, + "grad_norm": 0.02222842164337635, + "learning_rate": 7.989638041163945e-05, + "loss": 0.04818206131458282, + "step": 141690 + }, + { + "epoch": 20.113555713271825, + "grad_norm": 0.017617546021938324, + "learning_rate": 7.989496096522356e-05, + "loss": 0.03665972352027893, + "step": 141700 + }, + { + "epoch": 20.11497515968772, + "grad_norm": 0.06581719219684601, + "learning_rate": 7.989354151880766e-05, + "loss": 0.004803193733096123, + "step": 141710 + }, + { + "epoch": 20.11639460610362, + "grad_norm": 0.05679214745759964, + "learning_rate": 7.989212207239177e-05, + "loss": 0.01601347476243973, + "step": 141720 + }, + { + "epoch": 20.117814052519517, + "grad_norm": 0.02132427506148815, + "learning_rate": 7.989070262597588e-05, + "loss": 0.01751757264137268, + "step": 141730 + }, + { + "epoch": 20.119233498935415, + "grad_norm": 0.03954348340630531, + "learning_rate": 7.988928317955997e-05, + "loss": 0.03036175072193146, + "step": 141740 + }, + { + "epoch": 20.120652945351313, + "grad_norm": 0.3142281174659729, + "learning_rate": 7.988786373314408e-05, + "loss": 0.023780320584774018, + "step": 141750 + }, + { + "epoch": 20.12207239176721, + "grad_norm": 0.015397654846310616, + "learning_rate": 7.988644428672818e-05, + "loss": 0.006399238109588623, + "step": 141760 + }, + { + "epoch": 20.12349183818311, + "grad_norm": 0.15912342071533203, + "learning_rate": 7.988502484031229e-05, + "loss": 0.007101371884346008, + "step": 141770 + }, + { + "epoch": 20.124911284599005, + "grad_norm": 0.1289181411266327, + "learning_rate": 7.988360539389638e-05, + "loss": 0.008804739266633988, + "step": 141780 + }, + { + "epoch": 20.126330731014903, + "grad_norm": 0.0552683100104332, + "learning_rate": 7.988218594748048e-05, + "loss": 0.013078097999095917, + "step": 141790 + }, + { + "epoch": 20.1277501774308, + "grad_norm": 0.024977317079901695, + "learning_rate": 7.988076650106458e-05, + "loss": 
0.05996226668357849, + "step": 141800 + }, + { + "epoch": 20.1291696238467, + "grad_norm": 0.12581050395965576, + "learning_rate": 7.987934705464869e-05, + "loss": 0.011745229363441467, + "step": 141810 + }, + { + "epoch": 20.130589070262598, + "grad_norm": 0.03531504422426224, + "learning_rate": 7.98779276082328e-05, + "loss": 0.003165086731314659, + "step": 141820 + }, + { + "epoch": 20.132008516678496, + "grad_norm": 1.4400914907455444, + "learning_rate": 7.98765081618169e-05, + "loss": 0.005033036321401596, + "step": 141830 + }, + { + "epoch": 20.133427963094395, + "grad_norm": 0.15162140130996704, + "learning_rate": 7.9875088715401e-05, + "loss": 0.02480035275220871, + "step": 141840 + }, + { + "epoch": 20.13484740951029, + "grad_norm": 0.7097915410995483, + "learning_rate": 7.987366926898509e-05, + "loss": 0.004496370628476143, + "step": 141850 + }, + { + "epoch": 20.136266855926188, + "grad_norm": 0.0464242622256279, + "learning_rate": 7.98722498225692e-05, + "loss": 0.03190518319606781, + "step": 141860 + }, + { + "epoch": 20.137686302342086, + "grad_norm": 0.0554632693529129, + "learning_rate": 7.98708303761533e-05, + "loss": 0.011931977421045303, + "step": 141870 + }, + { + "epoch": 20.139105748757984, + "grad_norm": 0.07021571695804596, + "learning_rate": 7.986941092973741e-05, + "loss": 0.005147505924105645, + "step": 141880 + }, + { + "epoch": 20.140525195173883, + "grad_norm": 0.03586012125015259, + "learning_rate": 7.986799148332151e-05, + "loss": 0.0158030703663826, + "step": 141890 + }, + { + "epoch": 20.14194464158978, + "grad_norm": 0.10487499088048935, + "learning_rate": 7.986657203690561e-05, + "loss": 0.0037853769958019257, + "step": 141900 + }, + { + "epoch": 20.14336408800568, + "grad_norm": 0.03854146599769592, + "learning_rate": 7.986515259048972e-05, + "loss": 0.007671931385993957, + "step": 141910 + }, + { + "epoch": 20.144783534421574, + "grad_norm": 0.013667808845639229, + "learning_rate": 7.986373314407382e-05, + "loss": 
0.003470684587955475, + "step": 141920 + }, + { + "epoch": 20.146202980837472, + "grad_norm": 0.34707924723625183, + "learning_rate": 7.986231369765793e-05, + "loss": 0.016699378192424775, + "step": 141930 + }, + { + "epoch": 20.14762242725337, + "grad_norm": 4.193675994873047, + "learning_rate": 7.986089425124202e-05, + "loss": 0.008417283743619918, + "step": 141940 + }, + { + "epoch": 20.14904187366927, + "grad_norm": 0.017162833362817764, + "learning_rate": 7.985947480482612e-05, + "loss": 0.0015996877104043961, + "step": 141950 + }, + { + "epoch": 20.150461320085167, + "grad_norm": 0.2931995689868927, + "learning_rate": 7.985805535841022e-05, + "loss": 0.02403400242328644, + "step": 141960 + }, + { + "epoch": 20.151880766501066, + "grad_norm": 1.2880780696868896, + "learning_rate": 7.985663591199433e-05, + "loss": 0.04142577946186066, + "step": 141970 + }, + { + "epoch": 20.153300212916964, + "grad_norm": 3.4541120529174805, + "learning_rate": 7.985521646557843e-05, + "loss": 0.014758683741092682, + "step": 141980 + }, + { + "epoch": 20.15471965933286, + "grad_norm": 0.11815626919269562, + "learning_rate": 7.985379701916254e-05, + "loss": 0.05548862814903259, + "step": 141990 + }, + { + "epoch": 20.156139105748757, + "grad_norm": 0.2253233939409256, + "learning_rate": 7.985237757274664e-05, + "loss": 0.0025531187653541564, + "step": 142000 + }, + { + "epoch": 20.156139105748757, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.04457830637693405, + "eval_runtime": 34.0845, + "eval_samples_per_second": 461.412, + "eval_steps_per_second": 14.435, + "step": 142000 + }, + { + "epoch": 20.157558552164655, + "grad_norm": 0.11514417827129364, + "learning_rate": 7.985095812633073e-05, + "loss": 0.02372538149356842, + "step": 142010 + }, + { + "epoch": 20.158977998580554, + "grad_norm": 0.01031376700848341, + "learning_rate": 7.984953867991484e-05, + "loss": 0.00249270536005497, + "step": 142020 + }, + { + "epoch": 20.160397444996452, + "grad_norm": 
0.6879292130470276, + "learning_rate": 7.984811923349894e-05, + "loss": 0.007451292872428894, + "step": 142030 + }, + { + "epoch": 20.16181689141235, + "grad_norm": 0.016475774347782135, + "learning_rate": 7.984669978708305e-05, + "loss": 0.027611124515533447, + "step": 142040 + }, + { + "epoch": 20.16323633782825, + "grad_norm": 4.053171157836914, + "learning_rate": 7.984528034066714e-05, + "loss": 0.012854620814323425, + "step": 142050 + }, + { + "epoch": 20.164655784244143, + "grad_norm": 0.009656962938606739, + "learning_rate": 7.984386089425125e-05, + "loss": 0.000679202750325203, + "step": 142060 + }, + { + "epoch": 20.16607523066004, + "grad_norm": 0.4034946858882904, + "learning_rate": 7.984244144783534e-05, + "loss": 0.019225259125232697, + "step": 142070 + }, + { + "epoch": 20.16749467707594, + "grad_norm": 5.981703758239746, + "learning_rate": 7.984102200141945e-05, + "loss": 0.0107732355594635, + "step": 142080 + }, + { + "epoch": 20.168914123491838, + "grad_norm": 5.658140659332275, + "learning_rate": 7.983960255500355e-05, + "loss": 0.01368822604417801, + "step": 142090 + }, + { + "epoch": 20.170333569907736, + "grad_norm": 0.316007137298584, + "learning_rate": 7.983818310858765e-05, + "loss": 0.011389472335577012, + "step": 142100 + }, + { + "epoch": 20.171753016323635, + "grad_norm": 0.05075006186962128, + "learning_rate": 7.983676366217176e-05, + "loss": 0.016885870695114137, + "step": 142110 + }, + { + "epoch": 20.173172462739533, + "grad_norm": 16.16810417175293, + "learning_rate": 7.983534421575586e-05, + "loss": 0.05068206787109375, + "step": 142120 + }, + { + "epoch": 20.174591909155428, + "grad_norm": 5.7081708908081055, + "learning_rate": 7.983392476933997e-05, + "loss": 0.04219783842563629, + "step": 142130 + }, + { + "epoch": 20.176011355571326, + "grad_norm": 3.024782657623291, + "learning_rate": 7.983250532292407e-05, + "loss": 0.008670754730701447, + "step": 142140 + }, + { + "epoch": 20.177430801987224, + "grad_norm": 
2.9781620502471924, + "learning_rate": 7.983108587650816e-05, + "loss": 0.06037324070930481, + "step": 142150 + }, + { + "epoch": 20.178850248403123, + "grad_norm": 0.16223609447479248, + "learning_rate": 7.982966643009226e-05, + "loss": 0.014251476526260376, + "step": 142160 + }, + { + "epoch": 20.18026969481902, + "grad_norm": 0.027019178494811058, + "learning_rate": 7.982824698367637e-05, + "loss": 0.011684049665927888, + "step": 142170 + }, + { + "epoch": 20.18168914123492, + "grad_norm": 0.6857998371124268, + "learning_rate": 7.982682753726047e-05, + "loss": 0.002962096780538559, + "step": 142180 + }, + { + "epoch": 20.183108587650818, + "grad_norm": 0.026773851364850998, + "learning_rate": 7.982540809084458e-05, + "loss": 0.010839618742465973, + "step": 142190 + }, + { + "epoch": 20.184528034066712, + "grad_norm": 10.207509994506836, + "learning_rate": 7.982398864442868e-05, + "loss": 0.00893901064991951, + "step": 142200 + }, + { + "epoch": 20.18594748048261, + "grad_norm": 0.5077146887779236, + "learning_rate": 7.982256919801277e-05, + "loss": 0.023206639289855956, + "step": 142210 + }, + { + "epoch": 20.18736692689851, + "grad_norm": 13.906847953796387, + "learning_rate": 7.982114975159689e-05, + "loss": 0.04845001697540283, + "step": 142220 + }, + { + "epoch": 20.188786373314407, + "grad_norm": 0.22066904604434967, + "learning_rate": 7.981973030518098e-05, + "loss": 0.020987828075885773, + "step": 142230 + }, + { + "epoch": 20.190205819730306, + "grad_norm": 7.529053211212158, + "learning_rate": 7.98183108587651e-05, + "loss": 0.04300286471843719, + "step": 142240 + }, + { + "epoch": 20.191625266146204, + "grad_norm": 9.46019458770752, + "learning_rate": 7.981689141234919e-05, + "loss": 0.05241010785102844, + "step": 142250 + }, + { + "epoch": 20.193044712562102, + "grad_norm": 7.071815490722656, + "learning_rate": 7.981547196593329e-05, + "loss": 0.031628957390785216, + "step": 142260 + }, + { + "epoch": 20.194464158977997, + "grad_norm": 
0.0734858512878418, + "learning_rate": 7.981405251951739e-05, + "loss": 0.01677306890487671, + "step": 142270 + }, + { + "epoch": 20.195883605393895, + "grad_norm": 0.8266683220863342, + "learning_rate": 7.98126330731015e-05, + "loss": 0.020758605003356932, + "step": 142280 + }, + { + "epoch": 20.197303051809794, + "grad_norm": 0.038721963763237, + "learning_rate": 7.98112136266856e-05, + "loss": 0.01994621455669403, + "step": 142290 + }, + { + "epoch": 20.198722498225692, + "grad_norm": 0.268017441034317, + "learning_rate": 7.98097941802697e-05, + "loss": 0.0036127448081970217, + "step": 142300 + }, + { + "epoch": 20.20014194464159, + "grad_norm": 0.9657958745956421, + "learning_rate": 7.98083747338538e-05, + "loss": 0.030071181058883668, + "step": 142310 + }, + { + "epoch": 20.20156139105749, + "grad_norm": 0.21522824466228485, + "learning_rate": 7.98070972320795e-05, + "loss": 0.0300694078207016, + "step": 142320 + }, + { + "epoch": 20.202980837473387, + "grad_norm": 0.011424322612583637, + "learning_rate": 7.980567778566359e-05, + "loss": 0.0058794297277927395, + "step": 142330 + }, + { + "epoch": 20.20440028388928, + "grad_norm": 5.839929103851318, + "learning_rate": 7.98042583392477e-05, + "loss": 0.032745689153671265, + "step": 142340 + }, + { + "epoch": 20.20581973030518, + "grad_norm": 0.03547418490052223, + "learning_rate": 7.98028388928318e-05, + "loss": 0.01504252403974533, + "step": 142350 + }, + { + "epoch": 20.207239176721078, + "grad_norm": 8.353195190429688, + "learning_rate": 7.98014194464159e-05, + "loss": 0.01957404613494873, + "step": 142360 + }, + { + "epoch": 20.208658623136976, + "grad_norm": 5.893552303314209, + "learning_rate": 7.98e-05, + "loss": 0.02026599943637848, + "step": 142370 + }, + { + "epoch": 20.210078069552875, + "grad_norm": 0.010802337899804115, + "learning_rate": 7.97985805535841e-05, + "loss": 0.024339427053928376, + "step": 142380 + }, + { + "epoch": 20.211497515968773, + "grad_norm": 11.485355377197266, + 
"learning_rate": 7.979716110716821e-05, + "loss": 0.011984516680240632, + "step": 142390 + }, + { + "epoch": 20.21291696238467, + "grad_norm": 17.191001892089844, + "learning_rate": 7.979574166075231e-05, + "loss": 0.028717640042304992, + "step": 142400 + }, + { + "epoch": 20.214336408800566, + "grad_norm": 0.20301076769828796, + "learning_rate": 7.979432221433642e-05, + "loss": 0.0018381725996732712, + "step": 142410 + }, + { + "epoch": 20.215755855216464, + "grad_norm": 21.390623092651367, + "learning_rate": 7.979290276792052e-05, + "loss": 0.043421268463134766, + "step": 142420 + }, + { + "epoch": 20.217175301632363, + "grad_norm": 11.526211738586426, + "learning_rate": 7.979148332150461e-05, + "loss": 0.04156326353549957, + "step": 142430 + }, + { + "epoch": 20.21859474804826, + "grad_norm": 0.04185756668448448, + "learning_rate": 7.979006387508871e-05, + "loss": 0.011933413147926331, + "step": 142440 + }, + { + "epoch": 20.22001419446416, + "grad_norm": 1.5703065395355225, + "learning_rate": 7.978864442867282e-05, + "loss": 0.04834843277931213, + "step": 142450 + }, + { + "epoch": 20.221433640880058, + "grad_norm": 0.4260168969631195, + "learning_rate": 7.978722498225692e-05, + "loss": 0.02226836383342743, + "step": 142460 + }, + { + "epoch": 20.222853087295956, + "grad_norm": 0.5218490958213806, + "learning_rate": 7.978580553584103e-05, + "loss": 0.001465563103556633, + "step": 142470 + }, + { + "epoch": 20.22427253371185, + "grad_norm": 0.6611945629119873, + "learning_rate": 7.978438608942513e-05, + "loss": 0.020285823941230775, + "step": 142480 + }, + { + "epoch": 20.22569198012775, + "grad_norm": 0.007459554355591536, + "learning_rate": 7.978296664300922e-05, + "loss": 0.02087102234363556, + "step": 142490 + }, + { + "epoch": 20.227111426543647, + "grad_norm": 0.334139883518219, + "learning_rate": 7.978154719659334e-05, + "loss": 0.011999078094959259, + "step": 142500 + }, + { + "epoch": 20.227111426543647, + "eval_accuracy": 0.9874737712214663, + 
"eval_loss": 0.048032622784376144, + "eval_runtime": 32.801, + "eval_samples_per_second": 479.467, + "eval_steps_per_second": 15.0, + "step": 142500 + }, + { + "epoch": 20.228530872959546, + "grad_norm": 0.02765466645359993, + "learning_rate": 7.978012775017743e-05, + "loss": 0.017795243859291078, + "step": 142510 + }, + { + "epoch": 20.229950319375444, + "grad_norm": 12.934488296508789, + "learning_rate": 7.977870830376154e-05, + "loss": 0.027302253246307372, + "step": 142520 + }, + { + "epoch": 20.231369765791342, + "grad_norm": 0.847672700881958, + "learning_rate": 7.977728885734564e-05, + "loss": 0.05470612645149231, + "step": 142530 + }, + { + "epoch": 20.23278921220724, + "grad_norm": 14.233654975891113, + "learning_rate": 7.977586941092974e-05, + "loss": 0.034137874841690063, + "step": 142540 + }, + { + "epoch": 20.234208658623135, + "grad_norm": 1.0833734273910522, + "learning_rate": 7.977444996451384e-05, + "loss": 0.0424355298280716, + "step": 142550 + }, + { + "epoch": 20.235628105039034, + "grad_norm": 0.5786467790603638, + "learning_rate": 7.977303051809795e-05, + "loss": 0.014288076758384704, + "step": 142560 + }, + { + "epoch": 20.237047551454932, + "grad_norm": 0.015368801541626453, + "learning_rate": 7.977161107168204e-05, + "loss": 0.021183985471725463, + "step": 142570 + }, + { + "epoch": 20.23846699787083, + "grad_norm": 0.07482022047042847, + "learning_rate": 7.977019162526615e-05, + "loss": 0.024730314314365388, + "step": 142580 + }, + { + "epoch": 20.23988644428673, + "grad_norm": 0.7866510152816772, + "learning_rate": 7.976877217885025e-05, + "loss": 0.006797478348016739, + "step": 142590 + }, + { + "epoch": 20.241305890702627, + "grad_norm": 8.225578308105469, + "learning_rate": 7.976735273243435e-05, + "loss": 0.03572050929069519, + "step": 142600 + }, + { + "epoch": 20.242725337118525, + "grad_norm": 0.04579515755176544, + "learning_rate": 7.976593328601846e-05, + "loss": 0.04496398270130157, + "step": 142610 + }, + { + "epoch": 
20.24414478353442, + "grad_norm": 0.04150356724858284, + "learning_rate": 7.976451383960256e-05, + "loss": 0.02562972903251648, + "step": 142620 + }, + { + "epoch": 20.24556422995032, + "grad_norm": 9.057428359985352, + "learning_rate": 7.976309439318667e-05, + "loss": 0.031835424900054934, + "step": 142630 + }, + { + "epoch": 20.246983676366217, + "grad_norm": 5.182511806488037, + "learning_rate": 7.976167494677075e-05, + "loss": 0.013751554489135741, + "step": 142640 + }, + { + "epoch": 20.248403122782115, + "grad_norm": 0.15399454534053802, + "learning_rate": 7.976025550035486e-05, + "loss": 0.007786694914102554, + "step": 142650 + }, + { + "epoch": 20.249822569198013, + "grad_norm": 0.1412489116191864, + "learning_rate": 7.975883605393896e-05, + "loss": 0.0022139832377433776, + "step": 142660 + }, + { + "epoch": 20.25124201561391, + "grad_norm": 14.100513458251953, + "learning_rate": 7.975741660752307e-05, + "loss": 0.031208738684654236, + "step": 142670 + }, + { + "epoch": 20.25266146202981, + "grad_norm": 0.028892293572425842, + "learning_rate": 7.975599716110718e-05, + "loss": 0.05569702386856079, + "step": 142680 + }, + { + "epoch": 20.254080908445705, + "grad_norm": 0.07068059593439102, + "learning_rate": 7.975457771469127e-05, + "loss": 0.005036211758852005, + "step": 142690 + }, + { + "epoch": 20.255500354861603, + "grad_norm": 0.8706439733505249, + "learning_rate": 7.975315826827538e-05, + "loss": 0.015374001860618592, + "step": 142700 + }, + { + "epoch": 20.2569198012775, + "grad_norm": 0.022880127653479576, + "learning_rate": 7.975173882185948e-05, + "loss": 0.01999637931585312, + "step": 142710 + }, + { + "epoch": 20.2583392476934, + "grad_norm": 0.12257270514965057, + "learning_rate": 7.975031937544359e-05, + "loss": 0.0037969771772623064, + "step": 142720 + }, + { + "epoch": 20.259758694109298, + "grad_norm": 0.10455180704593658, + "learning_rate": 7.974889992902768e-05, + "loss": 0.024285706877708434, + "step": 142730 + }, + { + "epoch": 
20.261178140525196, + "grad_norm": 0.08691148459911346, + "learning_rate": 7.974748048261178e-05, + "loss": 0.002531801909208298, + "step": 142740 + }, + { + "epoch": 20.262597586941094, + "grad_norm": 0.06757752597332001, + "learning_rate": 7.974606103619588e-05, + "loss": 0.010270431637763977, + "step": 142750 + }, + { + "epoch": 20.26401703335699, + "grad_norm": 10.979951858520508, + "learning_rate": 7.974464158977999e-05, + "loss": 0.05837968587875366, + "step": 142760 + }, + { + "epoch": 20.265436479772887, + "grad_norm": 0.23595213890075684, + "learning_rate": 7.97432221433641e-05, + "loss": 0.013116481900215148, + "step": 142770 + }, + { + "epoch": 20.266855926188786, + "grad_norm": 0.2715380787849426, + "learning_rate": 7.97418026969482e-05, + "loss": 0.022164252400398255, + "step": 142780 + }, + { + "epoch": 20.268275372604684, + "grad_norm": 0.4441491961479187, + "learning_rate": 7.97403832505323e-05, + "loss": 0.006470701098442078, + "step": 142790 + }, + { + "epoch": 20.269694819020582, + "grad_norm": 12.162850379943848, + "learning_rate": 7.973896380411639e-05, + "loss": 0.05165572166442871, + "step": 142800 + }, + { + "epoch": 20.27111426543648, + "grad_norm": 3.346304178237915, + "learning_rate": 7.97375443577005e-05, + "loss": 0.01856372058391571, + "step": 142810 + }, + { + "epoch": 20.27253371185238, + "grad_norm": 0.02115229330956936, + "learning_rate": 7.97361249112846e-05, + "loss": 0.005987913906574249, + "step": 142820 + }, + { + "epoch": 20.273953158268274, + "grad_norm": 0.013563952408730984, + "learning_rate": 7.973470546486871e-05, + "loss": 0.004847363010048867, + "step": 142830 + }, + { + "epoch": 20.275372604684172, + "grad_norm": 1.303944706916809, + "learning_rate": 7.97332860184528e-05, + "loss": 0.01625673323869705, + "step": 142840 + }, + { + "epoch": 20.27679205110007, + "grad_norm": 0.06267435103654861, + "learning_rate": 7.97318665720369e-05, + "loss": 0.01115289404988289, + "step": 142850 + }, + { + "epoch": 20.27821149751597, 
+ "grad_norm": 0.06709060817956924, + "learning_rate": 7.973044712562102e-05, + "loss": 0.0017483565956354142, + "step": 142860 + }, + { + "epoch": 20.279630943931867, + "grad_norm": 0.030019039288163185, + "learning_rate": 7.972902767920511e-05, + "loss": 0.012597373127937317, + "step": 142870 + }, + { + "epoch": 20.281050390347765, + "grad_norm": 0.0831553041934967, + "learning_rate": 7.972760823278923e-05, + "loss": 0.011013035476207734, + "step": 142880 + }, + { + "epoch": 20.282469836763664, + "grad_norm": 0.031748171895742416, + "learning_rate": 7.972618878637332e-05, + "loss": 0.026995158195495604, + "step": 142890 + }, + { + "epoch": 20.28388928317956, + "grad_norm": 0.1174553707242012, + "learning_rate": 7.972476933995742e-05, + "loss": 0.00546933151781559, + "step": 142900 + }, + { + "epoch": 20.285308729595457, + "grad_norm": 0.052703723311424255, + "learning_rate": 7.972334989354152e-05, + "loss": 0.005904996022582054, + "step": 142910 + }, + { + "epoch": 20.286728176011355, + "grad_norm": 0.38965511322021484, + "learning_rate": 7.972193044712563e-05, + "loss": 0.005780385807156563, + "step": 142920 + }, + { + "epoch": 20.288147622427253, + "grad_norm": 0.5876114964485168, + "learning_rate": 7.972051100070973e-05, + "loss": 0.025393232703208923, + "step": 142930 + }, + { + "epoch": 20.28956706884315, + "grad_norm": 0.2854948341846466, + "learning_rate": 7.971909155429384e-05, + "loss": 0.008964084088802338, + "step": 142940 + }, + { + "epoch": 20.29098651525905, + "grad_norm": 10.083375930786133, + "learning_rate": 7.971767210787793e-05, + "loss": 0.043013885617256165, + "step": 142950 + }, + { + "epoch": 20.292405961674948, + "grad_norm": 0.013548131100833416, + "learning_rate": 7.971625266146203e-05, + "loss": 0.01220654621720314, + "step": 142960 + }, + { + "epoch": 20.293825408090843, + "grad_norm": 0.2204940915107727, + "learning_rate": 7.971483321504614e-05, + "loss": 0.04201371967792511, + "step": 142970 + }, + { + "epoch": 20.29524485450674, + 
"grad_norm": 0.1154663935303688, + "learning_rate": 7.971341376863024e-05, + "loss": 0.03712378144264221, + "step": 142980 + }, + { + "epoch": 20.29666430092264, + "grad_norm": 0.39607420563697815, + "learning_rate": 7.971199432221435e-05, + "loss": 0.01964118182659149, + "step": 142990 + }, + { + "epoch": 20.298083747338538, + "grad_norm": 0.13066010177135468, + "learning_rate": 7.971057487579843e-05, + "loss": 0.013802319765090942, + "step": 143000 + }, + { + "epoch": 20.298083747338538, + "eval_accuracy": 0.9876009410567813, + "eval_loss": 0.04419711232185364, + "eval_runtime": 33.5958, + "eval_samples_per_second": 468.125, + "eval_steps_per_second": 14.645, + "step": 143000 + }, + { + "epoch": 20.299503193754436, + "grad_norm": 0.0710671991109848, + "learning_rate": 7.970915542938255e-05, + "loss": 0.022224968671798705, + "step": 143010 + }, + { + "epoch": 20.300922640170334, + "grad_norm": 0.3132091462612152, + "learning_rate": 7.970773598296664e-05, + "loss": 0.0013002410531044006, + "step": 143020 + }, + { + "epoch": 20.302342086586233, + "grad_norm": 0.16595591604709625, + "learning_rate": 7.970631653655075e-05, + "loss": 0.002704678475856781, + "step": 143030 + }, + { + "epoch": 20.303761533002127, + "grad_norm": 9.94991683959961, + "learning_rate": 7.970489709013485e-05, + "loss": 0.023206396400928496, + "step": 143040 + }, + { + "epoch": 20.305180979418026, + "grad_norm": 0.06891633570194244, + "learning_rate": 7.970347764371895e-05, + "loss": 0.0032977689057588576, + "step": 143050 + }, + { + "epoch": 20.306600425833924, + "grad_norm": 0.07374939322471619, + "learning_rate": 7.970205819730306e-05, + "loss": 0.0014350403100252152, + "step": 143060 + }, + { + "epoch": 20.308019872249822, + "grad_norm": 1.4631593227386475, + "learning_rate": 7.970063875088716e-05, + "loss": 0.03319612145423889, + "step": 143070 + }, + { + "epoch": 20.30943931866572, + "grad_norm": 0.02771943248808384, + "learning_rate": 7.969921930447127e-05, + "loss": 0.01646076887845993, 
+ "step": 143080 + }, + { + "epoch": 20.31085876508162, + "grad_norm": 0.8415849804878235, + "learning_rate": 7.969779985805537e-05, + "loss": 0.02362992614507675, + "step": 143090 + }, + { + "epoch": 20.312278211497517, + "grad_norm": 0.09133688360452652, + "learning_rate": 7.969638041163946e-05, + "loss": 0.003934069722890854, + "step": 143100 + }, + { + "epoch": 20.313697657913412, + "grad_norm": 0.6612482666969299, + "learning_rate": 7.969496096522356e-05, + "loss": 0.029899373650550842, + "step": 143110 + }, + { + "epoch": 20.31511710432931, + "grad_norm": 0.15536877512931824, + "learning_rate": 7.969354151880767e-05, + "loss": 0.024406220018863677, + "step": 143120 + }, + { + "epoch": 20.31653655074521, + "grad_norm": 0.09201609343290329, + "learning_rate": 7.969212207239177e-05, + "loss": 0.0193393275141716, + "step": 143130 + }, + { + "epoch": 20.317955997161107, + "grad_norm": 0.15614308416843414, + "learning_rate": 7.969070262597588e-05, + "loss": 0.008105764538049698, + "step": 143140 + }, + { + "epoch": 20.319375443577005, + "grad_norm": 5.1248040199279785, + "learning_rate": 7.968928317955998e-05, + "loss": 0.024651895463466644, + "step": 143150 + }, + { + "epoch": 20.320794889992904, + "grad_norm": 12.470382690429688, + "learning_rate": 7.968786373314407e-05, + "loss": 0.057747375965118405, + "step": 143160 + }, + { + "epoch": 20.322214336408802, + "grad_norm": 0.02708159014582634, + "learning_rate": 7.968644428672818e-05, + "loss": 0.006167247146368027, + "step": 143170 + }, + { + "epoch": 20.323633782824697, + "grad_norm": 0.27378222346305847, + "learning_rate": 7.968502484031228e-05, + "loss": 0.0270577996969223, + "step": 143180 + }, + { + "epoch": 20.325053229240595, + "grad_norm": 0.4684883654117584, + "learning_rate": 7.968360539389639e-05, + "loss": 0.007477696239948273, + "step": 143190 + }, + { + "epoch": 20.326472675656493, + "grad_norm": 8.21023941040039, + "learning_rate": 7.968218594748048e-05, + "loss": 0.024356037378311157, + "step": 
143200 + }, + { + "epoch": 20.32789212207239, + "grad_norm": 3.9998233318328857, + "learning_rate": 7.968076650106459e-05, + "loss": 0.011114828288555145, + "step": 143210 + }, + { + "epoch": 20.32931156848829, + "grad_norm": 8.465042114257812, + "learning_rate": 7.967934705464869e-05, + "loss": 0.053187215328216554, + "step": 143220 + }, + { + "epoch": 20.330731014904188, + "grad_norm": 1.8651872873306274, + "learning_rate": 7.96779276082328e-05, + "loss": 0.008612476289272308, + "step": 143230 + }, + { + "epoch": 20.332150461320087, + "grad_norm": 0.15141363441944122, + "learning_rate": 7.96765081618169e-05, + "loss": 0.06432226300239563, + "step": 143240 + }, + { + "epoch": 20.33356990773598, + "grad_norm": 13.178853034973145, + "learning_rate": 7.9675088715401e-05, + "loss": 0.03949933350086212, + "step": 143250 + }, + { + "epoch": 20.33498935415188, + "grad_norm": 4.991683483123779, + "learning_rate": 7.96736692689851e-05, + "loss": 0.011390550434589386, + "step": 143260 + }, + { + "epoch": 20.336408800567778, + "grad_norm": 0.4497523605823517, + "learning_rate": 7.96722498225692e-05, + "loss": 0.022866718471050262, + "step": 143270 + }, + { + "epoch": 20.337828246983676, + "grad_norm": 0.2339865118265152, + "learning_rate": 7.967083037615331e-05, + "loss": 0.0036530278623104094, + "step": 143280 + }, + { + "epoch": 20.339247693399575, + "grad_norm": 0.6333643794059753, + "learning_rate": 7.966941092973741e-05, + "loss": 0.011677588522434234, + "step": 143290 + }, + { + "epoch": 20.340667139815473, + "grad_norm": 0.5095767378807068, + "learning_rate": 7.966799148332152e-05, + "loss": 0.04856514036655426, + "step": 143300 + }, + { + "epoch": 20.34208658623137, + "grad_norm": 2.0360395908355713, + "learning_rate": 7.96665720369056e-05, + "loss": 0.01791858673095703, + "step": 143310 + }, + { + "epoch": 20.343506032647266, + "grad_norm": 0.019238732755184174, + "learning_rate": 7.966515259048971e-05, + "loss": 0.019448147714138032, + "step": 143320 + }, + { + 
"epoch": 20.344925479063164, + "grad_norm": 1.314644694328308, + "learning_rate": 7.966373314407381e-05, + "loss": 0.013599833846092224, + "step": 143330 + }, + { + "epoch": 20.346344925479062, + "grad_norm": 0.051649417728185654, + "learning_rate": 7.966231369765792e-05, + "loss": 0.013281884789466857, + "step": 143340 + }, + { + "epoch": 20.34776437189496, + "grad_norm": 0.6772908568382263, + "learning_rate": 7.966089425124202e-05, + "loss": 0.011097601056098938, + "step": 143350 + }, + { + "epoch": 20.34918381831086, + "grad_norm": 1.8880468606948853, + "learning_rate": 7.965947480482612e-05, + "loss": 0.028100913763046263, + "step": 143360 + }, + { + "epoch": 20.350603264726757, + "grad_norm": 10.578641891479492, + "learning_rate": 7.965805535841023e-05, + "loss": 0.007112830132246018, + "step": 143370 + }, + { + "epoch": 20.352022711142656, + "grad_norm": 0.7287304401397705, + "learning_rate": 7.965663591199432e-05, + "loss": 0.010783711075782776, + "step": 143380 + }, + { + "epoch": 20.35344215755855, + "grad_norm": 6.590147018432617, + "learning_rate": 7.965521646557844e-05, + "loss": 0.010001247376203537, + "step": 143390 + }, + { + "epoch": 20.35486160397445, + "grad_norm": 6.245509624481201, + "learning_rate": 7.965379701916253e-05, + "loss": 0.03738590776920318, + "step": 143400 + }, + { + "epoch": 20.356281050390347, + "grad_norm": 0.07190784811973572, + "learning_rate": 7.965237757274663e-05, + "loss": 0.025264889001846313, + "step": 143410 + }, + { + "epoch": 20.357700496806245, + "grad_norm": 0.01021641492843628, + "learning_rate": 7.965095812633073e-05, + "loss": 0.012144586443901062, + "step": 143420 + }, + { + "epoch": 20.359119943222144, + "grad_norm": 0.2613297700881958, + "learning_rate": 7.964953867991484e-05, + "loss": 0.010810536146163941, + "step": 143430 + }, + { + "epoch": 20.360539389638042, + "grad_norm": 0.21913333237171173, + "learning_rate": 7.964811923349894e-05, + "loss": 0.005209186300635338, + "step": 143440 + }, + { + "epoch": 
20.36195883605394, + "grad_norm": 0.002570623531937599, + "learning_rate": 7.964669978708305e-05, + "loss": 0.027671465277671815, + "step": 143450 + }, + { + "epoch": 20.363378282469835, + "grad_norm": 0.007602290716022253, + "learning_rate": 7.964528034066714e-05, + "loss": 0.016392304003238677, + "step": 143460 + }, + { + "epoch": 20.364797728885733, + "grad_norm": 1.9662175178527832, + "learning_rate": 7.964386089425124e-05, + "loss": 0.019445177912712098, + "step": 143470 + }, + { + "epoch": 20.36621717530163, + "grad_norm": 1.7708325386047363, + "learning_rate": 7.964244144783535e-05, + "loss": 0.06833457350730895, + "step": 143480 + }, + { + "epoch": 20.36763662171753, + "grad_norm": 0.0037574139423668385, + "learning_rate": 7.964102200141945e-05, + "loss": 0.002533341571688652, + "step": 143490 + }, + { + "epoch": 20.36905606813343, + "grad_norm": 3.2528820037841797, + "learning_rate": 7.963960255500356e-05, + "loss": 0.008031685650348664, + "step": 143500 + }, + { + "epoch": 20.36905606813343, + "eval_accuracy": 0.9901443377630826, + "eval_loss": 0.03529705852270126, + "eval_runtime": 33.265, + "eval_samples_per_second": 472.779, + "eval_steps_per_second": 14.79, + "step": 143500 + }, + { + "epoch": 20.370475514549327, + "grad_norm": 1.3986564874649048, + "learning_rate": 7.963818310858764e-05, + "loss": 0.005757429823279381, + "step": 143510 + }, + { + "epoch": 20.371894960965225, + "grad_norm": 0.37375408411026, + "learning_rate": 7.963676366217176e-05, + "loss": 0.004768835753202439, + "step": 143520 + }, + { + "epoch": 20.37331440738112, + "grad_norm": 0.9124902486801147, + "learning_rate": 7.963534421575585e-05, + "loss": 0.008659066259860992, + "step": 143530 + }, + { + "epoch": 20.374733853797018, + "grad_norm": 0.4636019766330719, + "learning_rate": 7.963392476933996e-05, + "loss": 0.03807288706302643, + "step": 143540 + }, + { + "epoch": 20.376153300212916, + "grad_norm": 2.9868409633636475, + "learning_rate": 7.963250532292406e-05, + "loss": 
0.004987531527876854, + "step": 143550 + }, + { + "epoch": 20.377572746628815, + "grad_norm": 0.053384773433208466, + "learning_rate": 7.963108587650816e-05, + "loss": 0.00445505827665329, + "step": 143560 + }, + { + "epoch": 20.378992193044713, + "grad_norm": 3.059741258621216, + "learning_rate": 7.962966643009227e-05, + "loss": 0.013981804251670837, + "step": 143570 + }, + { + "epoch": 20.38041163946061, + "grad_norm": 0.02386626973748207, + "learning_rate": 7.962824698367637e-05, + "loss": 0.0010259795933961868, + "step": 143580 + }, + { + "epoch": 20.38183108587651, + "grad_norm": 0.6353252530097961, + "learning_rate": 7.962682753726048e-05, + "loss": 0.00803077220916748, + "step": 143590 + }, + { + "epoch": 20.383250532292404, + "grad_norm": 0.2397974729537964, + "learning_rate": 7.962540809084458e-05, + "loss": 0.013181793689727783, + "step": 143600 + }, + { + "epoch": 20.384669978708303, + "grad_norm": 0.4873849153518677, + "learning_rate": 7.962398864442869e-05, + "loss": 0.0353862851858139, + "step": 143610 + }, + { + "epoch": 20.3860894251242, + "grad_norm": 0.3819289207458496, + "learning_rate": 7.962256919801277e-05, + "loss": 0.013367743790149688, + "step": 143620 + }, + { + "epoch": 20.3875088715401, + "grad_norm": 0.5420555472373962, + "learning_rate": 7.962114975159688e-05, + "loss": 0.010918934643268586, + "step": 143630 + }, + { + "epoch": 20.388928317955997, + "grad_norm": 0.8265628814697266, + "learning_rate": 7.961973030518098e-05, + "loss": 0.01730220913887024, + "step": 143640 + }, + { + "epoch": 20.390347764371896, + "grad_norm": 0.17687322199344635, + "learning_rate": 7.961831085876509e-05, + "loss": 0.03857523202896118, + "step": 143650 + }, + { + "epoch": 20.391767210787794, + "grad_norm": 0.38359999656677246, + "learning_rate": 7.961689141234919e-05, + "loss": 0.013185246288776398, + "step": 143660 + }, + { + "epoch": 20.39318665720369, + "grad_norm": 0.7572456002235413, + "learning_rate": 7.961547196593328e-05, + "loss": 
0.021738988161087037, + "step": 143670 + }, + { + "epoch": 20.394606103619587, + "grad_norm": 0.933469831943512, + "learning_rate": 7.96140525195174e-05, + "loss": 0.016792310774326323, + "step": 143680 + }, + { + "epoch": 20.396025550035485, + "grad_norm": 0.48743927478790283, + "learning_rate": 7.961263307310149e-05, + "loss": 0.015995195508003233, + "step": 143690 + }, + { + "epoch": 20.397444996451384, + "grad_norm": 1.1666967868804932, + "learning_rate": 7.96112136266856e-05, + "loss": 0.006618998944759369, + "step": 143700 + }, + { + "epoch": 20.398864442867282, + "grad_norm": 0.054619308561086655, + "learning_rate": 7.96097941802697e-05, + "loss": 0.07580899000167847, + "step": 143710 + }, + { + "epoch": 20.40028388928318, + "grad_norm": 0.8266465067863464, + "learning_rate": 7.96083747338538e-05, + "loss": 0.005849643424153328, + "step": 143720 + }, + { + "epoch": 20.40170333569908, + "grad_norm": 11.851669311523438, + "learning_rate": 7.96069552874379e-05, + "loss": 0.010745181143283844, + "step": 143730 + }, + { + "epoch": 20.403122782114973, + "grad_norm": 0.21050934493541718, + "learning_rate": 7.9605535841022e-05, + "loss": 0.01756216138601303, + "step": 143740 + }, + { + "epoch": 20.40454222853087, + "grad_norm": 7.4519500732421875, + "learning_rate": 7.96041163946061e-05, + "loss": 0.048693782091140746, + "step": 143750 + }, + { + "epoch": 20.40596167494677, + "grad_norm": 0.02900480106472969, + "learning_rate": 7.960269694819021e-05, + "loss": 0.025216007232666017, + "step": 143760 + }, + { + "epoch": 20.40738112136267, + "grad_norm": 0.048225436359643936, + "learning_rate": 7.960127750177431e-05, + "loss": 0.024932587146759035, + "step": 143770 + }, + { + "epoch": 20.408800567778567, + "grad_norm": 4.381402969360352, + "learning_rate": 7.959985805535841e-05, + "loss": 0.019103142619132995, + "step": 143780 + }, + { + "epoch": 20.410220014194465, + "grad_norm": 6.140013694763184, + "learning_rate": 7.959843860894252e-05, + "loss": 
0.010969682037830353, + "step": 143790 + }, + { + "epoch": 20.411639460610363, + "grad_norm": 0.9153760075569153, + "learning_rate": 7.959701916252662e-05, + "loss": 0.032856902480125426, + "step": 143800 + }, + { + "epoch": 20.413058907026258, + "grad_norm": 13.96950626373291, + "learning_rate": 7.959559971611073e-05, + "loss": 0.05687206983566284, + "step": 143810 + }, + { + "epoch": 20.414478353442156, + "grad_norm": 0.008463529869914055, + "learning_rate": 7.959418026969481e-05, + "loss": 0.015109725296497345, + "step": 143820 + }, + { + "epoch": 20.415897799858055, + "grad_norm": 0.047428861260414124, + "learning_rate": 7.959276082327892e-05, + "loss": 0.020231030881404877, + "step": 143830 + }, + { + "epoch": 20.417317246273953, + "grad_norm": 0.282391756772995, + "learning_rate": 7.959134137686302e-05, + "loss": 0.02680096626281738, + "step": 143840 + }, + { + "epoch": 20.41873669268985, + "grad_norm": 11.744375228881836, + "learning_rate": 7.958992193044713e-05, + "loss": 0.006563518196344376, + "step": 143850 + }, + { + "epoch": 20.42015613910575, + "grad_norm": 4.867786884307861, + "learning_rate": 7.958850248403123e-05, + "loss": 0.05495727062225342, + "step": 143860 + }, + { + "epoch": 20.421575585521648, + "grad_norm": 0.4899853467941284, + "learning_rate": 7.958708303761533e-05, + "loss": 0.008291900902986527, + "step": 143870 + }, + { + "epoch": 20.422995031937543, + "grad_norm": 2.9325203895568848, + "learning_rate": 7.958566359119944e-05, + "loss": 0.01769079566001892, + "step": 143880 + }, + { + "epoch": 20.42441447835344, + "grad_norm": 2.7301363945007324, + "learning_rate": 7.958424414478353e-05, + "loss": 0.01559491604566574, + "step": 143890 + }, + { + "epoch": 20.42583392476934, + "grad_norm": 0.005212320946156979, + "learning_rate": 7.958282469836765e-05, + "loss": 0.00540030300617218, + "step": 143900 + }, + { + "epoch": 20.427253371185238, + "grad_norm": 0.1059456467628479, + "learning_rate": 7.958140525195174e-05, + "loss": 
0.030397701263427734, + "step": 143910 + }, + { + "epoch": 20.428672817601136, + "grad_norm": 1.7047607898712158, + "learning_rate": 7.957998580553584e-05, + "loss": 0.028485524654388427, + "step": 143920 + }, + { + "epoch": 20.430092264017034, + "grad_norm": 0.0941111221909523, + "learning_rate": 7.957856635911994e-05, + "loss": 0.018759424984455108, + "step": 143930 + }, + { + "epoch": 20.431511710432932, + "grad_norm": 1.2995245456695557, + "learning_rate": 7.957714691270405e-05, + "loss": 0.035907435417175296, + "step": 143940 + }, + { + "epoch": 20.432931156848827, + "grad_norm": 0.02575070969760418, + "learning_rate": 7.957572746628815e-05, + "loss": 0.029672053456306458, + "step": 143950 + }, + { + "epoch": 20.434350603264726, + "grad_norm": 0.2127975970506668, + "learning_rate": 7.957430801987226e-05, + "loss": 0.012106426805257798, + "step": 143960 + }, + { + "epoch": 20.435770049680624, + "grad_norm": 3.6614949703216553, + "learning_rate": 7.957288857345637e-05, + "loss": 0.0195004865527153, + "step": 143970 + }, + { + "epoch": 20.437189496096522, + "grad_norm": 1.0612987279891968, + "learning_rate": 7.957146912704045e-05, + "loss": 0.030395376682281493, + "step": 143980 + }, + { + "epoch": 20.43860894251242, + "grad_norm": 0.5859479904174805, + "learning_rate": 7.957004968062456e-05, + "loss": 0.039389362931251524, + "step": 143990 + }, + { + "epoch": 20.44002838892832, + "grad_norm": 2.5606186389923096, + "learning_rate": 7.956863023420866e-05, + "loss": 0.0025921862572431563, + "step": 144000 + }, + { + "epoch": 20.44002838892832, + "eval_accuracy": 0.9828320722324665, + "eval_loss": 0.061342090368270874, + "eval_runtime": 34.3117, + "eval_samples_per_second": 458.357, + "eval_steps_per_second": 14.339, + "step": 144000 + }, + { + "epoch": 20.441447835344217, + "grad_norm": 15.801836967468262, + "learning_rate": 7.956721078779277e-05, + "loss": 0.030484694242477416, + "step": 144010 + }, + { + "epoch": 20.442867281760112, + "grad_norm": 
13.656440734863281, + "learning_rate": 7.956579134137687e-05, + "loss": 0.017888091504573822, + "step": 144020 + }, + { + "epoch": 20.44428672817601, + "grad_norm": 0.3600428104400635, + "learning_rate": 7.956437189496097e-05, + "loss": 0.025508299469947815, + "step": 144030 + }, + { + "epoch": 20.44570617459191, + "grad_norm": 0.37523525953292847, + "learning_rate": 7.956295244854506e-05, + "loss": 0.018080103397369384, + "step": 144040 + }, + { + "epoch": 20.447125621007807, + "grad_norm": 1.7047333717346191, + "learning_rate": 7.956153300212917e-05, + "loss": 0.026506760716438295, + "step": 144050 + }, + { + "epoch": 20.448545067423705, + "grad_norm": 1.0215219259262085, + "learning_rate": 7.956011355571328e-05, + "loss": 0.02427029013633728, + "step": 144060 + }, + { + "epoch": 20.449964513839603, + "grad_norm": 0.22409643232822418, + "learning_rate": 7.955869410929738e-05, + "loss": 0.057889151573181155, + "step": 144070 + }, + { + "epoch": 20.4513839602555, + "grad_norm": 0.19263197481632233, + "learning_rate": 7.955727466288148e-05, + "loss": 0.02869861125946045, + "step": 144080 + }, + { + "epoch": 20.4528034066714, + "grad_norm": 0.5966265201568604, + "learning_rate": 7.955585521646558e-05, + "loss": 0.0024799294769763947, + "step": 144090 + }, + { + "epoch": 20.454222853087295, + "grad_norm": 0.09790827333927155, + "learning_rate": 7.955443577004969e-05, + "loss": 0.0106086365878582, + "step": 144100 + }, + { + "epoch": 20.455642299503193, + "grad_norm": 5.698458194732666, + "learning_rate": 7.955301632363379e-05, + "loss": 0.006442001461982727, + "step": 144110 + }, + { + "epoch": 20.45706174591909, + "grad_norm": 0.15753024816513062, + "learning_rate": 7.95515968772179e-05, + "loss": 0.013531532883644105, + "step": 144120 + }, + { + "epoch": 20.45848119233499, + "grad_norm": 0.19208881258964539, + "learning_rate": 7.955017743080198e-05, + "loss": 0.010571710765361786, + "step": 144130 + }, + { + "epoch": 20.459900638750888, + "grad_norm": 
0.400680273771286, + "learning_rate": 7.954875798438609e-05, + "loss": 0.00333230085670948, + "step": 144140 + }, + { + "epoch": 20.461320085166786, + "grad_norm": 0.12694121897220612, + "learning_rate": 7.95473385379702e-05, + "loss": 0.003826696053147316, + "step": 144150 + }, + { + "epoch": 20.462739531582685, + "grad_norm": 0.19814366102218628, + "learning_rate": 7.95459190915543e-05, + "loss": 0.05421693325042724, + "step": 144160 + }, + { + "epoch": 20.46415897799858, + "grad_norm": 1.1977726221084595, + "learning_rate": 7.954449964513841e-05, + "loss": 0.03128778040409088, + "step": 144170 + }, + { + "epoch": 20.465578424414478, + "grad_norm": 0.010190371423959732, + "learning_rate": 7.95430801987225e-05, + "loss": 0.005667523294687271, + "step": 144180 + }, + { + "epoch": 20.466997870830376, + "grad_norm": 0.5949884653091431, + "learning_rate": 7.95416607523066e-05, + "loss": 0.02206910401582718, + "step": 144190 + }, + { + "epoch": 20.468417317246274, + "grad_norm": 0.3613375127315521, + "learning_rate": 7.95402413058907e-05, + "loss": 0.002604234963655472, + "step": 144200 + }, + { + "epoch": 20.469836763662173, + "grad_norm": 0.03862883523106575, + "learning_rate": 7.953882185947481e-05, + "loss": 0.00332438163459301, + "step": 144210 + }, + { + "epoch": 20.47125621007807, + "grad_norm": 0.4510781764984131, + "learning_rate": 7.953740241305891e-05, + "loss": 0.04834697842597961, + "step": 144220 + }, + { + "epoch": 20.47267565649397, + "grad_norm": 8.12693977355957, + "learning_rate": 7.953598296664301e-05, + "loss": 0.021429724991321564, + "step": 144230 + }, + { + "epoch": 20.474095102909864, + "grad_norm": 0.24772100150585175, + "learning_rate": 7.953456352022712e-05, + "loss": 0.023217305541038513, + "step": 144240 + }, + { + "epoch": 20.475514549325762, + "grad_norm": 0.0036164431367069483, + "learning_rate": 7.953314407381122e-05, + "loss": 0.015688189864158632, + "step": 144250 + }, + { + "epoch": 20.47693399574166, + "grad_norm": 
5.6642279624938965, + "learning_rate": 7.953172462739533e-05, + "loss": 0.022077365219593047, + "step": 144260 + }, + { + "epoch": 20.47835344215756, + "grad_norm": 0.3166951835155487, + "learning_rate": 7.953030518097942e-05, + "loss": 0.010726609826087951, + "step": 144270 + }, + { + "epoch": 20.479772888573457, + "grad_norm": 0.05075109004974365, + "learning_rate": 7.952888573456352e-05, + "loss": 0.004327214881777763, + "step": 144280 + }, + { + "epoch": 20.481192334989355, + "grad_norm": 0.052158039063215256, + "learning_rate": 7.952746628814762e-05, + "loss": 0.0032896395772695542, + "step": 144290 + }, + { + "epoch": 20.482611781405254, + "grad_norm": 0.011469243094325066, + "learning_rate": 7.952604684173173e-05, + "loss": 0.002999286726117134, + "step": 144300 + }, + { + "epoch": 20.48403122782115, + "grad_norm": 1.171034812927246, + "learning_rate": 7.952462739531583e-05, + "loss": 0.02299569547176361, + "step": 144310 + }, + { + "epoch": 20.485450674237047, + "grad_norm": 14.642333984375, + "learning_rate": 7.952320794889994e-05, + "loss": 0.02856981158256531, + "step": 144320 + }, + { + "epoch": 20.486870120652945, + "grad_norm": 0.003455414902418852, + "learning_rate": 7.952178850248404e-05, + "loss": 0.006457825750112533, + "step": 144330 + }, + { + "epoch": 20.488289567068843, + "grad_norm": 1.4039028882980347, + "learning_rate": 7.952036905606813e-05, + "loss": 0.013668262958526611, + "step": 144340 + }, + { + "epoch": 20.48970901348474, + "grad_norm": 1.8155009746551514, + "learning_rate": 7.951894960965224e-05, + "loss": 0.021648672223091126, + "step": 144350 + }, + { + "epoch": 20.49112845990064, + "grad_norm": 0.5147523283958435, + "learning_rate": 7.951753016323634e-05, + "loss": 0.048282742500305176, + "step": 144360 + }, + { + "epoch": 20.49254790631654, + "grad_norm": 0.06422603875398636, + "learning_rate": 7.951611071682045e-05, + "loss": 0.0357273668050766, + "step": 144370 + }, + { + "epoch": 20.493967352732433, + "grad_norm": 
6.547656536102295, + "learning_rate": 7.951469127040455e-05, + "loss": 0.04553760290145874, + "step": 144380 + }, + { + "epoch": 20.49538679914833, + "grad_norm": 19.1754093170166, + "learning_rate": 7.951327182398865e-05, + "loss": 0.028497081995010377, + "step": 144390 + }, + { + "epoch": 20.49680624556423, + "grad_norm": 1.616147756576538, + "learning_rate": 7.951185237757274e-05, + "loss": 0.02675079107284546, + "step": 144400 + }, + { + "epoch": 20.498225691980128, + "grad_norm": 1.0593620538711548, + "learning_rate": 7.951043293115686e-05, + "loss": 0.017714273929595948, + "step": 144410 + }, + { + "epoch": 20.499645138396026, + "grad_norm": 2.143415689468384, + "learning_rate": 7.950901348474095e-05, + "loss": 0.005340157449245453, + "step": 144420 + }, + { + "epoch": 20.501064584811925, + "grad_norm": 0.31105735898017883, + "learning_rate": 7.950759403832506e-05, + "loss": 0.01819224953651428, + "step": 144430 + }, + { + "epoch": 20.502484031227823, + "grad_norm": 15.8975830078125, + "learning_rate": 7.950617459190916e-05, + "loss": 0.02267924100160599, + "step": 144440 + }, + { + "epoch": 20.503903477643718, + "grad_norm": 0.6668938398361206, + "learning_rate": 7.950475514549326e-05, + "loss": 0.024639742076396944, + "step": 144450 + }, + { + "epoch": 20.505322924059616, + "grad_norm": 0.33851656317710876, + "learning_rate": 7.950333569907737e-05, + "loss": 0.007134123146533966, + "step": 144460 + }, + { + "epoch": 20.506742370475514, + "grad_norm": 0.5393432378768921, + "learning_rate": 7.950191625266147e-05, + "loss": 0.002460606023669243, + "step": 144470 + }, + { + "epoch": 20.508161816891413, + "grad_norm": 0.0949764996767044, + "learning_rate": 7.950049680624558e-05, + "loss": 0.023367878794670106, + "step": 144480 + }, + { + "epoch": 20.50958126330731, + "grad_norm": 0.16646301746368408, + "learning_rate": 7.949907735982966e-05, + "loss": 0.013226522505283356, + "step": 144490 + }, + { + "epoch": 20.51100070972321, + "grad_norm": 7.567387104034424, 
+ "learning_rate": 7.949765791341377e-05, + "loss": 0.03316475450992584, + "step": 144500 + }, + { + "epoch": 20.51100070972321, + "eval_accuracy": 0.9884275449863292, + "eval_loss": 0.04046180099248886, + "eval_runtime": 34.0112, + "eval_samples_per_second": 462.406, + "eval_steps_per_second": 14.466, + "step": 144500 + }, + { + "epoch": 20.512420156139108, + "grad_norm": 11.487260818481445, + "learning_rate": 7.949623846699787e-05, + "loss": 0.009593375027179718, + "step": 144510 + }, + { + "epoch": 20.513839602555002, + "grad_norm": 0.2956373393535614, + "learning_rate": 7.949481902058198e-05, + "loss": 0.013197061419487, + "step": 144520 + }, + { + "epoch": 20.5152590489709, + "grad_norm": 0.6947842836380005, + "learning_rate": 7.949339957416608e-05, + "loss": 0.00269196555018425, + "step": 144530 + }, + { + "epoch": 20.5166784953868, + "grad_norm": 0.025174129754304886, + "learning_rate": 7.949198012775018e-05, + "loss": 0.0012585099786520005, + "step": 144540 + }, + { + "epoch": 20.518097941802697, + "grad_norm": 0.04389370232820511, + "learning_rate": 7.949056068133429e-05, + "loss": 0.0021505054086446763, + "step": 144550 + }, + { + "epoch": 20.519517388218595, + "grad_norm": 17.116100311279297, + "learning_rate": 7.948914123491838e-05, + "loss": 0.017475605010986328, + "step": 144560 + }, + { + "epoch": 20.520936834634494, + "grad_norm": 0.06114675849676132, + "learning_rate": 7.94877217885025e-05, + "loss": 0.021759213507175447, + "step": 144570 + }, + { + "epoch": 20.522356281050392, + "grad_norm": 0.26840707659721375, + "learning_rate": 7.948630234208659e-05, + "loss": 0.012413251399993896, + "step": 144580 + }, + { + "epoch": 20.523775727466287, + "grad_norm": 0.026500867679715157, + "learning_rate": 7.948488289567069e-05, + "loss": 0.028197860717773436, + "step": 144590 + }, + { + "epoch": 20.525195173882185, + "grad_norm": 0.005009442567825317, + "learning_rate": 7.948346344925479e-05, + "loss": 0.011039438843727111, + "step": 144600 + }, + { + 
"epoch": 20.526614620298083, + "grad_norm": 13.434609413146973, + "learning_rate": 7.94820440028389e-05, + "loss": 0.011699755489826203, + "step": 144610 + }, + { + "epoch": 20.528034066713982, + "grad_norm": 0.3532591164112091, + "learning_rate": 7.9480624556423e-05, + "loss": 0.010837162286043167, + "step": 144620 + }, + { + "epoch": 20.52945351312988, + "grad_norm": 0.33298391103744507, + "learning_rate": 7.94792051100071e-05, + "loss": 0.015803493559360504, + "step": 144630 + }, + { + "epoch": 20.53087295954578, + "grad_norm": 0.3064946234226227, + "learning_rate": 7.94777856635912e-05, + "loss": 0.008358365297317505, + "step": 144640 + }, + { + "epoch": 20.532292405961677, + "grad_norm": 0.365702748298645, + "learning_rate": 7.94763662171753e-05, + "loss": 0.0238431379199028, + "step": 144650 + }, + { + "epoch": 20.53371185237757, + "grad_norm": 0.05736573413014412, + "learning_rate": 7.947494677075941e-05, + "loss": 0.001297728344798088, + "step": 144660 + }, + { + "epoch": 20.53513129879347, + "grad_norm": 3.2892520427703857, + "learning_rate": 7.947352732434351e-05, + "loss": 0.009251207113265991, + "step": 144670 + }, + { + "epoch": 20.536550745209368, + "grad_norm": 0.2739517092704773, + "learning_rate": 7.947210787792762e-05, + "loss": 0.011083140969276428, + "step": 144680 + }, + { + "epoch": 20.537970191625266, + "grad_norm": 0.0026503384578973055, + "learning_rate": 7.947068843151172e-05, + "loss": 0.011821965128183365, + "step": 144690 + }, + { + "epoch": 20.539389638041165, + "grad_norm": 11.42822551727295, + "learning_rate": 7.946926898509582e-05, + "loss": 0.05259329080581665, + "step": 144700 + }, + { + "epoch": 20.540809084457063, + "grad_norm": 0.06336436420679092, + "learning_rate": 7.946784953867991e-05, + "loss": 0.0036733098328113555, + "step": 144710 + }, + { + "epoch": 20.54222853087296, + "grad_norm": 1.5711698532104492, + "learning_rate": 7.946643009226402e-05, + "loss": 0.020183426141738892, + "step": 144720 + }, + { + "epoch": 
20.543647977288856, + "grad_norm": 0.12674912810325623, + "learning_rate": 7.946501064584812e-05, + "loss": 0.022117115557193756, + "step": 144730 + }, + { + "epoch": 20.545067423704754, + "grad_norm": 7.3648152351379395, + "learning_rate": 7.946359119943223e-05, + "loss": 0.05172304511070251, + "step": 144740 + }, + { + "epoch": 20.546486870120653, + "grad_norm": 0.5710421800613403, + "learning_rate": 7.946217175301633e-05, + "loss": 0.004803726077079773, + "step": 144750 + }, + { + "epoch": 20.54790631653655, + "grad_norm": 0.3407939374446869, + "learning_rate": 7.946089425124203e-05, + "loss": 0.04576562941074371, + "step": 144760 + }, + { + "epoch": 20.54932576295245, + "grad_norm": 0.05488349124789238, + "learning_rate": 7.945947480482611e-05, + "loss": 0.0018111549317836762, + "step": 144770 + }, + { + "epoch": 20.550745209368348, + "grad_norm": 10.437118530273438, + "learning_rate": 7.945805535841022e-05, + "loss": 0.04636000096797943, + "step": 144780 + }, + { + "epoch": 20.552164655784246, + "grad_norm": 0.01174534484744072, + "learning_rate": 7.945663591199432e-05, + "loss": 0.012236421555280685, + "step": 144790 + }, + { + "epoch": 20.55358410220014, + "grad_norm": 0.017734697088599205, + "learning_rate": 7.945521646557843e-05, + "loss": 0.006476728618144989, + "step": 144800 + }, + { + "epoch": 20.55500354861604, + "grad_norm": 12.977668762207031, + "learning_rate": 7.945379701916253e-05, + "loss": 0.01667654812335968, + "step": 144810 + }, + { + "epoch": 20.556422995031937, + "grad_norm": 24.415279388427734, + "learning_rate": 7.945237757274663e-05, + "loss": 0.04182113707065582, + "step": 144820 + }, + { + "epoch": 20.557842441447836, + "grad_norm": 0.016579121351242065, + "learning_rate": 7.945095812633074e-05, + "loss": 0.028746408224105836, + "step": 144830 + }, + { + "epoch": 20.559261887863734, + "grad_norm": 0.32016265392303467, + "learning_rate": 7.944953867991483e-05, + "loss": 0.04389718174934387, + "step": 144840 + }, + { + "epoch": 
20.560681334279632, + "grad_norm": 2.2971858978271484, + "learning_rate": 7.944811923349894e-05, + "loss": 0.03008989095687866, + "step": 144850 + }, + { + "epoch": 20.56210078069553, + "grad_norm": 15.585360527038574, + "learning_rate": 7.944669978708304e-05, + "loss": 0.018069356679916382, + "step": 144860 + }, + { + "epoch": 20.563520227111425, + "grad_norm": 6.462601184844971, + "learning_rate": 7.944528034066714e-05, + "loss": 0.014132696390151977, + "step": 144870 + }, + { + "epoch": 20.564939673527324, + "grad_norm": 5.803984642028809, + "learning_rate": 7.944386089425124e-05, + "loss": 0.03112670183181763, + "step": 144880 + }, + { + "epoch": 20.566359119943222, + "grad_norm": 0.7162980437278748, + "learning_rate": 7.944244144783535e-05, + "loss": 0.012357431650161742, + "step": 144890 + }, + { + "epoch": 20.56777856635912, + "grad_norm": 6.502598285675049, + "learning_rate": 7.944102200141944e-05, + "loss": 0.02088506519794464, + "step": 144900 + }, + { + "epoch": 20.56919801277502, + "grad_norm": 2.168832540512085, + "learning_rate": 7.943960255500356e-05, + "loss": 0.006182302162051201, + "step": 144910 + }, + { + "epoch": 20.570617459190917, + "grad_norm": 0.8160980939865112, + "learning_rate": 7.943818310858765e-05, + "loss": 0.03159986436367035, + "step": 144920 + }, + { + "epoch": 20.572036905606815, + "grad_norm": 7.209481716156006, + "learning_rate": 7.943676366217175e-05, + "loss": 0.023209857940673827, + "step": 144930 + }, + { + "epoch": 20.57345635202271, + "grad_norm": 19.98500633239746, + "learning_rate": 7.943534421575586e-05, + "loss": 0.03503158986568451, + "step": 144940 + }, + { + "epoch": 20.574875798438608, + "grad_norm": 1.5545172691345215, + "learning_rate": 7.943392476933996e-05, + "loss": 0.005894295498728752, + "step": 144950 + }, + { + "epoch": 20.576295244854506, + "grad_norm": 0.007062634453177452, + "learning_rate": 7.943250532292407e-05, + "loss": 0.048457229137420656, + "step": 144960 + }, + { + "epoch": 20.577714691270405, 
+ "grad_norm": 0.0761469379067421, + "learning_rate": 7.943108587650817e-05, + "loss": 0.01926290839910507, + "step": 144970 + }, + { + "epoch": 20.579134137686303, + "grad_norm": 0.3928318917751312, + "learning_rate": 7.942966643009226e-05, + "loss": 0.002387086674571037, + "step": 144980 + }, + { + "epoch": 20.5805535841022, + "grad_norm": 1.768370270729065, + "learning_rate": 7.942824698367636e-05, + "loss": 0.03047075867652893, + "step": 144990 + }, + { + "epoch": 20.5819730305181, + "grad_norm": 0.02380281686782837, + "learning_rate": 7.942682753726047e-05, + "loss": 0.007473225891590119, + "step": 145000 + }, + { + "epoch": 20.5819730305181, + "eval_accuracy": 0.9804158453614803, + "eval_loss": 0.07275503128767014, + "eval_runtime": 43.1489, + "eval_samples_per_second": 364.482, + "eval_steps_per_second": 11.402, + "step": 145000 + }, + { + "epoch": 20.583392476933994, + "grad_norm": 0.06391783058643341, + "learning_rate": 7.942540809084458e-05, + "loss": 0.023229347169399263, + "step": 145010 + }, + { + "epoch": 20.584811923349893, + "grad_norm": 9.119282722473145, + "learning_rate": 7.942398864442868e-05, + "loss": 0.013885484635829925, + "step": 145020 + }, + { + "epoch": 20.58623136976579, + "grad_norm": 15.528413772583008, + "learning_rate": 7.942256919801278e-05, + "loss": 0.052690714597702026, + "step": 145030 + }, + { + "epoch": 20.58765081618169, + "grad_norm": 0.10440409183502197, + "learning_rate": 7.942114975159688e-05, + "loss": 0.034368190169334414, + "step": 145040 + }, + { + "epoch": 20.589070262597588, + "grad_norm": 3.9504480361938477, + "learning_rate": 7.941973030518099e-05, + "loss": 0.03920840620994568, + "step": 145050 + }, + { + "epoch": 20.590489709013486, + "grad_norm": 0.19848771393299103, + "learning_rate": 7.941831085876508e-05, + "loss": 0.00808703526854515, + "step": 145060 + }, + { + "epoch": 20.591909155429384, + "grad_norm": 0.538018524646759, + "learning_rate": 7.94168914123492e-05, + "loss": 0.0028334088623523713, + "step": 
145070 + }, + { + "epoch": 20.59332860184528, + "grad_norm": 9.399895668029785, + "learning_rate": 7.941547196593328e-05, + "loss": 0.026027819514274596, + "step": 145080 + }, + { + "epoch": 20.594748048261177, + "grad_norm": 13.63102912902832, + "learning_rate": 7.941405251951739e-05, + "loss": 0.02653225064277649, + "step": 145090 + }, + { + "epoch": 20.596167494677076, + "grad_norm": 0.030577469617128372, + "learning_rate": 7.94126330731015e-05, + "loss": 0.0009738571941852569, + "step": 145100 + }, + { + "epoch": 20.597586941092974, + "grad_norm": 0.3383432626724243, + "learning_rate": 7.94112136266856e-05, + "loss": 0.006517581641674042, + "step": 145110 + }, + { + "epoch": 20.599006387508872, + "grad_norm": 0.045411527156829834, + "learning_rate": 7.940979418026971e-05, + "loss": 0.02748202681541443, + "step": 145120 + }, + { + "epoch": 20.60042583392477, + "grad_norm": 0.057020217180252075, + "learning_rate": 7.940837473385379e-05, + "loss": 0.03388555943965912, + "step": 145130 + }, + { + "epoch": 20.60184528034067, + "grad_norm": 0.04674219712615013, + "learning_rate": 7.94069552874379e-05, + "loss": 0.01053178608417511, + "step": 145140 + }, + { + "epoch": 20.603264726756564, + "grad_norm": 0.1999913901090622, + "learning_rate": 7.9405535841022e-05, + "loss": 0.06322197914123535, + "step": 145150 + }, + { + "epoch": 20.604684173172462, + "grad_norm": 0.051310621201992035, + "learning_rate": 7.940411639460611e-05, + "loss": 0.013078901171684264, + "step": 145160 + }, + { + "epoch": 20.60610361958836, + "grad_norm": 0.057633381336927414, + "learning_rate": 7.940269694819021e-05, + "loss": 0.046252280473709106, + "step": 145170 + }, + { + "epoch": 20.60752306600426, + "grad_norm": 1.1152459383010864, + "learning_rate": 7.940127750177431e-05, + "loss": 0.013692560791969299, + "step": 145180 + }, + { + "epoch": 20.608942512420157, + "grad_norm": 0.6031478643417358, + "learning_rate": 7.939985805535842e-05, + "loss": 0.00519898347556591, + "step": 145190 + }, + 
{ + "epoch": 20.610361958836055, + "grad_norm": 0.12555818259716034, + "learning_rate": 7.939843860894252e-05, + "loss": 0.004649277776479721, + "step": 145200 + }, + { + "epoch": 20.611781405251953, + "grad_norm": 0.07937568426132202, + "learning_rate": 7.939701916252663e-05, + "loss": 0.01588006317615509, + "step": 145210 + }, + { + "epoch": 20.613200851667848, + "grad_norm": 0.02730713225901127, + "learning_rate": 7.939559971611072e-05, + "loss": 0.00898231491446495, + "step": 145220 + }, + { + "epoch": 20.614620298083747, + "grad_norm": 0.16898596286773682, + "learning_rate": 7.939418026969482e-05, + "loss": 0.010687188804149627, + "step": 145230 + }, + { + "epoch": 20.616039744499645, + "grad_norm": 2.987128257751465, + "learning_rate": 7.939276082327892e-05, + "loss": 0.02661990523338318, + "step": 145240 + }, + { + "epoch": 20.617459190915543, + "grad_norm": 0.052394766360521317, + "learning_rate": 7.939134137686303e-05, + "loss": 0.04456896483898163, + "step": 145250 + }, + { + "epoch": 20.61887863733144, + "grad_norm": 0.5967915654182434, + "learning_rate": 7.938992193044713e-05, + "loss": 0.004843110218644142, + "step": 145260 + }, + { + "epoch": 20.62029808374734, + "grad_norm": 0.033302005380392075, + "learning_rate": 7.938850248403124e-05, + "loss": 0.011710944026708603, + "step": 145270 + }, + { + "epoch": 20.621717530163238, + "grad_norm": 0.11341115832328796, + "learning_rate": 7.938708303761533e-05, + "loss": 0.009142975509166717, + "step": 145280 + }, + { + "epoch": 20.623136976579133, + "grad_norm": 0.08349739015102386, + "learning_rate": 7.938566359119943e-05, + "loss": 0.011193586885929108, + "step": 145290 + }, + { + "epoch": 20.62455642299503, + "grad_norm": 0.03839299455285072, + "learning_rate": 7.938424414478354e-05, + "loss": 0.003303806111216545, + "step": 145300 + }, + { + "epoch": 20.62597586941093, + "grad_norm": 0.041031140834093094, + "learning_rate": 7.938282469836764e-05, + "loss": 0.005088656768202781, + "step": 145310 + }, + { + 
"epoch": 20.627395315826828, + "grad_norm": 14.062775611877441, + "learning_rate": 7.938140525195175e-05, + "loss": 0.030301907658576967, + "step": 145320 + }, + { + "epoch": 20.628814762242726, + "grad_norm": 0.021417422220110893, + "learning_rate": 7.937998580553585e-05, + "loss": 0.043958616256713864, + "step": 145330 + }, + { + "epoch": 20.630234208658624, + "grad_norm": 0.08100397139787674, + "learning_rate": 7.937856635911995e-05, + "loss": 0.01910707950592041, + "step": 145340 + }, + { + "epoch": 20.631653655074523, + "grad_norm": 0.23383355140686035, + "learning_rate": 7.937714691270404e-05, + "loss": 0.0186216801404953, + "step": 145350 + }, + { + "epoch": 20.633073101490417, + "grad_norm": 0.22817610204219818, + "learning_rate": 7.937572746628815e-05, + "loss": 0.004916000366210938, + "step": 145360 + }, + { + "epoch": 20.634492547906316, + "grad_norm": 5.143357753753662, + "learning_rate": 7.937430801987225e-05, + "loss": 0.0041277710348367695, + "step": 145370 + }, + { + "epoch": 20.635911994322214, + "grad_norm": 1.02508544921875, + "learning_rate": 7.937288857345636e-05, + "loss": 0.02411566972732544, + "step": 145380 + }, + { + "epoch": 20.637331440738112, + "grad_norm": 0.04756942763924599, + "learning_rate": 7.937146912704046e-05, + "loss": 0.01031438484787941, + "step": 145390 + }, + { + "epoch": 20.63875088715401, + "grad_norm": 0.3243301212787628, + "learning_rate": 7.937004968062456e-05, + "loss": 0.011414211988449097, + "step": 145400 + }, + { + "epoch": 20.64017033356991, + "grad_norm": 0.07148227840662003, + "learning_rate": 7.936863023420867e-05, + "loss": 0.01592966765165329, + "step": 145410 + }, + { + "epoch": 20.641589779985807, + "grad_norm": 0.004803858697414398, + "learning_rate": 7.936721078779277e-05, + "loss": 0.002159743383526802, + "step": 145420 + }, + { + "epoch": 20.643009226401702, + "grad_norm": 0.3068103790283203, + "learning_rate": 7.936579134137688e-05, + "loss": 0.010675179958343505, + "step": 145430 + }, + { + "epoch": 
20.6444286728176, + "grad_norm": 0.023777369409799576, + "learning_rate": 7.936437189496096e-05, + "loss": 0.007066705822944641, + "step": 145440 + }, + { + "epoch": 20.6458481192335, + "grad_norm": 1.9655640125274658, + "learning_rate": 7.936295244854507e-05, + "loss": 0.018037761747837066, + "step": 145450 + }, + { + "epoch": 20.647267565649397, + "grad_norm": 8.728242874145508, + "learning_rate": 7.936153300212917e-05, + "loss": 0.012693244218826293, + "step": 145460 + }, + { + "epoch": 20.648687012065295, + "grad_norm": 1.5654006004333496, + "learning_rate": 7.936011355571328e-05, + "loss": 0.013682277500629425, + "step": 145470 + }, + { + "epoch": 20.650106458481194, + "grad_norm": 0.17269207537174225, + "learning_rate": 7.935869410929738e-05, + "loss": 0.022952693700790405, + "step": 145480 + }, + { + "epoch": 20.651525904897092, + "grad_norm": 4.186474800109863, + "learning_rate": 7.935727466288147e-05, + "loss": 0.02002708613872528, + "step": 145490 + }, + { + "epoch": 20.652945351312987, + "grad_norm": 0.6695135235786438, + "learning_rate": 7.935585521646559e-05, + "loss": 0.04722619950771332, + "step": 145500 + }, + { + "epoch": 20.652945351312987, + "eval_accuracy": 0.9896992433394799, + "eval_loss": 0.03496474400162697, + "eval_runtime": 34.2515, + "eval_samples_per_second": 459.163, + "eval_steps_per_second": 14.364, + "step": 145500 + }, + { + "epoch": 20.654364797728885, + "grad_norm": 0.21022425591945648, + "learning_rate": 7.935443577004968e-05, + "loss": 0.017057400941848756, + "step": 145510 + }, + { + "epoch": 20.655784244144783, + "grad_norm": 1.2094993591308594, + "learning_rate": 7.93530163236338e-05, + "loss": 0.04082568883895874, + "step": 145520 + }, + { + "epoch": 20.65720369056068, + "grad_norm": 0.07598985731601715, + "learning_rate": 7.935159687721789e-05, + "loss": 0.0014511864632368089, + "step": 145530 + }, + { + "epoch": 20.65862313697658, + "grad_norm": 0.40147170424461365, + "learning_rate": 7.935017743080199e-05, + "loss": 
0.0203866183757782, + "step": 145540 + }, + { + "epoch": 20.660042583392478, + "grad_norm": 0.04159606620669365, + "learning_rate": 7.934875798438609e-05, + "loss": 0.00415494367480278, + "step": 145550 + }, + { + "epoch": 20.661462029808376, + "grad_norm": 0.2567363977432251, + "learning_rate": 7.93473385379702e-05, + "loss": 0.004127321392297744, + "step": 145560 + }, + { + "epoch": 20.66288147622427, + "grad_norm": 1.3238670825958252, + "learning_rate": 7.93459190915543e-05, + "loss": 0.01999867856502533, + "step": 145570 + }, + { + "epoch": 20.66430092264017, + "grad_norm": 0.8857561945915222, + "learning_rate": 7.93444996451384e-05, + "loss": 0.003461797907948494, + "step": 145580 + }, + { + "epoch": 20.665720369056068, + "grad_norm": 0.02009095437824726, + "learning_rate": 7.93430801987225e-05, + "loss": 0.04140637814998627, + "step": 145590 + }, + { + "epoch": 20.667139815471966, + "grad_norm": 6.540339946746826, + "learning_rate": 7.93416607523066e-05, + "loss": 0.035273030400276184, + "step": 145600 + }, + { + "epoch": 20.668559261887864, + "grad_norm": 0.42791029810905457, + "learning_rate": 7.934024130589071e-05, + "loss": 0.004827521741390228, + "step": 145610 + }, + { + "epoch": 20.669978708303763, + "grad_norm": 0.9732591509819031, + "learning_rate": 7.933882185947481e-05, + "loss": 0.015463906526565551, + "step": 145620 + }, + { + "epoch": 20.67139815471966, + "grad_norm": 0.04784591868519783, + "learning_rate": 7.933740241305892e-05, + "loss": 0.00696093738079071, + "step": 145630 + }, + { + "epoch": 20.672817601135556, + "grad_norm": 0.08414292335510254, + "learning_rate": 7.9335982966643e-05, + "loss": 0.03177079856395722, + "step": 145640 + }, + { + "epoch": 20.674237047551454, + "grad_norm": 0.09434685856103897, + "learning_rate": 7.933456352022711e-05, + "loss": 0.012461913377046585, + "step": 145650 + }, + { + "epoch": 20.675656493967352, + "grad_norm": 21.005489349365234, + "learning_rate": 7.933314407381121e-05, + "loss": 
0.025967153906822204, + "step": 145660 + }, + { + "epoch": 20.67707594038325, + "grad_norm": 2.2884438037872314, + "learning_rate": 7.933172462739532e-05, + "loss": 0.014497111737728118, + "step": 145670 + }, + { + "epoch": 20.67849538679915, + "grad_norm": 0.3916836977005005, + "learning_rate": 7.933030518097942e-05, + "loss": 0.029211747646331786, + "step": 145680 + }, + { + "epoch": 20.679914833215047, + "grad_norm": 1.543169617652893, + "learning_rate": 7.932888573456353e-05, + "loss": 0.08056480884552002, + "step": 145690 + }, + { + "epoch": 20.681334279630946, + "grad_norm": 11.27370834350586, + "learning_rate": 7.932746628814763e-05, + "loss": 0.022616779804229735, + "step": 145700 + }, + { + "epoch": 20.68275372604684, + "grad_norm": 0.5747624635696411, + "learning_rate": 7.932604684173173e-05, + "loss": 0.010348500311374664, + "step": 145710 + }, + { + "epoch": 20.68417317246274, + "grad_norm": 0.09961613267660141, + "learning_rate": 7.932462739531584e-05, + "loss": 0.014371511340141297, + "step": 145720 + }, + { + "epoch": 20.685592618878637, + "grad_norm": 1.2526777982711792, + "learning_rate": 7.932320794889993e-05, + "loss": 0.017895153164863585, + "step": 145730 + }, + { + "epoch": 20.687012065294535, + "grad_norm": 0.027043156325817108, + "learning_rate": 7.932178850248404e-05, + "loss": 0.03639167845249176, + "step": 145740 + }, + { + "epoch": 20.688431511710434, + "grad_norm": 0.0645659789443016, + "learning_rate": 7.932036905606813e-05, + "loss": 0.054617387056350705, + "step": 145750 + }, + { + "epoch": 20.689850958126332, + "grad_norm": 0.044021811336278915, + "learning_rate": 7.931894960965224e-05, + "loss": 0.005638457834720612, + "step": 145760 + }, + { + "epoch": 20.69127040454223, + "grad_norm": 0.08967779576778412, + "learning_rate": 7.931753016323634e-05, + "loss": 0.03928759396076202, + "step": 145770 + }, + { + "epoch": 20.692689850958125, + "grad_norm": 4.009666919708252, + "learning_rate": 7.931611071682045e-05, + "loss": 
0.003842705860733986, + "step": 145780 + }, + { + "epoch": 20.694109297374023, + "grad_norm": 0.1355072259902954, + "learning_rate": 7.931469127040455e-05, + "loss": 0.016324999928474426, + "step": 145790 + }, + { + "epoch": 20.69552874378992, + "grad_norm": 0.63166344165802, + "learning_rate": 7.931327182398864e-05, + "loss": 0.012057775259017944, + "step": 145800 + }, + { + "epoch": 20.69694819020582, + "grad_norm": 0.08763563632965088, + "learning_rate": 7.931185237757275e-05, + "loss": 0.020572511851787566, + "step": 145810 + }, + { + "epoch": 20.698367636621718, + "grad_norm": 1.3961236476898193, + "learning_rate": 7.931043293115685e-05, + "loss": 0.025046154856681824, + "step": 145820 + }, + { + "epoch": 20.699787083037616, + "grad_norm": 0.23597484827041626, + "learning_rate": 7.930901348474096e-05, + "loss": 0.028546819090843202, + "step": 145830 + }, + { + "epoch": 20.701206529453515, + "grad_norm": 0.029362551867961884, + "learning_rate": 7.930759403832506e-05, + "loss": 0.033306199312210086, + "step": 145840 + }, + { + "epoch": 20.70262597586941, + "grad_norm": 5.759678363800049, + "learning_rate": 7.930617459190916e-05, + "loss": 0.01832512617111206, + "step": 145850 + }, + { + "epoch": 20.704045422285308, + "grad_norm": 0.11721846461296082, + "learning_rate": 7.930475514549325e-05, + "loss": 0.008271043002605439, + "step": 145860 + }, + { + "epoch": 20.705464868701206, + "grad_norm": 0.03404093161225319, + "learning_rate": 7.930333569907736e-05, + "loss": 0.00751207247376442, + "step": 145870 + }, + { + "epoch": 20.706884315117104, + "grad_norm": 0.9527551531791687, + "learning_rate": 7.930191625266146e-05, + "loss": 0.020460736751556397, + "step": 145880 + }, + { + "epoch": 20.708303761533003, + "grad_norm": 0.04391974210739136, + "learning_rate": 7.930049680624557e-05, + "loss": 0.03786468505859375, + "step": 145890 + }, + { + "epoch": 20.7097232079489, + "grad_norm": 0.14532692730426788, + "learning_rate": 7.929907735982967e-05, + "loss": 
0.015052646398544312, + "step": 145900 + }, + { + "epoch": 20.7111426543648, + "grad_norm": 0.9020943641662598, + "learning_rate": 7.929765791341377e-05, + "loss": 0.023717553913593294, + "step": 145910 + }, + { + "epoch": 20.712562100780694, + "grad_norm": 0.005768029484897852, + "learning_rate": 7.929623846699788e-05, + "loss": 0.01201062798500061, + "step": 145920 + }, + { + "epoch": 20.713981547196592, + "grad_norm": 0.6888214945793152, + "learning_rate": 7.929481902058198e-05, + "loss": 0.012487337738275529, + "step": 145930 + }, + { + "epoch": 20.71540099361249, + "grad_norm": 0.1479235738515854, + "learning_rate": 7.929339957416609e-05, + "loss": 0.03108862042427063, + "step": 145940 + }, + { + "epoch": 20.71682044002839, + "grad_norm": 0.03405297175049782, + "learning_rate": 7.929198012775017e-05, + "loss": 0.04431195259094238, + "step": 145950 + }, + { + "epoch": 20.718239886444287, + "grad_norm": 0.10078774392604828, + "learning_rate": 7.929056068133428e-05, + "loss": 0.017326073348522188, + "step": 145960 + }, + { + "epoch": 20.719659332860186, + "grad_norm": 6.214639186859131, + "learning_rate": 7.928914123491838e-05, + "loss": 0.046317586302757265, + "step": 145970 + }, + { + "epoch": 20.721078779276084, + "grad_norm": 0.32494044303894043, + "learning_rate": 7.928772178850249e-05, + "loss": 0.023650357127189638, + "step": 145980 + }, + { + "epoch": 20.72249822569198, + "grad_norm": 13.228584289550781, + "learning_rate": 7.928630234208659e-05, + "loss": 0.012420380115509033, + "step": 145990 + }, + { + "epoch": 20.723917672107877, + "grad_norm": 3.9393420219421387, + "learning_rate": 7.928488289567068e-05, + "loss": 0.007345489412546158, + "step": 146000 + }, + { + "epoch": 20.723917672107877, + "eval_accuracy": 0.9831499968207541, + "eval_loss": 0.061240628361701965, + "eval_runtime": 34.6141, + "eval_samples_per_second": 454.352, + "eval_steps_per_second": 14.214, + "step": 146000 + }, + { + "epoch": 20.725337118523775, + "grad_norm": 
0.04467616602778435, + "learning_rate": 7.92834634492548e-05, + "loss": 0.002353701740503311, + "step": 146010 + }, + { + "epoch": 20.726756564939674, + "grad_norm": 0.07547491788864136, + "learning_rate": 7.928204400283889e-05, + "loss": 0.002427778393030167, + "step": 146020 + }, + { + "epoch": 20.728176011355572, + "grad_norm": 1.7890892028808594, + "learning_rate": 7.9280624556423e-05, + "loss": 0.03683383166790009, + "step": 146030 + }, + { + "epoch": 20.72959545777147, + "grad_norm": 2.051042079925537, + "learning_rate": 7.92792051100071e-05, + "loss": 0.013651996850967407, + "step": 146040 + }, + { + "epoch": 20.73101490418737, + "grad_norm": 0.030877405777573586, + "learning_rate": 7.927778566359121e-05, + "loss": 0.03212569057941437, + "step": 146050 + }, + { + "epoch": 20.732434350603263, + "grad_norm": 8.530709266662598, + "learning_rate": 7.92763662171753e-05, + "loss": 0.017933164536952973, + "step": 146060 + }, + { + "epoch": 20.73385379701916, + "grad_norm": 5.16195821762085, + "learning_rate": 7.927494677075941e-05, + "loss": 0.01793002188205719, + "step": 146070 + }, + { + "epoch": 20.73527324343506, + "grad_norm": 0.13871033489704132, + "learning_rate": 7.92735273243435e-05, + "loss": 0.012643037736415863, + "step": 146080 + }, + { + "epoch": 20.73669268985096, + "grad_norm": 0.05193353816866875, + "learning_rate": 7.927210787792762e-05, + "loss": 0.010770949721336364, + "step": 146090 + }, + { + "epoch": 20.738112136266857, + "grad_norm": 10.728626251220703, + "learning_rate": 7.927068843151171e-05, + "loss": 0.058276236057281494, + "step": 146100 + }, + { + "epoch": 20.739531582682755, + "grad_norm": 8.37535572052002, + "learning_rate": 7.926926898509581e-05, + "loss": 0.031953093409538266, + "step": 146110 + }, + { + "epoch": 20.740951029098653, + "grad_norm": 0.041031572967767715, + "learning_rate": 7.926784953867992e-05, + "loss": 0.02155022770166397, + "step": 146120 + }, + { + "epoch": 20.742370475514548, + "grad_norm": 0.11791174113750458, 
+ "learning_rate": 7.926643009226402e-05, + "loss": 0.013758836686611176, + "step": 146130 + }, + { + "epoch": 20.743789921930446, + "grad_norm": 3.598696231842041, + "learning_rate": 7.926501064584813e-05, + "loss": 0.009702644497156142, + "step": 146140 + }, + { + "epoch": 20.745209368346345, + "grad_norm": 8.049793243408203, + "learning_rate": 7.926359119943223e-05, + "loss": 0.03341563940048218, + "step": 146150 + }, + { + "epoch": 20.746628814762243, + "grad_norm": 2.3235762119293213, + "learning_rate": 7.926217175301632e-05, + "loss": 0.023580312728881836, + "step": 146160 + }, + { + "epoch": 20.74804826117814, + "grad_norm": 2.129178285598755, + "learning_rate": 7.926075230660042e-05, + "loss": 0.01380157470703125, + "step": 146170 + }, + { + "epoch": 20.74946770759404, + "grad_norm": 6.452048301696777, + "learning_rate": 7.925933286018453e-05, + "loss": 0.03201970756053925, + "step": 146180 + }, + { + "epoch": 20.750887154009938, + "grad_norm": 1.5061073303222656, + "learning_rate": 7.925791341376863e-05, + "loss": 0.0893592119216919, + "step": 146190 + }, + { + "epoch": 20.752306600425833, + "grad_norm": 3.5834708213806152, + "learning_rate": 7.925649396735274e-05, + "loss": 0.03867372274398804, + "step": 146200 + }, + { + "epoch": 20.75372604684173, + "grad_norm": 0.40649959444999695, + "learning_rate": 7.925507452093684e-05, + "loss": 0.0099691703915596, + "step": 146210 + }, + { + "epoch": 20.75514549325763, + "grad_norm": 0.08001447468996048, + "learning_rate": 7.925365507452094e-05, + "loss": 0.024438340961933137, + "step": 146220 + }, + { + "epoch": 20.756564939673527, + "grad_norm": 0.28294801712036133, + "learning_rate": 7.925223562810505e-05, + "loss": 0.03000095784664154, + "step": 146230 + }, + { + "epoch": 20.757984386089426, + "grad_norm": 0.373675674200058, + "learning_rate": 7.925081618168914e-05, + "loss": 0.007903087139129638, + "step": 146240 + }, + { + "epoch": 20.759403832505324, + "grad_norm": 2.2916080951690674, + "learning_rate": 
7.924939673527325e-05, + "loss": 0.04177244305610657, + "step": 146250 + }, + { + "epoch": 20.760823278921222, + "grad_norm": 0.39103400707244873, + "learning_rate": 7.924797728885734e-05, + "loss": 0.03385518789291382, + "step": 146260 + }, + { + "epoch": 20.762242725337117, + "grad_norm": 0.7429710030555725, + "learning_rate": 7.924655784244145e-05, + "loss": 0.03701767921447754, + "step": 146270 + }, + { + "epoch": 20.763662171753015, + "grad_norm": 0.938109815120697, + "learning_rate": 7.924513839602555e-05, + "loss": 0.015692499279975892, + "step": 146280 + }, + { + "epoch": 20.765081618168914, + "grad_norm": 3.9352753162384033, + "learning_rate": 7.924371894960966e-05, + "loss": 0.00905241072177887, + "step": 146290 + }, + { + "epoch": 20.766501064584812, + "grad_norm": 0.03445643186569214, + "learning_rate": 7.924229950319377e-05, + "loss": 0.015830010175704956, + "step": 146300 + }, + { + "epoch": 20.76792051100071, + "grad_norm": 0.5951571464538574, + "learning_rate": 7.924088005677785e-05, + "loss": 0.022093257308006285, + "step": 146310 + }, + { + "epoch": 20.76933995741661, + "grad_norm": 1.0523383617401123, + "learning_rate": 7.923946061036196e-05, + "loss": 0.02023012936115265, + "step": 146320 + }, + { + "epoch": 20.770759403832507, + "grad_norm": 0.11734830588102341, + "learning_rate": 7.923804116394606e-05, + "loss": 0.021375299990177156, + "step": 146330 + }, + { + "epoch": 20.7721788502484, + "grad_norm": 13.957560539245605, + "learning_rate": 7.923662171753017e-05, + "loss": 0.046216410398483274, + "step": 146340 + }, + { + "epoch": 20.7735982966643, + "grad_norm": 17.897584915161133, + "learning_rate": 7.923520227111427e-05, + "loss": 0.024576073884963988, + "step": 146350 + }, + { + "epoch": 20.7750177430802, + "grad_norm": 0.12110137194395065, + "learning_rate": 7.923378282469837e-05, + "loss": 0.006710472702980042, + "step": 146360 + }, + { + "epoch": 20.776437189496097, + "grad_norm": 0.12965555489063263, + "learning_rate": 
7.923236337828246e-05, + "loss": 0.016585759818553925, + "step": 146370 + }, + { + "epoch": 20.777856635911995, + "grad_norm": 2.008803606033325, + "learning_rate": 7.923094393186657e-05, + "loss": 0.020957988500595093, + "step": 146380 + }, + { + "epoch": 20.779276082327893, + "grad_norm": 6.884129047393799, + "learning_rate": 7.922952448545069e-05, + "loss": 0.009522868692874909, + "step": 146390 + }, + { + "epoch": 20.78069552874379, + "grad_norm": 0.01557956263422966, + "learning_rate": 7.922810503903478e-05, + "loss": 0.02518046796321869, + "step": 146400 + }, + { + "epoch": 20.782114975159686, + "grad_norm": 2.624701499938965, + "learning_rate": 7.92266855926189e-05, + "loss": 0.01180010661482811, + "step": 146410 + }, + { + "epoch": 20.783534421575585, + "grad_norm": 9.77568531036377, + "learning_rate": 7.922526614620298e-05, + "loss": 0.03461946845054627, + "step": 146420 + }, + { + "epoch": 20.784953867991483, + "grad_norm": 0.02805766463279724, + "learning_rate": 7.922384669978709e-05, + "loss": 0.028170162439346315, + "step": 146430 + }, + { + "epoch": 20.78637331440738, + "grad_norm": 0.2586061358451843, + "learning_rate": 7.922242725337119e-05, + "loss": 0.0037616658955812454, + "step": 146440 + }, + { + "epoch": 20.78779276082328, + "grad_norm": 0.15174534916877747, + "learning_rate": 7.92210078069553e-05, + "loss": 0.016199553012847902, + "step": 146450 + }, + { + "epoch": 20.789212207239178, + "grad_norm": 4.853360176086426, + "learning_rate": 7.92195883605394e-05, + "loss": 0.013407911360263824, + "step": 146460 + }, + { + "epoch": 20.790631653655076, + "grad_norm": 0.007301007863134146, + "learning_rate": 7.921816891412349e-05, + "loss": 0.009662486612796783, + "step": 146470 + }, + { + "epoch": 20.79205110007097, + "grad_norm": 0.5911498069763184, + "learning_rate": 7.92167494677076e-05, + "loss": 0.004151134565472603, + "step": 146480 + }, + { + "epoch": 20.79347054648687, + "grad_norm": 5.810220241546631, + "learning_rate": 
7.92153300212917e-05, + "loss": 0.026870083808898926, + "step": 146490 + }, + { + "epoch": 20.794889992902768, + "grad_norm": 0.8876366019248962, + "learning_rate": 7.921391057487581e-05, + "loss": 0.0064541235566139225, + "step": 146500 + }, + { + "epoch": 20.794889992902768, + "eval_accuracy": 0.9862020728683156, + "eval_loss": 0.05110529065132141, + "eval_runtime": 34.6768, + "eval_samples_per_second": 453.531, + "eval_steps_per_second": 14.188, + "step": 146500 + }, + { + "epoch": 20.796309439318666, + "grad_norm": 0.02104359120130539, + "learning_rate": 7.921249112845991e-05, + "loss": 0.010167718678712846, + "step": 146510 + }, + { + "epoch": 20.797728885734564, + "grad_norm": 0.13494668900966644, + "learning_rate": 7.9211071682044e-05, + "loss": 0.026707875728607177, + "step": 146520 + }, + { + "epoch": 20.799148332150462, + "grad_norm": 0.15622884035110474, + "learning_rate": 7.92096522356281e-05, + "loss": 0.006951558589935303, + "step": 146530 + }, + { + "epoch": 20.80056777856636, + "grad_norm": 0.2711966633796692, + "learning_rate": 7.920823278921221e-05, + "loss": 0.012996454536914826, + "step": 146540 + }, + { + "epoch": 20.801987224982255, + "grad_norm": 0.3607664108276367, + "learning_rate": 7.920681334279631e-05, + "loss": 0.023054002225399016, + "step": 146550 + }, + { + "epoch": 20.803406671398154, + "grad_norm": 0.0117929857224226, + "learning_rate": 7.920539389638042e-05, + "loss": 0.015093138813972473, + "step": 146560 + }, + { + "epoch": 20.804826117814052, + "grad_norm": 0.1728912591934204, + "learning_rate": 7.920397444996452e-05, + "loss": 0.01267610639333725, + "step": 146570 + }, + { + "epoch": 20.80624556422995, + "grad_norm": 0.05420781672000885, + "learning_rate": 7.920255500354862e-05, + "loss": 0.01759723275899887, + "step": 146580 + }, + { + "epoch": 20.80766501064585, + "grad_norm": 1.4305238723754883, + "learning_rate": 7.920113555713273e-05, + "loss": 0.026539325714111328, + "step": 146590 + }, + { + "epoch": 20.809084457061747, 
+ "grad_norm": 3.561781406402588, + "learning_rate": 7.919971611071683e-05, + "loss": 0.01684565544128418, + "step": 146600 + }, + { + "epoch": 20.810503903477645, + "grad_norm": 1.0061430931091309, + "learning_rate": 7.919829666430094e-05, + "loss": 0.004090564325451851, + "step": 146610 + }, + { + "epoch": 20.81192334989354, + "grad_norm": 10.373432159423828, + "learning_rate": 7.919687721788502e-05, + "loss": 0.019665876030921937, + "step": 146620 + }, + { + "epoch": 20.81334279630944, + "grad_norm": 0.11311222612857819, + "learning_rate": 7.919545777146913e-05, + "loss": 0.06184806227684021, + "step": 146630 + }, + { + "epoch": 20.814762242725337, + "grad_norm": 0.9957504272460938, + "learning_rate": 7.919403832505323e-05, + "loss": 0.008581961691379546, + "step": 146640 + }, + { + "epoch": 20.816181689141235, + "grad_norm": 15.013198852539062, + "learning_rate": 7.919261887863734e-05, + "loss": 0.019738689064979553, + "step": 146650 + }, + { + "epoch": 20.817601135557133, + "grad_norm": 0.9187983274459839, + "learning_rate": 7.919119943222144e-05, + "loss": 0.0025506075471639633, + "step": 146660 + }, + { + "epoch": 20.81902058197303, + "grad_norm": 0.6271923780441284, + "learning_rate": 7.918977998580553e-05, + "loss": 0.028296566009521483, + "step": 146670 + }, + { + "epoch": 20.82044002838893, + "grad_norm": 0.012920745648443699, + "learning_rate": 7.918836053938965e-05, + "loss": 0.00728784054517746, + "step": 146680 + }, + { + "epoch": 20.821859474804825, + "grad_norm": 2.474107265472412, + "learning_rate": 7.918694109297374e-05, + "loss": 0.040848946571350096, + "step": 146690 + }, + { + "epoch": 20.823278921220723, + "grad_norm": 0.03071807138621807, + "learning_rate": 7.918552164655785e-05, + "loss": 0.011624724417924882, + "step": 146700 + }, + { + "epoch": 20.82469836763662, + "grad_norm": 11.327455520629883, + "learning_rate": 7.918410220014195e-05, + "loss": 0.052508091926574706, + "step": 146710 + }, + { + "epoch": 20.82611781405252, + 
"grad_norm": 7.31023645401001, + "learning_rate": 7.918268275372605e-05, + "loss": 0.01595485508441925, + "step": 146720 + }, + { + "epoch": 20.827537260468418, + "grad_norm": 0.44911283254623413, + "learning_rate": 7.918126330731015e-05, + "loss": 0.001621703803539276, + "step": 146730 + }, + { + "epoch": 20.828956706884316, + "grad_norm": 0.690433919429779, + "learning_rate": 7.917984386089426e-05, + "loss": 0.009605031460523605, + "step": 146740 + }, + { + "epoch": 20.830376153300215, + "grad_norm": 0.373314768075943, + "learning_rate": 7.917842441447835e-05, + "loss": 0.002264280617237091, + "step": 146750 + }, + { + "epoch": 20.83179559971611, + "grad_norm": 0.04561639204621315, + "learning_rate": 7.917700496806246e-05, + "loss": 0.01368083357810974, + "step": 146760 + }, + { + "epoch": 20.833215046132008, + "grad_norm": 0.4038635492324829, + "learning_rate": 7.917558552164656e-05, + "loss": 0.022652235627174378, + "step": 146770 + }, + { + "epoch": 20.834634492547906, + "grad_norm": 4.013926982879639, + "learning_rate": 7.917416607523066e-05, + "loss": 0.040308129787445066, + "step": 146780 + }, + { + "epoch": 20.836053938963804, + "grad_norm": 0.009684121236205101, + "learning_rate": 7.917274662881477e-05, + "loss": 0.007236669957637787, + "step": 146790 + }, + { + "epoch": 20.837473385379703, + "grad_norm": 0.37154409289360046, + "learning_rate": 7.917132718239887e-05, + "loss": 0.002590896561741829, + "step": 146800 + }, + { + "epoch": 20.8388928317956, + "grad_norm": 0.23550155758857727, + "learning_rate": 7.916990773598298e-05, + "loss": 0.006059055775403976, + "step": 146810 + }, + { + "epoch": 20.8403122782115, + "grad_norm": 0.15419526398181915, + "learning_rate": 7.916848828956708e-05, + "loss": 0.010148958116769791, + "step": 146820 + }, + { + "epoch": 20.841731724627394, + "grad_norm": 0.3938317596912384, + "learning_rate": 7.916706884315117e-05, + "loss": 0.007333367317914963, + "step": 146830 + }, + { + "epoch": 20.843151171043292, + "grad_norm": 
9.145828247070312, + "learning_rate": 7.916564939673527e-05, + "loss": 0.01460103690624237, + "step": 146840 + }, + { + "epoch": 20.84457061745919, + "grad_norm": 12.217960357666016, + "learning_rate": 7.916422995031938e-05, + "loss": 0.03970673084259033, + "step": 146850 + }, + { + "epoch": 20.84599006387509, + "grad_norm": 7.862465858459473, + "learning_rate": 7.916281050390348e-05, + "loss": 0.012903441488742829, + "step": 146860 + }, + { + "epoch": 20.847409510290987, + "grad_norm": 1.4578545093536377, + "learning_rate": 7.916139105748759e-05, + "loss": 0.017339283227920534, + "step": 146870 + }, + { + "epoch": 20.848828956706885, + "grad_norm": 0.0015449131606146693, + "learning_rate": 7.915997161107169e-05, + "loss": 0.016219761967658997, + "step": 146880 + }, + { + "epoch": 20.850248403122784, + "grad_norm": 0.6998650431632996, + "learning_rate": 7.915855216465578e-05, + "loss": 0.010852450132369995, + "step": 146890 + }, + { + "epoch": 20.85166784953868, + "grad_norm": 0.0549323745071888, + "learning_rate": 7.91571327182399e-05, + "loss": 0.051076120138168334, + "step": 146900 + }, + { + "epoch": 20.853087295954577, + "grad_norm": 14.543206214904785, + "learning_rate": 7.9155713271824e-05, + "loss": 0.03457919955253601, + "step": 146910 + }, + { + "epoch": 20.854506742370475, + "grad_norm": 1.7455620765686035, + "learning_rate": 7.91542938254081e-05, + "loss": 0.004024988040328026, + "step": 146920 + }, + { + "epoch": 20.855926188786373, + "grad_norm": 0.6541800498962402, + "learning_rate": 7.915287437899219e-05, + "loss": 0.038422593474388124, + "step": 146930 + }, + { + "epoch": 20.85734563520227, + "grad_norm": 0.0649171844124794, + "learning_rate": 7.91514549325763e-05, + "loss": 0.04115557968616486, + "step": 146940 + }, + { + "epoch": 20.85876508161817, + "grad_norm": 2.2254154682159424, + "learning_rate": 7.91500354861604e-05, + "loss": 0.015144921839237213, + "step": 146950 + }, + { + "epoch": 20.86018452803407, + "grad_norm": 2.6537952423095703, + 
"learning_rate": 7.914861603974451e-05, + "loss": 0.09188648462295532, + "step": 146960 + }, + { + "epoch": 20.861603974449963, + "grad_norm": 7.721174240112305, + "learning_rate": 7.91471965933286e-05, + "loss": 0.05469951629638672, + "step": 146970 + }, + { + "epoch": 20.86302342086586, + "grad_norm": 9.921685218811035, + "learning_rate": 7.91457771469127e-05, + "loss": 0.042270541191101074, + "step": 146980 + }, + { + "epoch": 20.86444286728176, + "grad_norm": 0.1609053611755371, + "learning_rate": 7.914435770049681e-05, + "loss": 0.033688592910766604, + "step": 146990 + }, + { + "epoch": 20.865862313697658, + "grad_norm": 0.05122964829206467, + "learning_rate": 7.914293825408091e-05, + "loss": 0.016573894023895263, + "step": 147000 + }, + { + "epoch": 20.865862313697658, + "eval_accuracy": 0.98995358301011, + "eval_loss": 0.0332157239317894, + "eval_runtime": 34.7743, + "eval_samples_per_second": 452.259, + "eval_steps_per_second": 14.148, + "step": 147000 + }, + { + "epoch": 20.867281760113556, + "grad_norm": 0.06537210196256638, + "learning_rate": 7.914151880766502e-05, + "loss": 0.001004798337817192, + "step": 147010 + }, + { + "epoch": 20.868701206529455, + "grad_norm": 1.1541846990585327, + "learning_rate": 7.914009936124912e-05, + "loss": 0.0018973808735609054, + "step": 147020 + }, + { + "epoch": 20.870120652945353, + "grad_norm": 0.2158127725124359, + "learning_rate": 7.913867991483322e-05, + "loss": 0.006645660102367401, + "step": 147030 + }, + { + "epoch": 20.871540099361248, + "grad_norm": 0.08106806129217148, + "learning_rate": 7.913726046841731e-05, + "loss": 0.02808656692504883, + "step": 147040 + }, + { + "epoch": 20.872959545777146, + "grad_norm": 0.567354679107666, + "learning_rate": 7.913584102200142e-05, + "loss": 0.010174166411161423, + "step": 147050 + }, + { + "epoch": 20.874378992193044, + "grad_norm": 0.9629103541374207, + "learning_rate": 7.913442157558552e-05, + "loss": 0.011855004727840424, + "step": 147060 + }, + { + "epoch": 
20.875798438608943, + "grad_norm": 0.3463072180747986, + "learning_rate": 7.913300212916963e-05, + "loss": 0.004977930709719658, + "step": 147070 + }, + { + "epoch": 20.87721788502484, + "grad_norm": 1.4517196416854858, + "learning_rate": 7.913158268275373e-05, + "loss": 0.025546705722808837, + "step": 147080 + }, + { + "epoch": 20.87863733144074, + "grad_norm": 8.010258674621582, + "learning_rate": 7.913016323633783e-05, + "loss": 0.04137132465839386, + "step": 147090 + }, + { + "epoch": 20.880056777856637, + "grad_norm": 0.08049314469099045, + "learning_rate": 7.912874378992194e-05, + "loss": 0.0032156050205230714, + "step": 147100 + }, + { + "epoch": 20.881476224272532, + "grad_norm": 0.08725004643201828, + "learning_rate": 7.912732434350604e-05, + "loss": 0.011750607937574386, + "step": 147110 + }, + { + "epoch": 20.88289567068843, + "grad_norm": 0.059840764850378036, + "learning_rate": 7.912590489709015e-05, + "loss": 0.08233913779258728, + "step": 147120 + }, + { + "epoch": 20.88431511710433, + "grad_norm": 0.019664814695715904, + "learning_rate": 7.912448545067424e-05, + "loss": 0.005721261352300644, + "step": 147130 + }, + { + "epoch": 20.885734563520227, + "grad_norm": 5.9343461990356445, + "learning_rate": 7.912306600425834e-05, + "loss": 0.025959882140159606, + "step": 147140 + }, + { + "epoch": 20.887154009936125, + "grad_norm": 19.820329666137695, + "learning_rate": 7.912164655784244e-05, + "loss": 0.014830124378204346, + "step": 147150 + }, + { + "epoch": 20.888573456352024, + "grad_norm": 0.029194775968790054, + "learning_rate": 7.912036905606814e-05, + "loss": 0.024605154991149902, + "step": 147160 + }, + { + "epoch": 20.889992902767922, + "grad_norm": 2.7611563205718994, + "learning_rate": 7.911894960965223e-05, + "loss": 0.03203037977218628, + "step": 147170 + }, + { + "epoch": 20.891412349183817, + "grad_norm": 3.295513868331909, + "learning_rate": 7.911753016323635e-05, + "loss": 0.013078141212463378, + "step": 147180 + }, + { + "epoch": 
20.892831795599715, + "grad_norm": 0.13321231305599213, + "learning_rate": 7.911611071682044e-05, + "loss": 0.03065497279167175, + "step": 147190 + }, + { + "epoch": 20.894251242015613, + "grad_norm": 6.127897262573242, + "learning_rate": 7.911469127040455e-05, + "loss": 0.018430909514427184, + "step": 147200 + }, + { + "epoch": 20.89567068843151, + "grad_norm": 0.07745135575532913, + "learning_rate": 7.911327182398864e-05, + "loss": 0.007900258898735047, + "step": 147210 + }, + { + "epoch": 20.89709013484741, + "grad_norm": 7.25475549697876, + "learning_rate": 7.911185237757275e-05, + "loss": 0.015114283561706543, + "step": 147220 + }, + { + "epoch": 20.89850958126331, + "grad_norm": 8.938583374023438, + "learning_rate": 7.911043293115685e-05, + "loss": 0.008583293855190277, + "step": 147230 + }, + { + "epoch": 20.899929027679207, + "grad_norm": 1.1823887825012207, + "learning_rate": 7.910901348474096e-05, + "loss": 0.00854203775525093, + "step": 147240 + }, + { + "epoch": 20.9013484740951, + "grad_norm": 0.05185272544622421, + "learning_rate": 7.910759403832507e-05, + "loss": 0.04141756296157837, + "step": 147250 + }, + { + "epoch": 20.902767920511, + "grad_norm": 0.06545621156692505, + "learning_rate": 7.910617459190915e-05, + "loss": 0.05727676153182983, + "step": 147260 + }, + { + "epoch": 20.904187366926898, + "grad_norm": 0.1190522238612175, + "learning_rate": 7.910475514549326e-05, + "loss": 0.016204726696014405, + "step": 147270 + }, + { + "epoch": 20.905606813342796, + "grad_norm": 0.022433854639530182, + "learning_rate": 7.910333569907736e-05, + "loss": 0.014434719085693359, + "step": 147280 + }, + { + "epoch": 20.907026259758695, + "grad_norm": 0.2797408998012543, + "learning_rate": 7.910191625266147e-05, + "loss": 0.003265124186873436, + "step": 147290 + }, + { + "epoch": 20.908445706174593, + "grad_norm": 0.14977967739105225, + "learning_rate": 7.910049680624557e-05, + "loss": 0.03801598250865936, + "step": 147300 + }, + { + "epoch": 
20.90986515259049, + "grad_norm": 0.6391460299491882, + "learning_rate": 7.909907735982967e-05, + "loss": 0.032560572028160095, + "step": 147310 + }, + { + "epoch": 20.911284599006386, + "grad_norm": 3.5316262245178223, + "learning_rate": 7.909765791341376e-05, + "loss": 0.015275755524635315, + "step": 147320 + }, + { + "epoch": 20.912704045422284, + "grad_norm": 0.3526131510734558, + "learning_rate": 7.909623846699787e-05, + "loss": 0.01211082860827446, + "step": 147330 + }, + { + "epoch": 20.914123491838183, + "grad_norm": 1.025555968284607, + "learning_rate": 7.909481902058198e-05, + "loss": 0.027760547399520875, + "step": 147340 + }, + { + "epoch": 20.91554293825408, + "grad_norm": 0.5729354619979858, + "learning_rate": 7.909339957416608e-05, + "loss": 0.05583299398422241, + "step": 147350 + }, + { + "epoch": 20.91696238466998, + "grad_norm": 0.009218241088092327, + "learning_rate": 7.909198012775018e-05, + "loss": 0.004211675003170967, + "step": 147360 + }, + { + "epoch": 20.918381831085878, + "grad_norm": 0.9458274245262146, + "learning_rate": 7.909056068133428e-05, + "loss": 0.05659654140472412, + "step": 147370 + }, + { + "epoch": 20.919801277501776, + "grad_norm": 0.42036503553390503, + "learning_rate": 7.908914123491839e-05, + "loss": 0.03894461989402771, + "step": 147380 + }, + { + "epoch": 20.92122072391767, + "grad_norm": 10.208904266357422, + "learning_rate": 7.908772178850249e-05, + "loss": 0.016311009228229523, + "step": 147390 + }, + { + "epoch": 20.92264017033357, + "grad_norm": 0.01734880730509758, + "learning_rate": 7.90863023420866e-05, + "loss": 0.031048858165740968, + "step": 147400 + }, + { + "epoch": 20.924059616749467, + "grad_norm": 0.019519660621881485, + "learning_rate": 7.90848828956707e-05, + "loss": 0.028166115283966064, + "step": 147410 + }, + { + "epoch": 20.925479063165366, + "grad_norm": 1.4088938236236572, + "learning_rate": 7.908346344925479e-05, + "loss": 0.01879177838563919, + "step": 147420 + }, + { + "epoch": 
20.926898509581264, + "grad_norm": 0.06329909712076187, + "learning_rate": 7.90820440028389e-05, + "loss": 0.036617633700370786, + "step": 147430 + }, + { + "epoch": 20.928317955997162, + "grad_norm": 0.5240737199783325, + "learning_rate": 7.9080624556423e-05, + "loss": 0.009767904877662659, + "step": 147440 + }, + { + "epoch": 20.92973740241306, + "grad_norm": 5.202698230743408, + "learning_rate": 7.907920511000711e-05, + "loss": 0.009432019293308258, + "step": 147450 + }, + { + "epoch": 20.931156848828955, + "grad_norm": 11.060153007507324, + "learning_rate": 7.907778566359121e-05, + "loss": 0.013427031040191651, + "step": 147460 + }, + { + "epoch": 20.932576295244854, + "grad_norm": 0.8927894830703735, + "learning_rate": 7.90763662171753e-05, + "loss": 0.015002116560935974, + "step": 147470 + }, + { + "epoch": 20.933995741660752, + "grad_norm": 0.030140064656734467, + "learning_rate": 7.90749467707594e-05, + "loss": 0.01904297322034836, + "step": 147480 + }, + { + "epoch": 20.93541518807665, + "grad_norm": 11.515298843383789, + "learning_rate": 7.907352732434351e-05, + "loss": 0.014468705654144287, + "step": 147490 + }, + { + "epoch": 20.93683463449255, + "grad_norm": 1.8394439220428467, + "learning_rate": 7.907210787792761e-05, + "loss": 0.01816985011100769, + "step": 147500 + }, + { + "epoch": 20.93683463449255, + "eval_accuracy": 0.9853118840211101, + "eval_loss": 0.059090834110975266, + "eval_runtime": 33.5044, + "eval_samples_per_second": 469.401, + "eval_steps_per_second": 14.685, + "step": 147500 + }, + { + "epoch": 20.938254080908447, + "grad_norm": 1.7218375205993652, + "learning_rate": 7.907068843151172e-05, + "loss": 0.007790523767471314, + "step": 147510 + }, + { + "epoch": 20.939673527324345, + "grad_norm": 2.2282238006591797, + "learning_rate": 7.906926898509582e-05, + "loss": 0.024518656730651855, + "step": 147520 + }, + { + "epoch": 20.94109297374024, + "grad_norm": 7.747946262359619, + "learning_rate": 7.906784953867992e-05, + "loss": 
0.02995598316192627, + "step": 147530 + }, + { + "epoch": 20.942512420156138, + "grad_norm": 4.2246413230896, + "learning_rate": 7.906643009226403e-05, + "loss": 0.034627553820610044, + "step": 147540 + }, + { + "epoch": 20.943931866572036, + "grad_norm": 0.25856253504753113, + "learning_rate": 7.906501064584812e-05, + "loss": 0.00449640341103077, + "step": 147550 + }, + { + "epoch": 20.945351312987935, + "grad_norm": 4.605138778686523, + "learning_rate": 7.906359119943224e-05, + "loss": 0.013264468312263489, + "step": 147560 + }, + { + "epoch": 20.946770759403833, + "grad_norm": 14.829985618591309, + "learning_rate": 7.906217175301632e-05, + "loss": 0.03572210669517517, + "step": 147570 + }, + { + "epoch": 20.94819020581973, + "grad_norm": 0.2998245656490326, + "learning_rate": 7.906075230660043e-05, + "loss": 0.002378993108868599, + "step": 147580 + }, + { + "epoch": 20.94960965223563, + "grad_norm": 0.9222350120544434, + "learning_rate": 7.905933286018453e-05, + "loss": 0.01790400892496109, + "step": 147590 + }, + { + "epoch": 20.951029098651524, + "grad_norm": 9.298759460449219, + "learning_rate": 7.905791341376864e-05, + "loss": 0.031370556354522704, + "step": 147600 + }, + { + "epoch": 20.952448545067423, + "grad_norm": 0.08165039867162704, + "learning_rate": 7.905649396735274e-05, + "loss": 0.05433051586151123, + "step": 147610 + }, + { + "epoch": 20.95386799148332, + "grad_norm": 0.053742699325084686, + "learning_rate": 7.905507452093683e-05, + "loss": 0.017582088708877563, + "step": 147620 + }, + { + "epoch": 20.95528743789922, + "grad_norm": 2.7409090995788574, + "learning_rate": 7.905365507452094e-05, + "loss": 0.02233976274728775, + "step": 147630 + }, + { + "epoch": 20.956706884315118, + "grad_norm": 6.455620765686035, + "learning_rate": 7.905223562810504e-05, + "loss": 0.0433304637670517, + "step": 147640 + }, + { + "epoch": 20.958126330731016, + "grad_norm": 0.05922337621450424, + "learning_rate": 7.905081618168915e-05, + "loss": 0.04422985017299652, 
+ "step": 147650 + }, + { + "epoch": 20.959545777146914, + "grad_norm": 1.0155706405639648, + "learning_rate": 7.904939673527325e-05, + "loss": 0.020816732943058015, + "step": 147660 + }, + { + "epoch": 20.96096522356281, + "grad_norm": 2.480271577835083, + "learning_rate": 7.904797728885735e-05, + "loss": 0.016534870862960814, + "step": 147670 + }, + { + "epoch": 20.962384669978707, + "grad_norm": 4.882796764373779, + "learning_rate": 7.904655784244144e-05, + "loss": 0.013173118233680725, + "step": 147680 + }, + { + "epoch": 20.963804116394606, + "grad_norm": 0.062019314616918564, + "learning_rate": 7.904513839602556e-05, + "loss": 0.00510212555527687, + "step": 147690 + }, + { + "epoch": 20.965223562810504, + "grad_norm": 0.006183877121657133, + "learning_rate": 7.904371894960965e-05, + "loss": 0.006848765164613723, + "step": 147700 + }, + { + "epoch": 20.966643009226402, + "grad_norm": 0.11883995682001114, + "learning_rate": 7.904229950319376e-05, + "loss": 0.015217235684394837, + "step": 147710 + }, + { + "epoch": 20.9680624556423, + "grad_norm": 0.04546439275145531, + "learning_rate": 7.904088005677786e-05, + "loss": 0.009336093068122863, + "step": 147720 + }, + { + "epoch": 20.9694819020582, + "grad_norm": 0.21021588146686554, + "learning_rate": 7.903946061036196e-05, + "loss": 0.010315261781215668, + "step": 147730 + }, + { + "epoch": 20.970901348474094, + "grad_norm": 0.004868995398283005, + "learning_rate": 7.903804116394607e-05, + "loss": 0.0026836566627025603, + "step": 147740 + }, + { + "epoch": 20.972320794889992, + "grad_norm": 0.28464367985725403, + "learning_rate": 7.903662171753017e-05, + "loss": 0.012949483096599579, + "step": 147750 + }, + { + "epoch": 20.97374024130589, + "grad_norm": 1.2863636016845703, + "learning_rate": 7.903520227111428e-05, + "loss": 0.07239128351211548, + "step": 147760 + }, + { + "epoch": 20.97515968772179, + "grad_norm": 10.511181831359863, + "learning_rate": 7.903378282469838e-05, + "loss": 0.025117868185043336, + 
"step": 147770 + }, + { + "epoch": 20.976579134137687, + "grad_norm": 1.9466150999069214, + "learning_rate": 7.903236337828247e-05, + "loss": 0.03874390423297882, + "step": 147780 + }, + { + "epoch": 20.977998580553585, + "grad_norm": 0.046162527054548264, + "learning_rate": 7.903094393186657e-05, + "loss": 0.01782917380332947, + "step": 147790 + }, + { + "epoch": 20.979418026969483, + "grad_norm": 1.2228208780288696, + "learning_rate": 7.902952448545068e-05, + "loss": 0.0012259628623723985, + "step": 147800 + }, + { + "epoch": 20.980837473385378, + "grad_norm": 0.046159371733665466, + "learning_rate": 7.902810503903478e-05, + "loss": 0.017555412650108338, + "step": 147810 + }, + { + "epoch": 20.982256919801276, + "grad_norm": 1.7239919900894165, + "learning_rate": 7.902668559261889e-05, + "loss": 0.03226572871208191, + "step": 147820 + }, + { + "epoch": 20.983676366217175, + "grad_norm": 0.0749114602804184, + "learning_rate": 7.902526614620299e-05, + "loss": 0.015035668015480041, + "step": 147830 + }, + { + "epoch": 20.985095812633073, + "grad_norm": 0.026181036606431007, + "learning_rate": 7.902384669978708e-05, + "loss": 0.036445245146751404, + "step": 147840 + }, + { + "epoch": 20.98651525904897, + "grad_norm": 0.051933784037828445, + "learning_rate": 7.90224272533712e-05, + "loss": 0.02933332324028015, + "step": 147850 + }, + { + "epoch": 20.98793470546487, + "grad_norm": 0.31368857622146606, + "learning_rate": 7.902100780695529e-05, + "loss": 0.006153375655412674, + "step": 147860 + }, + { + "epoch": 20.989354151880768, + "grad_norm": 0.5830459594726562, + "learning_rate": 7.90195883605394e-05, + "loss": 0.03201732337474823, + "step": 147870 + }, + { + "epoch": 20.990773598296663, + "grad_norm": 0.06816734373569489, + "learning_rate": 7.901816891412349e-05, + "loss": 0.04411167204380036, + "step": 147880 + }, + { + "epoch": 20.99219304471256, + "grad_norm": 0.040966372936964035, + "learning_rate": 7.90167494677076e-05, + "loss": 0.021518656611442567, + 
"step": 147890 + }, + { + "epoch": 20.99361249112846, + "grad_norm": 0.6190553307533264, + "learning_rate": 7.90153300212917e-05, + "loss": 0.04294320344924927, + "step": 147900 + }, + { + "epoch": 20.995031937544358, + "grad_norm": 2.001371145248413, + "learning_rate": 7.90139105748758e-05, + "loss": 0.03126533329486847, + "step": 147910 + }, + { + "epoch": 20.996451383960256, + "grad_norm": 0.9343059659004211, + "learning_rate": 7.90124911284599e-05, + "loss": 0.010781645774841309, + "step": 147920 + }, + { + "epoch": 20.997870830376154, + "grad_norm": 0.01081905048340559, + "learning_rate": 7.9011071682044e-05, + "loss": 0.0056711096316576, + "step": 147930 + }, + { + "epoch": 20.999290276792053, + "grad_norm": 12.201112747192383, + "learning_rate": 7.900965223562811e-05, + "loss": 0.014659737050533295, + "step": 147940 + }, + { + "epoch": 21.000709723207947, + "grad_norm": 0.039594896137714386, + "learning_rate": 7.900823278921221e-05, + "loss": 0.03759959638118744, + "step": 147950 + }, + { + "epoch": 21.002129169623846, + "grad_norm": 0.10926660895347595, + "learning_rate": 7.900681334279632e-05, + "loss": 0.032229763269424436, + "step": 147960 + }, + { + "epoch": 21.003548616039744, + "grad_norm": 0.014060246758162975, + "learning_rate": 7.900539389638042e-05, + "loss": 0.009088961035013199, + "step": 147970 + }, + { + "epoch": 21.004968062455642, + "grad_norm": 9.610164642333984, + "learning_rate": 7.900397444996452e-05, + "loss": 0.020833241939544677, + "step": 147980 + }, + { + "epoch": 21.00638750887154, + "grad_norm": 0.2901909351348877, + "learning_rate": 7.900255500354861e-05, + "loss": 0.013191723823547363, + "step": 147990 + }, + { + "epoch": 21.00780695528744, + "grad_norm": 7.599514961242676, + "learning_rate": 7.900113555713272e-05, + "loss": 0.02051192969083786, + "step": 148000 + }, + { + "epoch": 21.00780695528744, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.05769599974155426, + "eval_runtime": 34.1016, + "eval_samples_per_second": 
461.18, + "eval_steps_per_second": 14.427, + "step": 148000 + }, + { + "epoch": 21.009226401703337, + "grad_norm": 0.030993621796369553, + "learning_rate": 7.899971611071682e-05, + "loss": 0.015471708774566651, + "step": 148010 + }, + { + "epoch": 21.010645848119232, + "grad_norm": 0.24977083504199982, + "learning_rate": 7.899829666430093e-05, + "loss": 0.005109232664108276, + "step": 148020 + }, + { + "epoch": 21.01206529453513, + "grad_norm": 0.01610984094440937, + "learning_rate": 7.899687721788503e-05, + "loss": 0.015183395147323609, + "step": 148030 + }, + { + "epoch": 21.01348474095103, + "grad_norm": 0.02092795819044113, + "learning_rate": 7.899545777146913e-05, + "loss": 0.0008370682597160339, + "step": 148040 + }, + { + "epoch": 21.014904187366927, + "grad_norm": 0.11478916555643082, + "learning_rate": 7.899403832505324e-05, + "loss": 0.03654801547527313, + "step": 148050 + }, + { + "epoch": 21.016323633782825, + "grad_norm": 0.007695644628256559, + "learning_rate": 7.899261887863733e-05, + "loss": 0.006345228850841522, + "step": 148060 + }, + { + "epoch": 21.017743080198724, + "grad_norm": 0.05666486918926239, + "learning_rate": 7.899119943222145e-05, + "loss": 0.005860751867294312, + "step": 148070 + }, + { + "epoch": 21.019162526614622, + "grad_norm": 1.7690719366073608, + "learning_rate": 7.898977998580553e-05, + "loss": 0.03130360245704651, + "step": 148080 + }, + { + "epoch": 21.020581973030517, + "grad_norm": 0.11971930414438248, + "learning_rate": 7.898836053938964e-05, + "loss": 0.0030382439494132996, + "step": 148090 + }, + { + "epoch": 21.022001419446415, + "grad_norm": 0.17994071543216705, + "learning_rate": 7.898694109297374e-05, + "loss": 0.033177369832992555, + "step": 148100 + }, + { + "epoch": 21.023420865862313, + "grad_norm": 0.09168960154056549, + "learning_rate": 7.898552164655785e-05, + "loss": 0.006556940078735351, + "step": 148110 + }, + { + "epoch": 21.02484031227821, + "grad_norm": 3.8017678260803223, + "learning_rate": 
7.898410220014195e-05, + "loss": 0.003216473013162613, + "step": 148120 + }, + { + "epoch": 21.02625975869411, + "grad_norm": 0.021076340228319168, + "learning_rate": 7.898268275372606e-05, + "loss": 0.022803887724876404, + "step": 148130 + }, + { + "epoch": 21.027679205110008, + "grad_norm": 3.38307785987854, + "learning_rate": 7.898126330731015e-05, + "loss": 0.013693174719810486, + "step": 148140 + }, + { + "epoch": 21.029098651525906, + "grad_norm": 0.21132506430149078, + "learning_rate": 7.897984386089425e-05, + "loss": 0.006197235733270645, + "step": 148150 + }, + { + "epoch": 21.0305180979418, + "grad_norm": 14.318134307861328, + "learning_rate": 7.897842441447836e-05, + "loss": 0.029898801445961, + "step": 148160 + }, + { + "epoch": 21.0319375443577, + "grad_norm": 0.02023470774292946, + "learning_rate": 7.897700496806246e-05, + "loss": 0.0028037030249834062, + "step": 148170 + }, + { + "epoch": 21.033356990773598, + "grad_norm": 0.1024431511759758, + "learning_rate": 7.897558552164657e-05, + "loss": 0.01755046546459198, + "step": 148180 + }, + { + "epoch": 21.034776437189496, + "grad_norm": 0.012744315899908543, + "learning_rate": 7.897416607523065e-05, + "loss": 0.0270268052816391, + "step": 148190 + }, + { + "epoch": 21.036195883605394, + "grad_norm": 0.2124946266412735, + "learning_rate": 7.897274662881477e-05, + "loss": 0.00761546641588211, + "step": 148200 + }, + { + "epoch": 21.037615330021293, + "grad_norm": 0.8210309743881226, + "learning_rate": 7.897132718239886e-05, + "loss": 0.023723949491977692, + "step": 148210 + }, + { + "epoch": 21.03903477643719, + "grad_norm": 0.03501635789871216, + "learning_rate": 7.896990773598297e-05, + "loss": 0.002953563630580902, + "step": 148220 + }, + { + "epoch": 21.040454222853086, + "grad_norm": 4.635706901550293, + "learning_rate": 7.896848828956707e-05, + "loss": 0.0172238752245903, + "step": 148230 + }, + { + "epoch": 21.041873669268984, + "grad_norm": 9.512527465820312, + "learning_rate": 
7.896706884315117e-05, + "loss": 0.011320335417985916, + "step": 148240 + }, + { + "epoch": 21.043293115684882, + "grad_norm": 0.4590822458267212, + "learning_rate": 7.896564939673528e-05, + "loss": 0.03066089451313019, + "step": 148250 + }, + { + "epoch": 21.04471256210078, + "grad_norm": 4.4357500076293945, + "learning_rate": 7.896422995031938e-05, + "loss": 0.03336559534072876, + "step": 148260 + }, + { + "epoch": 21.04613200851668, + "grad_norm": 0.4467734396457672, + "learning_rate": 7.896281050390349e-05, + "loss": 0.07908724546432495, + "step": 148270 + }, + { + "epoch": 21.047551454932577, + "grad_norm": 0.4528430104255676, + "learning_rate": 7.896139105748759e-05, + "loss": 0.020352552831172942, + "step": 148280 + }, + { + "epoch": 21.048970901348476, + "grad_norm": 0.019472533836960793, + "learning_rate": 7.895997161107168e-05, + "loss": 0.014004814624786376, + "step": 148290 + }, + { + "epoch": 21.05039034776437, + "grad_norm": 0.07784029096364975, + "learning_rate": 7.895855216465578e-05, + "loss": 0.020544235408306123, + "step": 148300 + }, + { + "epoch": 21.05180979418027, + "grad_norm": 13.455442428588867, + "learning_rate": 7.895713271823989e-05, + "loss": 0.05464913845062256, + "step": 148310 + }, + { + "epoch": 21.053229240596167, + "grad_norm": 0.027525078505277634, + "learning_rate": 7.895571327182399e-05, + "loss": 0.0052706852555274965, + "step": 148320 + }, + { + "epoch": 21.054648687012065, + "grad_norm": 4.2665252685546875, + "learning_rate": 7.89542938254081e-05, + "loss": 0.014684043824672699, + "step": 148330 + }, + { + "epoch": 21.056068133427964, + "grad_norm": 0.350553035736084, + "learning_rate": 7.89528743789922e-05, + "loss": 0.002646828070282936, + "step": 148340 + }, + { + "epoch": 21.057487579843862, + "grad_norm": 0.02084886096417904, + "learning_rate": 7.89514549325763e-05, + "loss": 0.01621060222387314, + "step": 148350 + }, + { + "epoch": 21.05890702625976, + "grad_norm": 0.3889669179916382, + "learning_rate": 
7.89500354861604e-05, + "loss": 0.006524135917425155, + "step": 148360 + }, + { + "epoch": 21.060326472675655, + "grad_norm": 5.557522296905518, + "learning_rate": 7.89486160397445e-05, + "loss": 0.011207732558250427, + "step": 148370 + }, + { + "epoch": 21.061745919091553, + "grad_norm": 0.02720538340508938, + "learning_rate": 7.894719659332861e-05, + "loss": 0.009898757189512252, + "step": 148380 + }, + { + "epoch": 21.06316536550745, + "grad_norm": 0.6669163703918457, + "learning_rate": 7.89457771469127e-05, + "loss": 0.0027324333786964417, + "step": 148390 + }, + { + "epoch": 21.06458481192335, + "grad_norm": 5.931302547454834, + "learning_rate": 7.894435770049681e-05, + "loss": 0.026851081848144533, + "step": 148400 + }, + { + "epoch": 21.066004258339248, + "grad_norm": 0.9109027981758118, + "learning_rate": 7.89429382540809e-05, + "loss": 0.049683880805969236, + "step": 148410 + }, + { + "epoch": 21.067423704755146, + "grad_norm": 0.2809244692325592, + "learning_rate": 7.894151880766502e-05, + "loss": 0.01648867130279541, + "step": 148420 + }, + { + "epoch": 21.068843151171045, + "grad_norm": 0.09820913523435593, + "learning_rate": 7.894009936124911e-05, + "loss": 0.008877874910831451, + "step": 148430 + }, + { + "epoch": 21.07026259758694, + "grad_norm": 0.11730563640594482, + "learning_rate": 7.893867991483321e-05, + "loss": 0.010576790571212769, + "step": 148440 + }, + { + "epoch": 21.071682044002838, + "grad_norm": 0.02079988457262516, + "learning_rate": 7.893726046841732e-05, + "loss": 0.012141837924718856, + "step": 148450 + }, + { + "epoch": 21.073101490418736, + "grad_norm": 0.641864538192749, + "learning_rate": 7.893584102200142e-05, + "loss": 0.007063349336385727, + "step": 148460 + }, + { + "epoch": 21.074520936834634, + "grad_norm": 0.0663255825638771, + "learning_rate": 7.893442157558553e-05, + "loss": 0.013601230084896087, + "step": 148470 + }, + { + "epoch": 21.075940383250533, + "grad_norm": 0.2728342115879059, + "learning_rate": 
7.893300212916963e-05, + "loss": 0.002510496228933334, + "step": 148480 + }, + { + "epoch": 21.07735982966643, + "grad_norm": 0.0070982640609145164, + "learning_rate": 7.893158268275374e-05, + "loss": 0.049581718444824216, + "step": 148490 + }, + { + "epoch": 21.07877927608233, + "grad_norm": 0.045075900852680206, + "learning_rate": 7.893016323633782e-05, + "loss": 0.032151824235916136, + "step": 148500 + }, + { + "epoch": 21.07877927608233, + "eval_accuracy": 0.9852482991034527, + "eval_loss": 0.054044269025325775, + "eval_runtime": 32.9291, + "eval_samples_per_second": 477.601, + "eval_steps_per_second": 14.941, + "step": 148500 + }, + { + "epoch": 21.080198722498224, + "grad_norm": 1.230156421661377, + "learning_rate": 7.892874378992193e-05, + "loss": 0.02540125250816345, + "step": 148510 + }, + { + "epoch": 21.081618168914122, + "grad_norm": 0.08572544902563095, + "learning_rate": 7.892732434350603e-05, + "loss": 0.025651815533638, + "step": 148520 + }, + { + "epoch": 21.08303761533002, + "grad_norm": 0.03452804684638977, + "learning_rate": 7.892590489709014e-05, + "loss": 0.017447566986083983, + "step": 148530 + }, + { + "epoch": 21.08445706174592, + "grad_norm": 2.0657129287719727, + "learning_rate": 7.892448545067425e-05, + "loss": 0.016311675310134888, + "step": 148540 + }, + { + "epoch": 21.085876508161817, + "grad_norm": 6.853950023651123, + "learning_rate": 7.892306600425834e-05, + "loss": 0.02916599214076996, + "step": 148550 + }, + { + "epoch": 21.087295954577716, + "grad_norm": 0.3733602464199066, + "learning_rate": 7.892164655784245e-05, + "loss": 0.01675369292497635, + "step": 148560 + }, + { + "epoch": 21.088715400993614, + "grad_norm": 5.816450119018555, + "learning_rate": 7.892022711142654e-05, + "loss": 0.01607876121997833, + "step": 148570 + }, + { + "epoch": 21.09013484740951, + "grad_norm": 1.2631343603134155, + "learning_rate": 7.891880766501066e-05, + "loss": 0.017340224981307984, + "step": 148580 + }, + { + "epoch": 21.091554293825407, + 
"grad_norm": 1.5140000581741333, + "learning_rate": 7.891738821859475e-05, + "loss": 0.048081427812576294, + "step": 148590 + }, + { + "epoch": 21.092973740241305, + "grad_norm": 0.04193044453859329, + "learning_rate": 7.891596877217885e-05, + "loss": 0.031192976236343383, + "step": 148600 + }, + { + "epoch": 21.094393186657204, + "grad_norm": 0.01965693198144436, + "learning_rate": 7.891454932576295e-05, + "loss": 0.018894796073436738, + "step": 148610 + }, + { + "epoch": 21.095812633073102, + "grad_norm": 0.15369942784309387, + "learning_rate": 7.891312987934706e-05, + "loss": 0.01120772510766983, + "step": 148620 + }, + { + "epoch": 21.097232079489, + "grad_norm": 0.09235134720802307, + "learning_rate": 7.891171043293117e-05, + "loss": 0.014111068844795228, + "step": 148630 + }, + { + "epoch": 21.0986515259049, + "grad_norm": 0.276559978723526, + "learning_rate": 7.891029098651527e-05, + "loss": 0.022494851052761076, + "step": 148640 + }, + { + "epoch": 21.100070972320793, + "grad_norm": 0.22418466210365295, + "learning_rate": 7.890887154009936e-05, + "loss": 0.013198715448379517, + "step": 148650 + }, + { + "epoch": 21.10149041873669, + "grad_norm": 0.5788456797599792, + "learning_rate": 7.890745209368346e-05, + "loss": 0.028161463141441346, + "step": 148660 + }, + { + "epoch": 21.10290986515259, + "grad_norm": 0.059804439544677734, + "learning_rate": 7.890603264726757e-05, + "loss": 0.017698991298675536, + "step": 148670 + }, + { + "epoch": 21.10432931156849, + "grad_norm": 13.677862167358398, + "learning_rate": 7.890461320085167e-05, + "loss": 0.04842973947525024, + "step": 148680 + }, + { + "epoch": 21.105748757984387, + "grad_norm": 0.19037757813930511, + "learning_rate": 7.890319375443578e-05, + "loss": 0.014427109062671662, + "step": 148690 + }, + { + "epoch": 21.107168204400285, + "grad_norm": 0.04324449226260185, + "learning_rate": 7.890177430801986e-05, + "loss": 0.013991636037826539, + "step": 148700 + }, + { + "epoch": 21.108587650816183, + 
"grad_norm": 0.027191873639822006, + "learning_rate": 7.890035486160398e-05, + "loss": 0.016370829939842225, + "step": 148710 + }, + { + "epoch": 21.110007097232078, + "grad_norm": 0.07318593561649323, + "learning_rate": 7.889893541518809e-05, + "loss": 0.0023487545549869537, + "step": 148720 + }, + { + "epoch": 21.111426543647976, + "grad_norm": 3.9949121475219727, + "learning_rate": 7.889751596877218e-05, + "loss": 0.011983869969844818, + "step": 148730 + }, + { + "epoch": 21.112845990063875, + "grad_norm": 7.160031318664551, + "learning_rate": 7.88960965223563e-05, + "loss": 0.012339068204164505, + "step": 148740 + }, + { + "epoch": 21.114265436479773, + "grad_norm": 0.01821727305650711, + "learning_rate": 7.889467707594038e-05, + "loss": 0.006593748927116394, + "step": 148750 + }, + { + "epoch": 21.11568488289567, + "grad_norm": 0.09724514931440353, + "learning_rate": 7.889325762952449e-05, + "loss": 0.0271767258644104, + "step": 148760 + }, + { + "epoch": 21.11710432931157, + "grad_norm": 0.006654900498688221, + "learning_rate": 7.889183818310859e-05, + "loss": 0.0282509446144104, + "step": 148770 + }, + { + "epoch": 21.118523775727468, + "grad_norm": 0.009552140720188618, + "learning_rate": 7.88904187366927e-05, + "loss": 0.03756798505783081, + "step": 148780 + }, + { + "epoch": 21.119943222143363, + "grad_norm": 0.024645091965794563, + "learning_rate": 7.88889992902768e-05, + "loss": 0.012369501590728759, + "step": 148790 + }, + { + "epoch": 21.12136266855926, + "grad_norm": 0.06392497569322586, + "learning_rate": 7.888757984386089e-05, + "loss": 0.00218241848051548, + "step": 148800 + }, + { + "epoch": 21.12278211497516, + "grad_norm": 0.5799600481987, + "learning_rate": 7.888616039744499e-05, + "loss": 0.009430690109729767, + "step": 148810 + }, + { + "epoch": 21.124201561391057, + "grad_norm": 0.06345344334840775, + "learning_rate": 7.88847409510291e-05, + "loss": 0.03182608783245087, + "step": 148820 + }, + { + "epoch": 21.125621007806956, + "grad_norm": 
0.13898225128650665, + "learning_rate": 7.888332150461321e-05, + "loss": 0.05677466988563538, + "step": 148830 + }, + { + "epoch": 21.127040454222854, + "grad_norm": 0.10186636447906494, + "learning_rate": 7.888190205819731e-05, + "loss": 0.023115485906600952, + "step": 148840 + }, + { + "epoch": 21.128459900638752, + "grad_norm": 0.041122883558273315, + "learning_rate": 7.888048261178142e-05, + "loss": 0.00964810773730278, + "step": 148850 + }, + { + "epoch": 21.129879347054647, + "grad_norm": 3.91687273979187, + "learning_rate": 7.88790631653655e-05, + "loss": 0.01825466752052307, + "step": 148860 + }, + { + "epoch": 21.131298793470545, + "grad_norm": 0.070044606924057, + "learning_rate": 7.887764371894962e-05, + "loss": 0.005874689295887947, + "step": 148870 + }, + { + "epoch": 21.132718239886444, + "grad_norm": 0.8859872817993164, + "learning_rate": 7.887622427253371e-05, + "loss": 0.0074994295835495, + "step": 148880 + }, + { + "epoch": 21.134137686302342, + "grad_norm": 2.5177249908447266, + "learning_rate": 7.887480482611782e-05, + "loss": 0.011135026812553406, + "step": 148890 + }, + { + "epoch": 21.13555713271824, + "grad_norm": 14.554292678833008, + "learning_rate": 7.887338537970192e-05, + "loss": 0.031386905908584596, + "step": 148900 + }, + { + "epoch": 21.13697657913414, + "grad_norm": 17.042827606201172, + "learning_rate": 7.887196593328602e-05, + "loss": 0.03421303629875183, + "step": 148910 + }, + { + "epoch": 21.138396025550037, + "grad_norm": 0.012734088115394115, + "learning_rate": 7.887054648687013e-05, + "loss": 0.02591225504875183, + "step": 148920 + }, + { + "epoch": 21.13981547196593, + "grad_norm": 15.798944473266602, + "learning_rate": 7.886912704045423e-05, + "loss": 0.06235888600349426, + "step": 148930 + }, + { + "epoch": 21.14123491838183, + "grad_norm": 5.152287483215332, + "learning_rate": 7.886770759403834e-05, + "loss": 0.004116514325141906, + "step": 148940 + }, + { + "epoch": 21.14265436479773, + "grad_norm": 1.1158864498138428, 
+ "learning_rate": 7.886628814762243e-05, + "loss": 0.0065805584192276, + "step": 148950 + }, + { + "epoch": 21.144073811213627, + "grad_norm": 0.14404965937137604, + "learning_rate": 7.886486870120653e-05, + "loss": 0.00519159696996212, + "step": 148960 + }, + { + "epoch": 21.145493257629525, + "grad_norm": 0.12660571932792664, + "learning_rate": 7.886344925479063e-05, + "loss": 0.019508817791938783, + "step": 148970 + }, + { + "epoch": 21.146912704045423, + "grad_norm": 0.06188570335507393, + "learning_rate": 7.886202980837474e-05, + "loss": 0.01638663709163666, + "step": 148980 + }, + { + "epoch": 21.14833215046132, + "grad_norm": 6.220306873321533, + "learning_rate": 7.886061036195884e-05, + "loss": 0.026396429538726805, + "step": 148990 + }, + { + "epoch": 21.149751596877216, + "grad_norm": 0.21326115727424622, + "learning_rate": 7.885919091554295e-05, + "loss": 0.0054646778851747515, + "step": 149000 + }, + { + "epoch": 21.149751596877216, + "eval_accuracy": 0.9843581102562472, + "eval_loss": 0.05596425384283066, + "eval_runtime": 33.9592, + "eval_samples_per_second": 463.115, + "eval_steps_per_second": 14.488, + "step": 149000 + }, + { + "epoch": 21.151171043293115, + "grad_norm": 0.10017932951450348, + "learning_rate": 7.885777146912705e-05, + "loss": 0.003989888355135918, + "step": 149010 + }, + { + "epoch": 21.152590489709013, + "grad_norm": 0.16859035193920135, + "learning_rate": 7.885635202271114e-05, + "loss": 0.012894025444984436, + "step": 149020 + }, + { + "epoch": 21.15400993612491, + "grad_norm": 5.904428005218506, + "learning_rate": 7.885493257629525e-05, + "loss": 0.013205081224441528, + "step": 149030 + }, + { + "epoch": 21.15542938254081, + "grad_norm": 11.679054260253906, + "learning_rate": 7.885351312987935e-05, + "loss": 0.01643451452255249, + "step": 149040 + }, + { + "epoch": 21.156848828956708, + "grad_norm": 0.16656693816184998, + "learning_rate": 7.885209368346346e-05, + "loss": 0.02144826650619507, + "step": 149050 + }, + { + "epoch": 
21.158268275372606, + "grad_norm": 0.16254442930221558, + "learning_rate": 7.885067423704755e-05, + "loss": 0.003293904289603233, + "step": 149060 + }, + { + "epoch": 21.1596877217885, + "grad_norm": 2.4509053230285645, + "learning_rate": 7.884925479063166e-05, + "loss": 0.034020134806633, + "step": 149070 + }, + { + "epoch": 21.1611071682044, + "grad_norm": 1.207975149154663, + "learning_rate": 7.884783534421575e-05, + "loss": 0.009652296453714371, + "step": 149080 + }, + { + "epoch": 21.162526614620297, + "grad_norm": 0.09561438858509064, + "learning_rate": 7.884641589779987e-05, + "loss": 0.018088242411613463, + "step": 149090 + }, + { + "epoch": 21.163946061036196, + "grad_norm": 0.022872552275657654, + "learning_rate": 7.884499645138396e-05, + "loss": 0.01549045592546463, + "step": 149100 + }, + { + "epoch": 21.165365507452094, + "grad_norm": 1.28530752658844, + "learning_rate": 7.884357700496806e-05, + "loss": 0.03410793542861938, + "step": 149110 + }, + { + "epoch": 21.166784953867992, + "grad_norm": 1.5677680969238281, + "learning_rate": 7.884215755855217e-05, + "loss": 0.023614349961280822, + "step": 149120 + }, + { + "epoch": 21.16820440028389, + "grad_norm": 0.2772119641304016, + "learning_rate": 7.884073811213627e-05, + "loss": 0.007127901911735535, + "step": 149130 + }, + { + "epoch": 21.169623846699785, + "grad_norm": 8.086417198181152, + "learning_rate": 7.883931866572038e-05, + "loss": 0.026209649443626405, + "step": 149140 + }, + { + "epoch": 21.171043293115684, + "grad_norm": 0.0157693549990654, + "learning_rate": 7.883789921930448e-05, + "loss": 0.01039692834019661, + "step": 149150 + }, + { + "epoch": 21.172462739531582, + "grad_norm": 0.3234774172306061, + "learning_rate": 7.883647977288857e-05, + "loss": 0.0021013259887695313, + "step": 149160 + }, + { + "epoch": 21.17388218594748, + "grad_norm": 0.044209253042936325, + "learning_rate": 7.883506032647267e-05, + "loss": 0.03961020112037659, + "step": 149170 + }, + { + "epoch": 
21.17530163236338, + "grad_norm": 0.4048836827278137, + "learning_rate": 7.883364088005678e-05, + "loss": 0.029474282264709474, + "step": 149180 + }, + { + "epoch": 21.176721078779277, + "grad_norm": 0.05802635848522186, + "learning_rate": 7.883222143364088e-05, + "loss": 0.014293667674064637, + "step": 149190 + }, + { + "epoch": 21.178140525195175, + "grad_norm": 19.978744506835938, + "learning_rate": 7.883080198722499e-05, + "loss": 0.03169751763343811, + "step": 149200 + }, + { + "epoch": 21.17955997161107, + "grad_norm": 0.16670829057693481, + "learning_rate": 7.882938254080909e-05, + "loss": 0.011525410413742065, + "step": 149210 + }, + { + "epoch": 21.18097941802697, + "grad_norm": 2.011305093765259, + "learning_rate": 7.882796309439319e-05, + "loss": 0.011454127728939056, + "step": 149220 + }, + { + "epoch": 21.182398864442867, + "grad_norm": 7.367702484130859, + "learning_rate": 7.88265436479773e-05, + "loss": 0.00945751965045929, + "step": 149230 + }, + { + "epoch": 21.183818310858765, + "grad_norm": 0.022088466212153435, + "learning_rate": 7.88251242015614e-05, + "loss": 0.03147422671318054, + "step": 149240 + }, + { + "epoch": 21.185237757274663, + "grad_norm": 0.2937646806240082, + "learning_rate": 7.88237047551455e-05, + "loss": 0.0033141195774078367, + "step": 149250 + }, + { + "epoch": 21.18665720369056, + "grad_norm": 0.01177141722291708, + "learning_rate": 7.88222853087296e-05, + "loss": 0.005521759763360024, + "step": 149260 + }, + { + "epoch": 21.18807665010646, + "grad_norm": 10.552308082580566, + "learning_rate": 7.88208658623137e-05, + "loss": 0.028582280874252318, + "step": 149270 + }, + { + "epoch": 21.189496096522355, + "grad_norm": 0.10572256147861481, + "learning_rate": 7.88194464158978e-05, + "loss": 0.013605988025665284, + "step": 149280 + }, + { + "epoch": 21.190915542938253, + "grad_norm": 0.023849427700042725, + "learning_rate": 7.881802696948191e-05, + "loss": 0.02131710946559906, + "step": 149290 + }, + { + "epoch": 
21.19233498935415, + "grad_norm": 5.187261581420898, + "learning_rate": 7.8816607523066e-05, + "loss": 0.009778132289648056, + "step": 149300 + }, + { + "epoch": 21.19375443577005, + "grad_norm": 0.041895847767591476, + "learning_rate": 7.881518807665012e-05, + "loss": 0.040613090991973876, + "step": 149310 + }, + { + "epoch": 21.195173882185948, + "grad_norm": 0.23249617218971252, + "learning_rate": 7.881376863023421e-05, + "loss": 0.020703494548797607, + "step": 149320 + }, + { + "epoch": 21.196593328601846, + "grad_norm": 4.627407073974609, + "learning_rate": 7.881234918381831e-05, + "loss": 0.0025595139712095262, + "step": 149330 + }, + { + "epoch": 21.198012775017745, + "grad_norm": 0.047897692769765854, + "learning_rate": 7.881092973740242e-05, + "loss": 0.002219580113887787, + "step": 149340 + }, + { + "epoch": 21.19943222143364, + "grad_norm": 0.42066144943237305, + "learning_rate": 7.880951029098652e-05, + "loss": 0.010955195128917693, + "step": 149350 + }, + { + "epoch": 21.200851667849538, + "grad_norm": 0.02329222857952118, + "learning_rate": 7.880809084457063e-05, + "loss": 0.008941583335399628, + "step": 149360 + }, + { + "epoch": 21.202271114265436, + "grad_norm": 3.5009148120880127, + "learning_rate": 7.880667139815471e-05, + "loss": 0.0018908549100160599, + "step": 149370 + }, + { + "epoch": 21.203690560681334, + "grad_norm": 9.166903495788574, + "learning_rate": 7.880525195173883e-05, + "loss": 0.02328534722328186, + "step": 149380 + }, + { + "epoch": 21.205110007097232, + "grad_norm": 0.15870539844036102, + "learning_rate": 7.880383250532292e-05, + "loss": 0.02850722074508667, + "step": 149390 + }, + { + "epoch": 21.20652945351313, + "grad_norm": 0.11906500160694122, + "learning_rate": 7.880241305890703e-05, + "loss": 0.003218761831521988, + "step": 149400 + }, + { + "epoch": 21.20794889992903, + "grad_norm": 0.16724181175231934, + "learning_rate": 7.880099361249113e-05, + "loss": 0.00875551700592041, + "step": 149410 + }, + { + "epoch": 
21.209368346344924, + "grad_norm": 0.34400519728660583, + "learning_rate": 7.879957416607523e-05, + "loss": 0.003290281072258949, + "step": 149420 + }, + { + "epoch": 21.210787792760822, + "grad_norm": 0.4774434566497803, + "learning_rate": 7.879815471965934e-05, + "loss": 0.021557924151420594, + "step": 149430 + }, + { + "epoch": 21.21220723917672, + "grad_norm": 10.315865516662598, + "learning_rate": 7.879673527324344e-05, + "loss": 0.11811084747314453, + "step": 149440 + }, + { + "epoch": 21.21362668559262, + "grad_norm": 0.0030532616656273603, + "learning_rate": 7.879531582682755e-05, + "loss": 0.02968076169490814, + "step": 149450 + }, + { + "epoch": 21.215046132008517, + "grad_norm": 0.045845694839954376, + "learning_rate": 7.879389638041164e-05, + "loss": 0.0042793162167072294, + "step": 149460 + }, + { + "epoch": 21.216465578424415, + "grad_norm": 6.407303333282471, + "learning_rate": 7.879247693399574e-05, + "loss": 0.023627130687236785, + "step": 149470 + }, + { + "epoch": 21.217885024840314, + "grad_norm": 1.6257554292678833, + "learning_rate": 7.879105748757984e-05, + "loss": 0.019711318612098693, + "step": 149480 + }, + { + "epoch": 21.21930447125621, + "grad_norm": 15.701323509216309, + "learning_rate": 7.878963804116395e-05, + "loss": 0.028219512104988097, + "step": 149490 + }, + { + "epoch": 21.220723917672107, + "grad_norm": 0.804357647895813, + "learning_rate": 7.878821859474805e-05, + "loss": 0.015253381431102752, + "step": 149500 + }, + { + "epoch": 21.220723917672107, + "eval_accuracy": 0.9830228269854391, + "eval_loss": 0.06609699130058289, + "eval_runtime": 32.7473, + "eval_samples_per_second": 480.253, + "eval_steps_per_second": 15.024, + "step": 149500 + }, + { + "epoch": 21.222143364088005, + "grad_norm": 0.4352744221687317, + "learning_rate": 7.878679914833216e-05, + "loss": 0.027441534399986266, + "step": 149510 + }, + { + "epoch": 21.223562810503903, + "grad_norm": 0.1629934310913086, + "learning_rate": 7.878537970191626e-05, + "loss": 
0.01376473754644394, + "step": 149520 + }, + { + "epoch": 21.2249822569198, + "grad_norm": 0.7121432423591614, + "learning_rate": 7.878396025550035e-05, + "loss": 0.014056505262851715, + "step": 149530 + }, + { + "epoch": 21.2264017033357, + "grad_norm": 0.049422234296798706, + "learning_rate": 7.878254080908446e-05, + "loss": 0.010615213960409164, + "step": 149540 + }, + { + "epoch": 21.2278211497516, + "grad_norm": 2.8347020149230957, + "learning_rate": 7.878112136266856e-05, + "loss": 0.05742787718772888, + "step": 149550 + }, + { + "epoch": 21.229240596167493, + "grad_norm": 4.875875473022461, + "learning_rate": 7.877970191625267e-05, + "loss": 0.03403811156749725, + "step": 149560 + }, + { + "epoch": 21.23066004258339, + "grad_norm": 1.2117923498153687, + "learning_rate": 7.877828246983677e-05, + "loss": 0.0569779634475708, + "step": 149570 + }, + { + "epoch": 21.23207948899929, + "grad_norm": 2.2622833251953125, + "learning_rate": 7.877686302342087e-05, + "loss": 0.023309798538684846, + "step": 149580 + }, + { + "epoch": 21.233498935415188, + "grad_norm": 0.25336310267448425, + "learning_rate": 7.877544357700496e-05, + "loss": 0.018870651721954346, + "step": 149590 + }, + { + "epoch": 21.234918381831086, + "grad_norm": 0.10979675501585007, + "learning_rate": 7.877402413058908e-05, + "loss": 0.023595008254051208, + "step": 149600 + }, + { + "epoch": 21.236337828246985, + "grad_norm": 0.4305548369884491, + "learning_rate": 7.877260468417317e-05, + "loss": 0.015587338805198669, + "step": 149610 + }, + { + "epoch": 21.237757274662883, + "grad_norm": 13.081771850585938, + "learning_rate": 7.877118523775728e-05, + "loss": 0.04992917776107788, + "step": 149620 + }, + { + "epoch": 21.239176721078778, + "grad_norm": 0.13968588411808014, + "learning_rate": 7.876976579134138e-05, + "loss": 0.019231310486793517, + "step": 149630 + }, + { + "epoch": 21.240596167494676, + "grad_norm": 0.6029143929481506, + "learning_rate": 7.876834634492548e-05, + "loss": 
0.01839500665664673, + "step": 149640 + }, + { + "epoch": 21.242015613910574, + "grad_norm": 4.611687660217285, + "learning_rate": 7.876692689850959e-05, + "loss": 0.008380243182182312, + "step": 149650 + }, + { + "epoch": 21.243435060326473, + "grad_norm": 4.504095077514648, + "learning_rate": 7.876550745209369e-05, + "loss": 0.010931670665740967, + "step": 149660 + }, + { + "epoch": 21.24485450674237, + "grad_norm": 0.38341444730758667, + "learning_rate": 7.87640880056778e-05, + "loss": 0.012220959365367889, + "step": 149670 + }, + { + "epoch": 21.24627395315827, + "grad_norm": 0.44801631569862366, + "learning_rate": 7.876266855926188e-05, + "loss": 0.001156998798251152, + "step": 149680 + }, + { + "epoch": 21.247693399574167, + "grad_norm": 0.1432446539402008, + "learning_rate": 7.876124911284599e-05, + "loss": 0.004803478717803955, + "step": 149690 + }, + { + "epoch": 21.249112845990062, + "grad_norm": 0.07753727585077286, + "learning_rate": 7.875982966643009e-05, + "loss": 0.007115858048200608, + "step": 149700 + }, + { + "epoch": 21.25053229240596, + "grad_norm": 0.12571723759174347, + "learning_rate": 7.87584102200142e-05, + "loss": 0.044754701852798465, + "step": 149710 + }, + { + "epoch": 21.25195173882186, + "grad_norm": 0.25680607557296753, + "learning_rate": 7.87569907735983e-05, + "loss": 0.04310904741287232, + "step": 149720 + }, + { + "epoch": 21.253371185237757, + "grad_norm": 0.15984700620174408, + "learning_rate": 7.87555713271824e-05, + "loss": 0.005724727734923362, + "step": 149730 + }, + { + "epoch": 21.254790631653655, + "grad_norm": 1.0998551845550537, + "learning_rate": 7.875415188076651e-05, + "loss": 0.004690051823854446, + "step": 149740 + }, + { + "epoch": 21.256210078069554, + "grad_norm": 0.5134467482566833, + "learning_rate": 7.87527324343506e-05, + "loss": 0.01059403195977211, + "step": 149750 + }, + { + "epoch": 21.257629524485452, + "grad_norm": 0.169934943318367, + "learning_rate": 7.875131298793472e-05, + "loss": 
0.011590559780597687, + "step": 149760 + }, + { + "epoch": 21.259048970901347, + "grad_norm": 1.2508761882781982, + "learning_rate": 7.874989354151881e-05, + "loss": 0.019144350290298463, + "step": 149770 + }, + { + "epoch": 21.260468417317245, + "grad_norm": 0.7891367077827454, + "learning_rate": 7.874847409510291e-05, + "loss": 0.00802907645702362, + "step": 149780 + }, + { + "epoch": 21.261887863733143, + "grad_norm": 0.07552396506071091, + "learning_rate": 7.874705464868701e-05, + "loss": 0.005626166984438896, + "step": 149790 + }, + { + "epoch": 21.26330731014904, + "grad_norm": 4.451970100402832, + "learning_rate": 7.874563520227112e-05, + "loss": 0.005020375922322273, + "step": 149800 + }, + { + "epoch": 21.26472675656494, + "grad_norm": 3.3432581424713135, + "learning_rate": 7.874421575585522e-05, + "loss": 0.009960376471281052, + "step": 149810 + }, + { + "epoch": 21.26614620298084, + "grad_norm": 0.24494674801826477, + "learning_rate": 7.874279630943933e-05, + "loss": 0.012257133424282075, + "step": 149820 + }, + { + "epoch": 21.267565649396737, + "grad_norm": 0.302452027797699, + "learning_rate": 7.874137686302342e-05, + "loss": 0.039670103788375856, + "step": 149830 + }, + { + "epoch": 21.26898509581263, + "grad_norm": 0.6783290505409241, + "learning_rate": 7.873995741660752e-05, + "loss": 0.08179160356521606, + "step": 149840 + }, + { + "epoch": 21.27040454222853, + "grad_norm": 4.003798007965088, + "learning_rate": 7.873853797019163e-05, + "loss": 0.003940673172473907, + "step": 149850 + }, + { + "epoch": 21.271823988644428, + "grad_norm": 0.05971608683466911, + "learning_rate": 7.873711852377573e-05, + "loss": 0.017813093960285187, + "step": 149860 + }, + { + "epoch": 21.273243435060326, + "grad_norm": 1.8861987590789795, + "learning_rate": 7.873569907735984e-05, + "loss": 0.03248091340065003, + "step": 149870 + }, + { + "epoch": 21.274662881476225, + "grad_norm": 0.453815221786499, + "learning_rate": 7.873427963094392e-05, + "loss": 
0.02618357241153717, + "step": 149880 + }, + { + "epoch": 21.276082327892123, + "grad_norm": 0.06526563316583633, + "learning_rate": 7.873286018452804e-05, + "loss": 0.014142660796642304, + "step": 149890 + }, + { + "epoch": 21.27750177430802, + "grad_norm": 0.05871858820319176, + "learning_rate": 7.873144073811213e-05, + "loss": 0.024177791178226472, + "step": 149900 + }, + { + "epoch": 21.278921220723916, + "grad_norm": 2.80397367477417, + "learning_rate": 7.873002129169624e-05, + "loss": 0.03723135888576508, + "step": 149910 + }, + { + "epoch": 21.280340667139814, + "grad_norm": 0.012509427964687347, + "learning_rate": 7.872860184528034e-05, + "loss": 0.00331510491669178, + "step": 149920 + }, + { + "epoch": 21.281760113555713, + "grad_norm": 4.653242111206055, + "learning_rate": 7.872718239886445e-05, + "loss": 0.003215700387954712, + "step": 149930 + }, + { + "epoch": 21.28317955997161, + "grad_norm": 0.04042194038629532, + "learning_rate": 7.872576295244855e-05, + "loss": 0.01799624413251877, + "step": 149940 + }, + { + "epoch": 21.28459900638751, + "grad_norm": 1.5009921789169312, + "learning_rate": 7.872434350603265e-05, + "loss": 0.013056948781013489, + "step": 149950 + }, + { + "epoch": 21.286018452803408, + "grad_norm": 0.13292047381401062, + "learning_rate": 7.872292405961676e-05, + "loss": 0.003729250282049179, + "step": 149960 + }, + { + "epoch": 21.287437899219306, + "grad_norm": 0.006223233882337809, + "learning_rate": 7.872150461320085e-05, + "loss": 0.0020738404244184496, + "step": 149970 + }, + { + "epoch": 21.2888573456352, + "grad_norm": 2.8292734622955322, + "learning_rate": 7.872008516678497e-05, + "loss": 0.03463707268238068, + "step": 149980 + }, + { + "epoch": 21.2902767920511, + "grad_norm": 1.4998805522918701, + "learning_rate": 7.871866572036905e-05, + "loss": 0.006433401256799698, + "step": 149990 + }, + { + "epoch": 21.291696238466997, + "grad_norm": 0.07294634729623795, + "learning_rate": 7.871724627395316e-05, + "loss": 
0.004600503295660019, + "step": 150000 + }, + { + "epoch": 21.291696238466997, + "eval_accuracy": 0.984930374515165, + "eval_loss": 0.06009546294808388, + "eval_runtime": 34.1226, + "eval_samples_per_second": 460.897, + "eval_steps_per_second": 14.419, + "step": 150000 + }, + { + "epoch": 21.293115684882896, + "grad_norm": 0.15897905826568604, + "learning_rate": 7.871582682753726e-05, + "loss": 0.029994645714759828, + "step": 150010 + }, + { + "epoch": 21.294535131298794, + "grad_norm": 0.024007586762309074, + "learning_rate": 7.871440738112137e-05, + "loss": 0.004563559964299202, + "step": 150020 + }, + { + "epoch": 21.295954577714692, + "grad_norm": 0.012047787196934223, + "learning_rate": 7.871298793470548e-05, + "loss": 0.012067283689975738, + "step": 150030 + }, + { + "epoch": 21.29737402413059, + "grad_norm": 0.9900059700012207, + "learning_rate": 7.871156848828956e-05, + "loss": 0.0017314765602350235, + "step": 150040 + }, + { + "epoch": 21.298793470546485, + "grad_norm": 0.08268841356039047, + "learning_rate": 7.871014904187367e-05, + "loss": 0.010683967173099518, + "step": 150050 + }, + { + "epoch": 21.300212916962384, + "grad_norm": 1.4302480220794678, + "learning_rate": 7.870872959545777e-05, + "loss": 0.13366085290908813, + "step": 150060 + }, + { + "epoch": 21.301632363378282, + "grad_norm": 0.07603032886981964, + "learning_rate": 7.870731014904188e-05, + "loss": 0.01684039831161499, + "step": 150070 + }, + { + "epoch": 21.30305180979418, + "grad_norm": 0.014241009950637817, + "learning_rate": 7.870589070262598e-05, + "loss": 0.014112381637096405, + "step": 150080 + }, + { + "epoch": 21.30447125621008, + "grad_norm": 7.978214263916016, + "learning_rate": 7.870447125621008e-05, + "loss": 0.02085186243057251, + "step": 150090 + }, + { + "epoch": 21.305890702625977, + "grad_norm": 0.028473349288105965, + "learning_rate": 7.870305180979418e-05, + "loss": 0.0024428483098745345, + "step": 150100 + }, + { + "epoch": 21.307310149041875, + "grad_norm": 
4.6767096519470215, + "learning_rate": 7.870163236337829e-05, + "loss": 0.016639645397663116, + "step": 150110 + }, + { + "epoch": 21.30872959545777, + "grad_norm": 8.268738746643066, + "learning_rate": 7.87002129169624e-05, + "loss": 0.02899589538574219, + "step": 150120 + }, + { + "epoch": 21.310149041873668, + "grad_norm": 0.25534874200820923, + "learning_rate": 7.86987934705465e-05, + "loss": 0.01323978155851364, + "step": 150130 + }, + { + "epoch": 21.311568488289566, + "grad_norm": 10.629423141479492, + "learning_rate": 7.869737402413059e-05, + "loss": 0.0429989218711853, + "step": 150140 + }, + { + "epoch": 21.312987934705465, + "grad_norm": 0.11195272952318192, + "learning_rate": 7.869595457771469e-05, + "loss": 0.018785808980464936, + "step": 150150 + }, + { + "epoch": 21.314407381121363, + "grad_norm": 0.016783302649855614, + "learning_rate": 7.86945351312988e-05, + "loss": 0.006526833772659302, + "step": 150160 + }, + { + "epoch": 21.31582682753726, + "grad_norm": 0.34147539734840393, + "learning_rate": 7.86931156848829e-05, + "loss": 0.018001502752304076, + "step": 150170 + }, + { + "epoch": 21.31724627395316, + "grad_norm": 0.1380467265844345, + "learning_rate": 7.869169623846701e-05, + "loss": 0.027275654673576354, + "step": 150180 + }, + { + "epoch": 21.318665720369054, + "grad_norm": 9.12823486328125, + "learning_rate": 7.869027679205109e-05, + "loss": 0.007624239474534988, + "step": 150190 + }, + { + "epoch": 21.320085166784953, + "grad_norm": 0.7584025859832764, + "learning_rate": 7.86888573456352e-05, + "loss": 0.00371272973716259, + "step": 150200 + }, + { + "epoch": 21.32150461320085, + "grad_norm": 0.0240999273955822, + "learning_rate": 7.868743789921931e-05, + "loss": 0.0006559953093528747, + "step": 150210 + }, + { + "epoch": 21.32292405961675, + "grad_norm": 0.8504788875579834, + "learning_rate": 7.868601845280341e-05, + "loss": 0.010433420538902283, + "step": 150220 + }, + { + "epoch": 21.324343506032648, + "grad_norm": 
0.43513473868370056, + "learning_rate": 7.868459900638752e-05, + "loss": 0.005265964567661286, + "step": 150230 + }, + { + "epoch": 21.325762952448546, + "grad_norm": 2.0596883296966553, + "learning_rate": 7.868317955997162e-05, + "loss": 0.02267058491706848, + "step": 150240 + }, + { + "epoch": 21.327182398864444, + "grad_norm": 0.08681211620569229, + "learning_rate": 7.868176011355572e-05, + "loss": 0.009743700921535491, + "step": 150250 + }, + { + "epoch": 21.32860184528034, + "grad_norm": 0.16763818264007568, + "learning_rate": 7.868034066713981e-05, + "loss": 0.08114421963691712, + "step": 150260 + }, + { + "epoch": 21.330021291696237, + "grad_norm": 2.8899662494659424, + "learning_rate": 7.867892122072393e-05, + "loss": 0.0042728368192911145, + "step": 150270 + }, + { + "epoch": 21.331440738112136, + "grad_norm": 0.18685731291770935, + "learning_rate": 7.867750177430802e-05, + "loss": 0.004065968841314316, + "step": 150280 + }, + { + "epoch": 21.332860184528034, + "grad_norm": 3.799468755722046, + "learning_rate": 7.867608232789213e-05, + "loss": 0.012602724134922028, + "step": 150290 + }, + { + "epoch": 21.334279630943932, + "grad_norm": 5.050230979919434, + "learning_rate": 7.867466288147623e-05, + "loss": 0.008401226997375489, + "step": 150300 + }, + { + "epoch": 21.33569907735983, + "grad_norm": 2.748739719390869, + "learning_rate": 7.867324343506033e-05, + "loss": 0.002787080779671669, + "step": 150310 + }, + { + "epoch": 21.33711852377573, + "grad_norm": 0.005689374636858702, + "learning_rate": 7.867182398864444e-05, + "loss": 0.005105862393975258, + "step": 150320 + }, + { + "epoch": 21.338537970191624, + "grad_norm": 0.1499873846769333, + "learning_rate": 7.867040454222854e-05, + "loss": 0.05594694018363953, + "step": 150330 + }, + { + "epoch": 21.339957416607522, + "grad_norm": 1.1138839721679688, + "learning_rate": 7.866898509581265e-05, + "loss": 0.010386820137500762, + "step": 150340 + }, + { + "epoch": 21.34137686302342, + "grad_norm": 
0.01635448820888996, + "learning_rate": 7.866756564939673e-05, + "loss": 0.003584672883152962, + "step": 150350 + }, + { + "epoch": 21.34279630943932, + "grad_norm": 0.3007226288318634, + "learning_rate": 7.866614620298084e-05, + "loss": 0.028697729110717773, + "step": 150360 + }, + { + "epoch": 21.344215755855217, + "grad_norm": 9.357942581176758, + "learning_rate": 7.866472675656494e-05, + "loss": 0.023924729228019713, + "step": 150370 + }, + { + "epoch": 21.345635202271115, + "grad_norm": 0.012174699455499649, + "learning_rate": 7.866330731014905e-05, + "loss": 0.013487359881401062, + "step": 150380 + }, + { + "epoch": 21.347054648687013, + "grad_norm": 0.09372057765722275, + "learning_rate": 7.866188786373315e-05, + "loss": 0.016023290157318116, + "step": 150390 + }, + { + "epoch": 21.348474095102908, + "grad_norm": 17.5643310546875, + "learning_rate": 7.866046841731725e-05, + "loss": 0.028237324953079224, + "step": 150400 + }, + { + "epoch": 21.349893541518806, + "grad_norm": 3.367307186126709, + "learning_rate": 7.865904897090136e-05, + "loss": 0.09425501823425293, + "step": 150410 + }, + { + "epoch": 21.351312987934705, + "grad_norm": 0.027510514482855797, + "learning_rate": 7.865762952448545e-05, + "loss": 0.009037439525127412, + "step": 150420 + }, + { + "epoch": 21.352732434350603, + "grad_norm": 1.4646201133728027, + "learning_rate": 7.865621007806956e-05, + "loss": 0.012955763936042785, + "step": 150430 + }, + { + "epoch": 21.3541518807665, + "grad_norm": 7.762776851654053, + "learning_rate": 7.865479063165366e-05, + "loss": 0.010075832903385162, + "step": 150440 + }, + { + "epoch": 21.3555713271824, + "grad_norm": 0.12593406438827515, + "learning_rate": 7.865337118523776e-05, + "loss": 0.03233673572540283, + "step": 150450 + }, + { + "epoch": 21.356990773598298, + "grad_norm": 0.033880434930324554, + "learning_rate": 7.865195173882186e-05, + "loss": 0.010391590744256973, + "step": 150460 + }, + { + "epoch": 21.358410220014193, + "grad_norm": 
2.8902103900909424, + "learning_rate": 7.865053229240597e-05, + "loss": 0.010583482682704926, + "step": 150470 + }, + { + "epoch": 21.35982966643009, + "grad_norm": 0.7179674506187439, + "learning_rate": 7.864911284599007e-05, + "loss": 0.00959889143705368, + "step": 150480 + }, + { + "epoch": 21.36124911284599, + "grad_norm": 0.02201680652797222, + "learning_rate": 7.864769339957418e-05, + "loss": 0.01786481887102127, + "step": 150490 + }, + { + "epoch": 21.362668559261888, + "grad_norm": 8.945985794067383, + "learning_rate": 7.864627395315827e-05, + "loss": 0.037659674882888794, + "step": 150500 + }, + { + "epoch": 21.362668559261888, + "eval_accuracy": 0.9903986774337127, + "eval_loss": 0.042516887187957764, + "eval_runtime": 33.164, + "eval_samples_per_second": 474.22, + "eval_steps_per_second": 14.835, + "step": 150500 + }, + { + "epoch": 21.364088005677786, + "grad_norm": 0.18884125351905823, + "learning_rate": 7.864485450674237e-05, + "loss": 0.0012963451445102692, + "step": 150510 + }, + { + "epoch": 21.365507452093684, + "grad_norm": 0.17575238645076752, + "learning_rate": 7.864343506032648e-05, + "loss": 0.0020883496850728988, + "step": 150520 + }, + { + "epoch": 21.366926898509583, + "grad_norm": 0.015365666709840298, + "learning_rate": 7.864201561391058e-05, + "loss": 0.0010480429977178574, + "step": 150530 + }, + { + "epoch": 21.368346344925477, + "grad_norm": 0.27821290493011475, + "learning_rate": 7.864059616749469e-05, + "loss": 0.0010680004954338074, + "step": 150540 + }, + { + "epoch": 21.369765791341376, + "grad_norm": 0.06767478585243225, + "learning_rate": 7.863917672107877e-05, + "loss": 0.008181358128786087, + "step": 150550 + }, + { + "epoch": 21.371185237757274, + "grad_norm": 0.1096239686012268, + "learning_rate": 7.863775727466288e-05, + "loss": 0.024930296838283537, + "step": 150560 + }, + { + "epoch": 21.372604684173172, + "grad_norm": 0.05571400374174118, + "learning_rate": 7.863633782824698e-05, + "loss": 0.005970773845911026, + 
"step": 150570 + }, + { + "epoch": 21.37402413058907, + "grad_norm": 15.8751220703125, + "learning_rate": 7.863491838183109e-05, + "loss": 0.016945794224739075, + "step": 150580 + }, + { + "epoch": 21.37544357700497, + "grad_norm": 2.176967144012451, + "learning_rate": 7.863349893541519e-05, + "loss": 0.011744683980941773, + "step": 150590 + }, + { + "epoch": 21.376863023420867, + "grad_norm": 3.326159715652466, + "learning_rate": 7.86320794889993e-05, + "loss": 0.015500251948833466, + "step": 150600 + }, + { + "epoch": 21.378282469836762, + "grad_norm": 2.1551811695098877, + "learning_rate": 7.86306600425834e-05, + "loss": 0.015465983748435974, + "step": 150610 + }, + { + "epoch": 21.37970191625266, + "grad_norm": 0.20209509134292603, + "learning_rate": 7.86292405961675e-05, + "loss": 0.009489495307207108, + "step": 150620 + }, + { + "epoch": 21.38112136266856, + "grad_norm": 0.8307889699935913, + "learning_rate": 7.862782114975161e-05, + "loss": 0.025382262468338013, + "step": 150630 + }, + { + "epoch": 21.382540809084457, + "grad_norm": 4.720732688903809, + "learning_rate": 7.86264017033357e-05, + "loss": 0.00349072590470314, + "step": 150640 + }, + { + "epoch": 21.383960255500355, + "grad_norm": 0.02627347782254219, + "learning_rate": 7.862498225691982e-05, + "loss": 0.0038695957511663435, + "step": 150650 + }, + { + "epoch": 21.385379701916253, + "grad_norm": 0.06080394238233566, + "learning_rate": 7.86235628105039e-05, + "loss": 0.0024956516921520234, + "step": 150660 + }, + { + "epoch": 21.386799148332152, + "grad_norm": 0.1417836993932724, + "learning_rate": 7.862214336408801e-05, + "loss": 0.018171261250972747, + "step": 150670 + }, + { + "epoch": 21.388218594748047, + "grad_norm": 0.0988999754190445, + "learning_rate": 7.862072391767211e-05, + "loss": 0.04312205016613006, + "step": 150680 + }, + { + "epoch": 21.389638041163945, + "grad_norm": 0.025450212880969048, + "learning_rate": 7.861930447125622e-05, + "loss": 0.03194099366664886, + "step": 150690 + 
}, + { + "epoch": 21.391057487579843, + "grad_norm": 0.1004941314458847, + "learning_rate": 7.861788502484032e-05, + "loss": 0.004960120469331741, + "step": 150700 + }, + { + "epoch": 21.39247693399574, + "grad_norm": 10.053168296813965, + "learning_rate": 7.861646557842441e-05, + "loss": 0.027144354581832886, + "step": 150710 + }, + { + "epoch": 21.39389638041164, + "grad_norm": 0.06395307928323746, + "learning_rate": 7.861504613200852e-05, + "loss": 0.012635675072669984, + "step": 150720 + }, + { + "epoch": 21.395315826827538, + "grad_norm": 0.09054845571517944, + "learning_rate": 7.861362668559262e-05, + "loss": 0.01358504593372345, + "step": 150730 + }, + { + "epoch": 21.396735273243436, + "grad_norm": 8.707944869995117, + "learning_rate": 7.861220723917673e-05, + "loss": 0.005878636240959167, + "step": 150740 + }, + { + "epoch": 21.39815471965933, + "grad_norm": 3.1433773040771484, + "learning_rate": 7.861078779276083e-05, + "loss": 0.010115499794483184, + "step": 150750 + }, + { + "epoch": 21.39957416607523, + "grad_norm": 2.557936668395996, + "learning_rate": 7.860936834634493e-05, + "loss": 0.009467591345310212, + "step": 150760 + }, + { + "epoch": 21.400993612491128, + "grad_norm": 0.21462243795394897, + "learning_rate": 7.860794889992902e-05, + "loss": 0.009905040264129639, + "step": 150770 + }, + { + "epoch": 21.402413058907026, + "grad_norm": 0.007481276988983154, + "learning_rate": 7.860652945351314e-05, + "loss": 0.04812899827957153, + "step": 150780 + }, + { + "epoch": 21.403832505322924, + "grad_norm": 0.05361972004175186, + "learning_rate": 7.860511000709723e-05, + "loss": 0.03187766671180725, + "step": 150790 + }, + { + "epoch": 21.405251951738823, + "grad_norm": 6.2702250480651855, + "learning_rate": 7.860369056068134e-05, + "loss": 0.050520843267440795, + "step": 150800 + }, + { + "epoch": 21.40667139815472, + "grad_norm": 1.156450867652893, + "learning_rate": 7.860227111426544e-05, + "loss": 0.00858406201004982, + "step": 150810 + }, + { + 
"epoch": 21.408090844570616, + "grad_norm": 0.09947626292705536, + "learning_rate": 7.860085166784954e-05, + "loss": 0.01698766350746155, + "step": 150820 + }, + { + "epoch": 21.409510290986514, + "grad_norm": 0.14933201670646667, + "learning_rate": 7.859943222143365e-05, + "loss": 0.013561058044433593, + "step": 150830 + }, + { + "epoch": 21.410929737402412, + "grad_norm": 0.37955328822135925, + "learning_rate": 7.859801277501775e-05, + "loss": 0.002048031985759735, + "step": 150840 + }, + { + "epoch": 21.41234918381831, + "grad_norm": 0.7365771532058716, + "learning_rate": 7.859659332860186e-05, + "loss": 0.006261476874351501, + "step": 150850 + }, + { + "epoch": 21.41376863023421, + "grad_norm": 3.0955123901367188, + "learning_rate": 7.859517388218594e-05, + "loss": 0.008317974209785462, + "step": 150860 + }, + { + "epoch": 21.415188076650107, + "grad_norm": 10.35219955444336, + "learning_rate": 7.859375443577005e-05, + "loss": 0.04212158918380737, + "step": 150870 + }, + { + "epoch": 21.416607523066006, + "grad_norm": 0.14260686933994293, + "learning_rate": 7.859233498935415e-05, + "loss": 0.004925109073519706, + "step": 150880 + }, + { + "epoch": 21.4180269694819, + "grad_norm": 2.855813503265381, + "learning_rate": 7.859091554293826e-05, + "loss": 0.006777619570493698, + "step": 150890 + }, + { + "epoch": 21.4194464158978, + "grad_norm": 11.65463638305664, + "learning_rate": 7.858949609652236e-05, + "loss": 0.0227961465716362, + "step": 150900 + }, + { + "epoch": 21.420865862313697, + "grad_norm": 16.396881103515625, + "learning_rate": 7.858807665010646e-05, + "loss": 0.047901982069015504, + "step": 150910 + }, + { + "epoch": 21.422285308729595, + "grad_norm": 1.0305626392364502, + "learning_rate": 7.858665720369057e-05, + "loss": 0.01431875377893448, + "step": 150920 + }, + { + "epoch": 21.423704755145494, + "grad_norm": 0.03939332067966461, + "learning_rate": 7.858523775727466e-05, + "loss": 0.025107622146606445, + "step": 150930 + }, + { + "epoch": 
21.425124201561392, + "grad_norm": 0.3796674907207489, + "learning_rate": 7.858381831085877e-05, + "loss": 0.0028860975056886674, + "step": 150940 + }, + { + "epoch": 21.42654364797729, + "grad_norm": 1.1133980751037598, + "learning_rate": 7.858239886444287e-05, + "loss": 0.005138388648629188, + "step": 150950 + }, + { + "epoch": 21.427963094393185, + "grad_norm": 3.2256035804748535, + "learning_rate": 7.858097941802698e-05, + "loss": 0.005157262086868286, + "step": 150960 + }, + { + "epoch": 21.429382540809083, + "grad_norm": 0.03242958337068558, + "learning_rate": 7.857955997161107e-05, + "loss": 0.03624560832977295, + "step": 150970 + }, + { + "epoch": 21.43080198722498, + "grad_norm": 0.008251283317804337, + "learning_rate": 7.857814052519518e-05, + "loss": 0.00393541157245636, + "step": 150980 + }, + { + "epoch": 21.43222143364088, + "grad_norm": 3.116025924682617, + "learning_rate": 7.857672107877928e-05, + "loss": 0.0325472891330719, + "step": 150990 + }, + { + "epoch": 21.433640880056778, + "grad_norm": 1.8862682580947876, + "learning_rate": 7.857530163236339e-05, + "loss": 0.0145626962184906, + "step": 151000 + }, + { + "epoch": 21.433640880056778, + "eval_accuracy": 0.9883003751510142, + "eval_loss": 0.03922225907444954, + "eval_runtime": 33.2504, + "eval_samples_per_second": 472.987, + "eval_steps_per_second": 14.797, + "step": 151000 + }, + { + "epoch": 21.435060326472676, + "grad_norm": 3.9699912071228027, + "learning_rate": 7.857388218594748e-05, + "loss": 0.009154099225997924, + "step": 151010 + }, + { + "epoch": 21.436479772888575, + "grad_norm": 0.08011793345212936, + "learning_rate": 7.857246273953158e-05, + "loss": 0.01022351235151291, + "step": 151020 + }, + { + "epoch": 21.43789921930447, + "grad_norm": 0.07765154540538788, + "learning_rate": 7.857104329311569e-05, + "loss": 0.003188994154334068, + "step": 151030 + }, + { + "epoch": 21.439318665720368, + "grad_norm": 8.666377067565918, + "learning_rate": 7.856962384669979e-05, + "loss": 
0.02733677327632904, + "step": 151040 + }, + { + "epoch": 21.440738112136266, + "grad_norm": 6.4535136222839355, + "learning_rate": 7.85682044002839e-05, + "loss": 0.01949133276939392, + "step": 151050 + }, + { + "epoch": 21.442157558552164, + "grad_norm": 13.909867286682129, + "learning_rate": 7.8566784953868e-05, + "loss": 0.04076787233352661, + "step": 151060 + }, + { + "epoch": 21.443577004968063, + "grad_norm": 2.3354668617248535, + "learning_rate": 7.85653655074521e-05, + "loss": 0.03087102174758911, + "step": 151070 + }, + { + "epoch": 21.44499645138396, + "grad_norm": 10.25372314453125, + "learning_rate": 7.856394606103619e-05, + "loss": 0.02637479603290558, + "step": 151080 + }, + { + "epoch": 21.44641589779986, + "grad_norm": 2.522825002670288, + "learning_rate": 7.85625266146203e-05, + "loss": 0.016149772703647612, + "step": 151090 + }, + { + "epoch": 21.447835344215754, + "grad_norm": 4.060473442077637, + "learning_rate": 7.85611071682044e-05, + "loss": 0.01869649887084961, + "step": 151100 + }, + { + "epoch": 21.449254790631652, + "grad_norm": 13.556977272033691, + "learning_rate": 7.855968772178851e-05, + "loss": 0.011236745119094848, + "step": 151110 + }, + { + "epoch": 21.45067423704755, + "grad_norm": 1.6991263628005981, + "learning_rate": 7.855826827537261e-05, + "loss": 0.01087583526968956, + "step": 151120 + }, + { + "epoch": 21.45209368346345, + "grad_norm": 0.024971749633550644, + "learning_rate": 7.85568488289567e-05, + "loss": 0.006789686530828476, + "step": 151130 + }, + { + "epoch": 21.453513129879347, + "grad_norm": 0.03289659321308136, + "learning_rate": 7.855542938254082e-05, + "loss": 0.005812932550907135, + "step": 151140 + }, + { + "epoch": 21.454932576295246, + "grad_norm": 0.198566272854805, + "learning_rate": 7.855400993612491e-05, + "loss": 0.006988225877285004, + "step": 151150 + }, + { + "epoch": 21.456352022711144, + "grad_norm": 0.05777517706155777, + "learning_rate": 7.855259048970903e-05, + "loss": 0.0184954434633255, + 
"step": 151160 + }, + { + "epoch": 21.45777146912704, + "grad_norm": 0.0018599008908495307, + "learning_rate": 7.855131298793471e-05, + "loss": 0.04307417273521423, + "step": 151170 + }, + { + "epoch": 21.459190915542937, + "grad_norm": 0.021755773574113846, + "learning_rate": 7.854989354151882e-05, + "loss": 0.033227342367172244, + "step": 151180 + }, + { + "epoch": 21.460610361958835, + "grad_norm": 0.5058996081352234, + "learning_rate": 7.85484740951029e-05, + "loss": 0.007414519786834717, + "step": 151190 + }, + { + "epoch": 21.462029808374734, + "grad_norm": 0.43465590476989746, + "learning_rate": 7.854705464868702e-05, + "loss": 0.024872206151485443, + "step": 151200 + }, + { + "epoch": 21.463449254790632, + "grad_norm": 2.0657308101654053, + "learning_rate": 7.854563520227111e-05, + "loss": 0.03384566009044647, + "step": 151210 + }, + { + "epoch": 21.46486870120653, + "grad_norm": 0.2793665826320648, + "learning_rate": 7.854421575585522e-05, + "loss": 0.026503607630729675, + "step": 151220 + }, + { + "epoch": 21.46628814762243, + "grad_norm": 2.751476526260376, + "learning_rate": 7.854279630943932e-05, + "loss": 0.01590556502342224, + "step": 151230 + }, + { + "epoch": 21.467707594038323, + "grad_norm": 0.014748545363545418, + "learning_rate": 7.854137686302342e-05, + "loss": 0.012569315731525421, + "step": 151240 + }, + { + "epoch": 21.46912704045422, + "grad_norm": 0.8992433547973633, + "learning_rate": 7.853995741660753e-05, + "loss": 0.001101396232843399, + "step": 151250 + }, + { + "epoch": 21.47054648687012, + "grad_norm": 3.8561654090881348, + "learning_rate": 7.853853797019163e-05, + "loss": 0.041339057683944705, + "step": 151260 + }, + { + "epoch": 21.471965933286018, + "grad_norm": 0.08361608535051346, + "learning_rate": 7.853711852377574e-05, + "loss": 0.0031107179820537567, + "step": 151270 + }, + { + "epoch": 21.473385379701917, + "grad_norm": 0.03552709519863129, + "learning_rate": 7.853569907735984e-05, + "loss": 0.014787481725215912, + 
"step": 151280 + }, + { + "epoch": 21.474804826117815, + "grad_norm": 12.277571678161621, + "learning_rate": 7.853427963094395e-05, + "loss": 0.032057172060012816, + "step": 151290 + }, + { + "epoch": 21.476224272533713, + "grad_norm": 0.9301695227622986, + "learning_rate": 7.853286018452803e-05, + "loss": 0.0033328257501125335, + "step": 151300 + }, + { + "epoch": 21.477643718949608, + "grad_norm": 0.9506941437721252, + "learning_rate": 7.853144073811214e-05, + "loss": 0.02144779860973358, + "step": 151310 + }, + { + "epoch": 21.479063165365506, + "grad_norm": 0.17734889686107635, + "learning_rate": 7.853016323633783e-05, + "loss": 0.020137874782085417, + "step": 151320 + }, + { + "epoch": 21.480482611781405, + "grad_norm": 0.11234434694051743, + "learning_rate": 7.852874378992194e-05, + "loss": 0.0346947968006134, + "step": 151330 + }, + { + "epoch": 21.481902058197303, + "grad_norm": 7.248856067657471, + "learning_rate": 7.852732434350603e-05, + "loss": 0.004561421275138855, + "step": 151340 + }, + { + "epoch": 21.4833215046132, + "grad_norm": 0.02298351190984249, + "learning_rate": 7.852590489709015e-05, + "loss": 0.01116151362657547, + "step": 151350 + }, + { + "epoch": 21.4847409510291, + "grad_norm": 6.220264434814453, + "learning_rate": 7.852448545067424e-05, + "loss": 0.015824052691459655, + "step": 151360 + }, + { + "epoch": 21.486160397444998, + "grad_norm": 0.00616351468488574, + "learning_rate": 7.852306600425834e-05, + "loss": 0.013357287645339966, + "step": 151370 + }, + { + "epoch": 21.487579843860892, + "grad_norm": 0.035549961030483246, + "learning_rate": 7.852164655784244e-05, + "loss": 0.06319154500961303, + "step": 151380 + }, + { + "epoch": 21.48899929027679, + "grad_norm": 0.04777948558330536, + "learning_rate": 7.852022711142655e-05, + "loss": 0.020310570299625397, + "step": 151390 + }, + { + "epoch": 21.49041873669269, + "grad_norm": 0.32220709323883057, + "learning_rate": 7.851880766501065e-05, + "loss": 0.04787653088569641, + "step": 
151400 + }, + { + "epoch": 21.491838183108587, + "grad_norm": 0.1581021398305893, + "learning_rate": 7.851738821859476e-05, + "loss": 0.052335423231124875, + "step": 151410 + }, + { + "epoch": 21.493257629524486, + "grad_norm": 0.043111275881528854, + "learning_rate": 7.851596877217885e-05, + "loss": 0.05034602880477905, + "step": 151420 + }, + { + "epoch": 21.494677075940384, + "grad_norm": 20.289844512939453, + "learning_rate": 7.851454932576295e-05, + "loss": 0.027281126379966734, + "step": 151430 + }, + { + "epoch": 21.496096522356282, + "grad_norm": 0.02261030115187168, + "learning_rate": 7.851312987934706e-05, + "loss": 0.07745199799537658, + "step": 151440 + }, + { + "epoch": 21.497515968772177, + "grad_norm": 0.08407948166131973, + "learning_rate": 7.851171043293116e-05, + "loss": 0.07415912747383117, + "step": 151450 + }, + { + "epoch": 21.498935415188075, + "grad_norm": 0.03448100760579109, + "learning_rate": 7.851029098651527e-05, + "loss": 0.041525131464004515, + "step": 151460 + }, + { + "epoch": 21.500354861603974, + "grad_norm": 0.1729927361011505, + "learning_rate": 7.850887154009935e-05, + "loss": 0.06439347267150879, + "step": 151470 + }, + { + "epoch": 21.501774308019872, + "grad_norm": 0.06491457670927048, + "learning_rate": 7.850745209368347e-05, + "loss": 0.011304600536823273, + "step": 151480 + }, + { + "epoch": 21.50319375443577, + "grad_norm": 1.4591714143753052, + "learning_rate": 7.850603264726756e-05, + "loss": 0.008775433897972107, + "step": 151490 + }, + { + "epoch": 21.50461320085167, + "grad_norm": 0.08496806025505066, + "learning_rate": 7.850461320085167e-05, + "loss": 0.011562662571668625, + "step": 151500 + }, + { + "epoch": 21.50461320085167, + "eval_accuracy": 0.9857569784447129, + "eval_loss": 0.04979580640792847, + "eval_runtime": 34.0156, + "eval_samples_per_second": 462.347, + "eval_steps_per_second": 14.464, + "step": 151500 + }, + { + "epoch": 21.506032647267567, + "grad_norm": 3.178960084915161, + "learning_rate": 
7.850319375443577e-05, + "loss": 0.007797259092330933, + "step": 151510 + }, + { + "epoch": 21.50745209368346, + "grad_norm": 0.038536395877599716, + "learning_rate": 7.850177430801987e-05, + "loss": 0.008369236439466476, + "step": 151520 + }, + { + "epoch": 21.50887154009936, + "grad_norm": 3.8564586639404297, + "learning_rate": 7.850035486160398e-05, + "loss": 0.020008955895900727, + "step": 151530 + }, + { + "epoch": 21.51029098651526, + "grad_norm": 5.1011810302734375, + "learning_rate": 7.849893541518808e-05, + "loss": 0.0332260251045227, + "step": 151540 + }, + { + "epoch": 21.511710432931157, + "grad_norm": 1.4952186346054077, + "learning_rate": 7.849751596877219e-05, + "loss": 0.028032743930816652, + "step": 151550 + }, + { + "epoch": 21.513129879347055, + "grad_norm": 0.22964413464069366, + "learning_rate": 7.849609652235629e-05, + "loss": 0.012953773140907288, + "step": 151560 + }, + { + "epoch": 21.514549325762953, + "grad_norm": 0.006146691273897886, + "learning_rate": 7.849467707594038e-05, + "loss": 0.026652398705482482, + "step": 151570 + }, + { + "epoch": 21.51596877217885, + "grad_norm": 12.586472511291504, + "learning_rate": 7.849325762952448e-05, + "loss": 0.0649193525314331, + "step": 151580 + }, + { + "epoch": 21.517388218594746, + "grad_norm": 0.04074056074023247, + "learning_rate": 7.849183818310859e-05, + "loss": 0.012222684919834137, + "step": 151590 + }, + { + "epoch": 21.518807665010645, + "grad_norm": 0.03556948900222778, + "learning_rate": 7.849041873669269e-05, + "loss": 0.003958575427532196, + "step": 151600 + }, + { + "epoch": 21.520227111426543, + "grad_norm": 0.25249791145324707, + "learning_rate": 7.84889992902768e-05, + "loss": 0.016062289476394653, + "step": 151610 + }, + { + "epoch": 21.52164655784244, + "grad_norm": 0.05954114720225334, + "learning_rate": 7.84875798438609e-05, + "loss": 0.011932015419006348, + "step": 151620 + }, + { + "epoch": 21.52306600425834, + "grad_norm": 1.0195636749267578, + "learning_rate": 
7.8486160397445e-05, + "loss": 0.034069576859474184, + "step": 151630 + }, + { + "epoch": 21.524485450674238, + "grad_norm": 0.02515377849340439, + "learning_rate": 7.84847409510291e-05, + "loss": 0.022289544343948364, + "step": 151640 + }, + { + "epoch": 21.525904897090136, + "grad_norm": 0.7952381372451782, + "learning_rate": 7.84833215046132e-05, + "loss": 0.01175462305545807, + "step": 151650 + }, + { + "epoch": 21.52732434350603, + "grad_norm": 0.19000306725502014, + "learning_rate": 7.848190205819731e-05, + "loss": 0.00858311951160431, + "step": 151660 + }, + { + "epoch": 21.52874378992193, + "grad_norm": 0.6700640916824341, + "learning_rate": 7.848048261178141e-05, + "loss": 0.011870662868022918, + "step": 151670 + }, + { + "epoch": 21.530163236337827, + "grad_norm": 0.5441744327545166, + "learning_rate": 7.847906316536551e-05, + "loss": 0.013257697224617004, + "step": 151680 + }, + { + "epoch": 21.531582682753726, + "grad_norm": 2.5086989402770996, + "learning_rate": 7.84776437189496e-05, + "loss": 0.032514017820358274, + "step": 151690 + }, + { + "epoch": 21.533002129169624, + "grad_norm": 8.370426177978516, + "learning_rate": 7.847622427253372e-05, + "loss": 0.023941080272197723, + "step": 151700 + }, + { + "epoch": 21.534421575585522, + "grad_norm": 1.7398954629898071, + "learning_rate": 7.847480482611781e-05, + "loss": 0.028538635373115538, + "step": 151710 + }, + { + "epoch": 21.53584102200142, + "grad_norm": 0.08829309046268463, + "learning_rate": 7.847338537970192e-05, + "loss": 0.02795908451080322, + "step": 151720 + }, + { + "epoch": 21.537260468417315, + "grad_norm": 1.117104172706604, + "learning_rate": 7.847196593328602e-05, + "loss": 0.03707756996154785, + "step": 151730 + }, + { + "epoch": 21.538679914833214, + "grad_norm": 0.14757725596427917, + "learning_rate": 7.847054648687012e-05, + "loss": 0.021930478513240814, + "step": 151740 + }, + { + "epoch": 21.540099361249112, + "grad_norm": 0.4779832661151886, + "learning_rate": 
7.846912704045423e-05, + "loss": 0.051328796148300174, + "step": 151750 + }, + { + "epoch": 21.54151880766501, + "grad_norm": 0.22907935082912445, + "learning_rate": 7.846770759403833e-05, + "loss": 0.02731609046459198, + "step": 151760 + }, + { + "epoch": 21.54293825408091, + "grad_norm": 0.04998154938220978, + "learning_rate": 7.846628814762244e-05, + "loss": 0.0193104088306427, + "step": 151770 + }, + { + "epoch": 21.544357700496807, + "grad_norm": 1.2074629068374634, + "learning_rate": 7.846486870120652e-05, + "loss": 0.013345879316329957, + "step": 151780 + }, + { + "epoch": 21.545777146912705, + "grad_norm": 6.929937362670898, + "learning_rate": 7.846344925479063e-05, + "loss": 0.0268373966217041, + "step": 151790 + }, + { + "epoch": 21.5471965933286, + "grad_norm": 0.3554370105266571, + "learning_rate": 7.846202980837473e-05, + "loss": 0.028102007508277894, + "step": 151800 + }, + { + "epoch": 21.5486160397445, + "grad_norm": 9.087981224060059, + "learning_rate": 7.846061036195884e-05, + "loss": 0.015488366782665252, + "step": 151810 + }, + { + "epoch": 21.550035486160397, + "grad_norm": 3.680603504180908, + "learning_rate": 7.845919091554295e-05, + "loss": 0.03278346061706543, + "step": 151820 + }, + { + "epoch": 21.551454932576295, + "grad_norm": 6.832675933837891, + "learning_rate": 7.845777146912704e-05, + "loss": 0.00902310311794281, + "step": 151830 + }, + { + "epoch": 21.552874378992193, + "grad_norm": 0.22405454516410828, + "learning_rate": 7.845635202271115e-05, + "loss": 0.004962005466222763, + "step": 151840 + }, + { + "epoch": 21.55429382540809, + "grad_norm": 0.6889753937721252, + "learning_rate": 7.845493257629524e-05, + "loss": 0.03494943082332611, + "step": 151850 + }, + { + "epoch": 21.55571327182399, + "grad_norm": 7.488932132720947, + "learning_rate": 7.845351312987936e-05, + "loss": 0.017930571734905244, + "step": 151860 + }, + { + "epoch": 21.557132718239885, + "grad_norm": 4.345661163330078, + "learning_rate": 7.845209368346345e-05, + 
"loss": 0.03221082091331482, + "step": 151870 + }, + { + "epoch": 21.558552164655783, + "grad_norm": 0.660284161567688, + "learning_rate": 7.845067423704755e-05, + "loss": 0.039191988110542295, + "step": 151880 + }, + { + "epoch": 21.55997161107168, + "grad_norm": 5.562705993652344, + "learning_rate": 7.844925479063165e-05, + "loss": 0.031162354350090026, + "step": 151890 + }, + { + "epoch": 21.56139105748758, + "grad_norm": 5.807373523712158, + "learning_rate": 7.844783534421576e-05, + "loss": 0.04043145477771759, + "step": 151900 + }, + { + "epoch": 21.562810503903478, + "grad_norm": 0.3123059570789337, + "learning_rate": 7.844641589779987e-05, + "loss": 0.0059552215039730075, + "step": 151910 + }, + { + "epoch": 21.564229950319376, + "grad_norm": 0.3118497133255005, + "learning_rate": 7.844499645138397e-05, + "loss": 0.0038182146847248076, + "step": 151920 + }, + { + "epoch": 21.565649396735274, + "grad_norm": 0.498828262090683, + "learning_rate": 7.844357700496806e-05, + "loss": 0.015501269698143005, + "step": 151930 + }, + { + "epoch": 21.56706884315117, + "grad_norm": 0.1011466309428215, + "learning_rate": 7.844215755855216e-05, + "loss": 0.018344078958034516, + "step": 151940 + }, + { + "epoch": 21.568488289567068, + "grad_norm": 0.6332476735115051, + "learning_rate": 7.844073811213627e-05, + "loss": 0.0037667736411094665, + "step": 151950 + }, + { + "epoch": 21.569907735982966, + "grad_norm": 0.02605326659977436, + "learning_rate": 7.843931866572037e-05, + "loss": 0.006295233964920044, + "step": 151960 + }, + { + "epoch": 21.571327182398864, + "grad_norm": 0.06874022632837296, + "learning_rate": 7.843789921930448e-05, + "loss": 0.028352853655815125, + "step": 151970 + }, + { + "epoch": 21.572746628814762, + "grad_norm": 9.683326721191406, + "learning_rate": 7.843647977288858e-05, + "loss": 0.030613276362419128, + "step": 151980 + }, + { + "epoch": 21.57416607523066, + "grad_norm": 0.13800127804279327, + "learning_rate": 7.843506032647268e-05, + "loss": 
0.00311313234269619, + "step": 151990 + }, + { + "epoch": 21.57558552164656, + "grad_norm": 13.856358528137207, + "learning_rate": 7.843364088005679e-05, + "loss": 0.024808631837368013, + "step": 152000 + }, + { + "epoch": 21.57558552164656, + "eval_accuracy": 0.9837222610796719, + "eval_loss": 0.0643128976225853, + "eval_runtime": 32.8673, + "eval_samples_per_second": 478.5, + "eval_steps_per_second": 14.969, + "step": 152000 + }, + { + "epoch": 21.577004968062454, + "grad_norm": 5.568290710449219, + "learning_rate": 7.843222143364088e-05, + "loss": 0.0208395779132843, + "step": 152010 + }, + { + "epoch": 21.578424414478352, + "grad_norm": 12.746216773986816, + "learning_rate": 7.8430801987225e-05, + "loss": 0.02679084837436676, + "step": 152020 + }, + { + "epoch": 21.57984386089425, + "grad_norm": 0.06685172766447067, + "learning_rate": 7.842938254080909e-05, + "loss": 0.015651381015777587, + "step": 152030 + }, + { + "epoch": 21.58126330731015, + "grad_norm": 0.09050565212965012, + "learning_rate": 7.842796309439319e-05, + "loss": 0.014569933712482452, + "step": 152040 + }, + { + "epoch": 21.582682753726047, + "grad_norm": 0.24236080050468445, + "learning_rate": 7.842654364797729e-05, + "loss": 0.021165430545806885, + "step": 152050 + }, + { + "epoch": 21.584102200141945, + "grad_norm": 0.008904325775802135, + "learning_rate": 7.84251242015614e-05, + "loss": 0.010866181552410125, + "step": 152060 + }, + { + "epoch": 21.585521646557844, + "grad_norm": 0.031062092632055283, + "learning_rate": 7.84237047551455e-05, + "loss": 0.004277568310499191, + "step": 152070 + }, + { + "epoch": 21.58694109297374, + "grad_norm": 0.7683753371238708, + "learning_rate": 7.84222853087296e-05, + "loss": 0.02519258260726929, + "step": 152080 + }, + { + "epoch": 21.588360539389637, + "grad_norm": 5.247213363647461, + "learning_rate": 7.84208658623137e-05, + "loss": 0.012705086171627045, + "step": 152090 + }, + { + "epoch": 21.589779985805535, + "grad_norm": 3.2510831356048584, + 
"learning_rate": 7.84194464158978e-05, + "loss": 0.01146530881524086, + "step": 152100 + }, + { + "epoch": 21.591199432221433, + "grad_norm": 0.004748798441141844, + "learning_rate": 7.841802696948191e-05, + "loss": 0.02204233407974243, + "step": 152110 + }, + { + "epoch": 21.59261887863733, + "grad_norm": 3.57722544670105, + "learning_rate": 7.841660752306601e-05, + "loss": 0.009905403852462769, + "step": 152120 + }, + { + "epoch": 21.59403832505323, + "grad_norm": 0.3647531569004059, + "learning_rate": 7.841518807665012e-05, + "loss": 0.0053014099597930905, + "step": 152130 + }, + { + "epoch": 21.59545777146913, + "grad_norm": 0.26090630888938904, + "learning_rate": 7.84137686302342e-05, + "loss": 0.006839233636856079, + "step": 152140 + }, + { + "epoch": 21.596877217885023, + "grad_norm": 12.831460952758789, + "learning_rate": 7.841234918381832e-05, + "loss": 0.03786657452583313, + "step": 152150 + }, + { + "epoch": 21.59829666430092, + "grad_norm": 0.010324862785637379, + "learning_rate": 7.841092973740241e-05, + "loss": 0.04443672299385071, + "step": 152160 + }, + { + "epoch": 21.59971611071682, + "grad_norm": 0.08419083803892136, + "learning_rate": 7.840951029098652e-05, + "loss": 0.02065536379814148, + "step": 152170 + }, + { + "epoch": 21.601135557132718, + "grad_norm": 0.2694663405418396, + "learning_rate": 7.840809084457062e-05, + "loss": 0.006206683814525604, + "step": 152180 + }, + { + "epoch": 21.602555003548616, + "grad_norm": 0.03833192214369774, + "learning_rate": 7.840667139815472e-05, + "loss": 0.003031591698527336, + "step": 152190 + }, + { + "epoch": 21.603974449964515, + "grad_norm": 10.01204776763916, + "learning_rate": 7.840525195173883e-05, + "loss": 0.03839593529701233, + "step": 152200 + }, + { + "epoch": 21.605393896380413, + "grad_norm": 0.04172615706920624, + "learning_rate": 7.840383250532293e-05, + "loss": 0.004294439032673836, + "step": 152210 + }, + { + "epoch": 21.606813342796308, + "grad_norm": 0.07307696342468262, + 
"learning_rate": 7.840241305890704e-05, + "loss": 0.005307265371084213, + "step": 152220 + }, + { + "epoch": 21.608232789212206, + "grad_norm": 0.03262358158826828, + "learning_rate": 7.840099361249113e-05, + "loss": 0.017885203659534454, + "step": 152230 + }, + { + "epoch": 21.609652235628104, + "grad_norm": 0.038288265466690063, + "learning_rate": 7.839957416607523e-05, + "loss": 0.01856917142868042, + "step": 152240 + }, + { + "epoch": 21.611071682044003, + "grad_norm": 1.747052788734436, + "learning_rate": 7.839815471965933e-05, + "loss": 0.002584974840283394, + "step": 152250 + }, + { + "epoch": 21.6124911284599, + "grad_norm": 0.024260008707642555, + "learning_rate": 7.839673527324344e-05, + "loss": 0.031819355487823484, + "step": 152260 + }, + { + "epoch": 21.6139105748758, + "grad_norm": 0.20856183767318726, + "learning_rate": 7.839531582682754e-05, + "loss": 0.010454510152339936, + "step": 152270 + }, + { + "epoch": 21.615330021291697, + "grad_norm": 0.4019526541233063, + "learning_rate": 7.839389638041165e-05, + "loss": 0.02036903202533722, + "step": 152280 + }, + { + "epoch": 21.616749467707596, + "grad_norm": 0.3659839332103729, + "learning_rate": 7.839247693399575e-05, + "loss": 0.0034633234143257143, + "step": 152290 + }, + { + "epoch": 21.61816891412349, + "grad_norm": 0.0339253731071949, + "learning_rate": 7.839105748757984e-05, + "loss": 0.016078473627567293, + "step": 152300 + }, + { + "epoch": 21.61958836053939, + "grad_norm": 12.125288009643555, + "learning_rate": 7.838963804116395e-05, + "loss": 0.040557178854942325, + "step": 152310 + }, + { + "epoch": 21.621007806955287, + "grad_norm": 0.3626663088798523, + "learning_rate": 7.838821859474805e-05, + "loss": 0.008389383554458618, + "step": 152320 + }, + { + "epoch": 21.622427253371185, + "grad_norm": 18.448715209960938, + "learning_rate": 7.838679914833216e-05, + "loss": 0.04948891401290893, + "step": 152330 + }, + { + "epoch": 21.623846699787084, + "grad_norm": 0.020645665004849434, + 
"learning_rate": 7.838537970191626e-05, + "loss": 0.03470139503479004, + "step": 152340 + }, + { + "epoch": 21.625266146202982, + "grad_norm": 0.6421548128128052, + "learning_rate": 7.838396025550036e-05, + "loss": 0.013004614412784577, + "step": 152350 + }, + { + "epoch": 21.62668559261888, + "grad_norm": 6.731452941894531, + "learning_rate": 7.838254080908445e-05, + "loss": 0.005312181636691093, + "step": 152360 + }, + { + "epoch": 21.628105039034775, + "grad_norm": 18.10218048095703, + "learning_rate": 7.838112136266857e-05, + "loss": 0.026407283544540406, + "step": 152370 + }, + { + "epoch": 21.629524485450673, + "grad_norm": 17.99582290649414, + "learning_rate": 7.837970191625266e-05, + "loss": 0.06036527156829834, + "step": 152380 + }, + { + "epoch": 21.63094393186657, + "grad_norm": 2.5905075073242188, + "learning_rate": 7.837828246983677e-05, + "loss": 0.039636701345443726, + "step": 152390 + }, + { + "epoch": 21.63236337828247, + "grad_norm": 2.6859405040740967, + "learning_rate": 7.837686302342087e-05, + "loss": 0.013888399302959441, + "step": 152400 + }, + { + "epoch": 21.63378282469837, + "grad_norm": 0.031518228352069855, + "learning_rate": 7.837544357700497e-05, + "loss": 0.018441928923130034, + "step": 152410 + }, + { + "epoch": 21.635202271114267, + "grad_norm": 0.4661126732826233, + "learning_rate": 7.837402413058908e-05, + "loss": 0.006019876897335052, + "step": 152420 + }, + { + "epoch": 21.636621717530165, + "grad_norm": 0.10166867822408676, + "learning_rate": 7.837260468417318e-05, + "loss": 0.004625126346945762, + "step": 152430 + }, + { + "epoch": 21.63804116394606, + "grad_norm": 0.6484425067901611, + "learning_rate": 7.837118523775729e-05, + "loss": 0.0011087708175182343, + "step": 152440 + }, + { + "epoch": 21.639460610361958, + "grad_norm": 10.967620849609375, + "learning_rate": 7.836976579134137e-05, + "loss": 0.0309456467628479, + "step": 152450 + }, + { + "epoch": 21.640880056777856, + "grad_norm": 4.121605396270752, + "learning_rate": 
7.836834634492548e-05, + "loss": 0.020584002137184143, + "step": 152460 + }, + { + "epoch": 21.642299503193755, + "grad_norm": 0.04061639681458473, + "learning_rate": 7.836692689850958e-05, + "loss": 0.010291200876235963, + "step": 152470 + }, + { + "epoch": 21.643718949609653, + "grad_norm": 0.23950409889221191, + "learning_rate": 7.836550745209369e-05, + "loss": 0.0016857832670211792, + "step": 152480 + }, + { + "epoch": 21.64513839602555, + "grad_norm": 0.42462536692619324, + "learning_rate": 7.836408800567779e-05, + "loss": 0.007773725688457489, + "step": 152490 + }, + { + "epoch": 21.64655784244145, + "grad_norm": 0.3635140657424927, + "learning_rate": 7.836266855926189e-05, + "loss": 0.0369134783744812, + "step": 152500 + }, + { + "epoch": 21.64655784244145, + "eval_accuracy": 0.9891905639982196, + "eval_loss": 0.03835885599255562, + "eval_runtime": 32.9061, + "eval_samples_per_second": 477.936, + "eval_steps_per_second": 14.952, + "step": 152500 + }, + { + "epoch": 21.647977288857344, + "grad_norm": 0.18462832272052765, + "learning_rate": 7.8361249112846e-05, + "loss": 0.012234371900558472, + "step": 152510 + }, + { + "epoch": 21.649396735273243, + "grad_norm": 0.2796073853969574, + "learning_rate": 7.83598296664301e-05, + "loss": 0.020059014856815337, + "step": 152520 + }, + { + "epoch": 21.65081618168914, + "grad_norm": 0.0446401946246624, + "learning_rate": 7.83584102200142e-05, + "loss": 0.001259029284119606, + "step": 152530 + }, + { + "epoch": 21.65223562810504, + "grad_norm": 11.217521667480469, + "learning_rate": 7.83569907735983e-05, + "loss": 0.009736073017120362, + "step": 152540 + }, + { + "epoch": 21.653655074520938, + "grad_norm": 0.11533302068710327, + "learning_rate": 7.83555713271824e-05, + "loss": 0.008176079392433167, + "step": 152550 + }, + { + "epoch": 21.655074520936836, + "grad_norm": 1.5604084730148315, + "learning_rate": 7.83541518807665e-05, + "loss": 0.0175702303647995, + "step": 152560 + }, + { + "epoch": 21.656493967352734, + 
"grad_norm": 0.0766150951385498, + "learning_rate": 7.835273243435061e-05, + "loss": 0.001595323532819748, + "step": 152570 + }, + { + "epoch": 21.65791341376863, + "grad_norm": 10.407796859741211, + "learning_rate": 7.83513129879347e-05, + "loss": 0.02409340739250183, + "step": 152580 + }, + { + "epoch": 21.659332860184527, + "grad_norm": 0.23720917105674744, + "learning_rate": 7.834989354151882e-05, + "loss": 0.031746044754981995, + "step": 152590 + }, + { + "epoch": 21.660752306600425, + "grad_norm": 4.345385551452637, + "learning_rate": 7.834847409510291e-05, + "loss": 0.008757757395505905, + "step": 152600 + }, + { + "epoch": 21.662171753016324, + "grad_norm": 9.527156829833984, + "learning_rate": 7.834705464868701e-05, + "loss": 0.013363993167877198, + "step": 152610 + }, + { + "epoch": 21.663591199432222, + "grad_norm": 1.1009225845336914, + "learning_rate": 7.834563520227112e-05, + "loss": 0.007256819307804108, + "step": 152620 + }, + { + "epoch": 21.66501064584812, + "grad_norm": 12.470254898071289, + "learning_rate": 7.834421575585522e-05, + "loss": 0.04620176553726196, + "step": 152630 + }, + { + "epoch": 21.66643009226402, + "grad_norm": 12.882330894470215, + "learning_rate": 7.834279630943933e-05, + "loss": 0.006510545313358307, + "step": 152640 + }, + { + "epoch": 21.667849538679913, + "grad_norm": 0.03848636895418167, + "learning_rate": 7.834137686302343e-05, + "loss": 0.01155758872628212, + "step": 152650 + }, + { + "epoch": 21.669268985095812, + "grad_norm": 3.6144261360168457, + "learning_rate": 7.833995741660753e-05, + "loss": 0.015034833550453186, + "step": 152660 + }, + { + "epoch": 21.67068843151171, + "grad_norm": 7.527446269989014, + "learning_rate": 7.833853797019162e-05, + "loss": 0.01768469959497452, + "step": 152670 + }, + { + "epoch": 21.67210787792761, + "grad_norm": 0.1264679729938507, + "learning_rate": 7.833711852377573e-05, + "loss": 0.015281379222869873, + "step": 152680 + }, + { + "epoch": 21.673527324343507, + "grad_norm": 
0.018651677295565605, + "learning_rate": 7.833569907735983e-05, + "loss": 0.009713730961084365, + "step": 152690 + }, + { + "epoch": 21.674946770759405, + "grad_norm": 13.745284080505371, + "learning_rate": 7.833427963094394e-05, + "loss": 0.031453061103820804, + "step": 152700 + }, + { + "epoch": 21.676366217175303, + "grad_norm": 18.231679916381836, + "learning_rate": 7.833286018452804e-05, + "loss": 0.033922961354255675, + "step": 152710 + }, + { + "epoch": 21.677785663591198, + "grad_norm": 0.6862601637840271, + "learning_rate": 7.833144073811214e-05, + "loss": 0.0017259906977415084, + "step": 152720 + }, + { + "epoch": 21.679205110007096, + "grad_norm": 0.09152831882238388, + "learning_rate": 7.833002129169625e-05, + "loss": 0.02092452049255371, + "step": 152730 + }, + { + "epoch": 21.680624556422995, + "grad_norm": 11.26236629486084, + "learning_rate": 7.832860184528034e-05, + "loss": 0.005976811051368713, + "step": 152740 + }, + { + "epoch": 21.682044002838893, + "grad_norm": 0.039307139813899994, + "learning_rate": 7.832718239886446e-05, + "loss": 0.012691052258014679, + "step": 152750 + }, + { + "epoch": 21.68346344925479, + "grad_norm": 0.018423497676849365, + "learning_rate": 7.832576295244854e-05, + "loss": 0.0050474081188440325, + "step": 152760 + }, + { + "epoch": 21.68488289567069, + "grad_norm": 7.184344291687012, + "learning_rate": 7.832434350603265e-05, + "loss": 0.027224555611610413, + "step": 152770 + }, + { + "epoch": 21.686302342086588, + "grad_norm": 0.027859846130013466, + "learning_rate": 7.832292405961675e-05, + "loss": 0.04073755145072937, + "step": 152780 + }, + { + "epoch": 21.687721788502483, + "grad_norm": 0.008797813206911087, + "learning_rate": 7.832150461320086e-05, + "loss": 0.0014724902808666228, + "step": 152790 + }, + { + "epoch": 21.68914123491838, + "grad_norm": 0.03329332172870636, + "learning_rate": 7.832008516678496e-05, + "loss": 0.013764767348766327, + "step": 152800 + }, + { + "epoch": 21.69056068133428, + "grad_norm": 
12.390008926391602, + "learning_rate": 7.831866572036905e-05, + "loss": 0.009320709109306335, + "step": 152810 + }, + { + "epoch": 21.691980127750178, + "grad_norm": 0.056433115154504776, + "learning_rate": 7.831724627395316e-05, + "loss": 0.006772693991661072, + "step": 152820 + }, + { + "epoch": 21.693399574166076, + "grad_norm": 0.21056437492370605, + "learning_rate": 7.831582682753726e-05, + "loss": 0.026394689083099367, + "step": 152830 + }, + { + "epoch": 21.694819020581974, + "grad_norm": 0.7948499917984009, + "learning_rate": 7.831440738112137e-05, + "loss": 0.004227866604924202, + "step": 152840 + }, + { + "epoch": 21.696238466997873, + "grad_norm": 0.14679521322250366, + "learning_rate": 7.831298793470547e-05, + "loss": 0.009984910488128662, + "step": 152850 + }, + { + "epoch": 21.697657913413767, + "grad_norm": 2.3353826999664307, + "learning_rate": 7.831156848828957e-05, + "loss": 0.018813665211200713, + "step": 152860 + }, + { + "epoch": 21.699077359829666, + "grad_norm": 0.9101122617721558, + "learning_rate": 7.831014904187366e-05, + "loss": 0.02913129925727844, + "step": 152870 + }, + { + "epoch": 21.700496806245564, + "grad_norm": 0.01335136778652668, + "learning_rate": 7.830872959545778e-05, + "loss": 0.015131418406963349, + "step": 152880 + }, + { + "epoch": 21.701916252661462, + "grad_norm": 1.1140717267990112, + "learning_rate": 7.830731014904187e-05, + "loss": 0.009835602343082428, + "step": 152890 + }, + { + "epoch": 21.70333569907736, + "grad_norm": 0.2199859768152237, + "learning_rate": 7.830589070262598e-05, + "loss": 0.006575032323598862, + "step": 152900 + }, + { + "epoch": 21.70475514549326, + "grad_norm": 0.2876320481300354, + "learning_rate": 7.830447125621008e-05, + "loss": 0.003952596709132195, + "step": 152910 + }, + { + "epoch": 21.706174591909157, + "grad_norm": 5.086370468139648, + "learning_rate": 7.830305180979418e-05, + "loss": 0.015462754666805268, + "step": 152920 + }, + { + "epoch": 21.707594038325052, + "grad_norm": 
12.667468070983887, + "learning_rate": 7.830163236337829e-05, + "loss": 0.025055831670761107, + "step": 152930 + }, + { + "epoch": 21.70901348474095, + "grad_norm": 9.183591842651367, + "learning_rate": 7.830021291696239e-05, + "loss": 0.033831816911697385, + "step": 152940 + }, + { + "epoch": 21.71043293115685, + "grad_norm": 0.04314134269952774, + "learning_rate": 7.82987934705465e-05, + "loss": 0.009031272679567336, + "step": 152950 + }, + { + "epoch": 21.711852377572747, + "grad_norm": 2.467323064804077, + "learning_rate": 7.829737402413058e-05, + "loss": 0.02150009125471115, + "step": 152960 + }, + { + "epoch": 21.713271823988645, + "grad_norm": 1.4363232851028442, + "learning_rate": 7.829595457771469e-05, + "loss": 0.012687674164772034, + "step": 152970 + }, + { + "epoch": 21.714691270404543, + "grad_norm": 0.04794326052069664, + "learning_rate": 7.829453513129879e-05, + "loss": 0.048961618542671205, + "step": 152980 + }, + { + "epoch": 21.71611071682044, + "grad_norm": 0.5843566656112671, + "learning_rate": 7.82931156848829e-05, + "loss": 0.013639800250530243, + "step": 152990 + }, + { + "epoch": 21.717530163236336, + "grad_norm": 0.04825000837445259, + "learning_rate": 7.8291696238467e-05, + "loss": 0.013065311312675475, + "step": 153000 + }, + { + "epoch": 21.717530163236336, + "eval_accuracy": 0.9851847141857951, + "eval_loss": 0.0607418417930603, + "eval_runtime": 32.6736, + "eval_samples_per_second": 481.337, + "eval_steps_per_second": 15.058, + "step": 153000 + }, + { + "epoch": 21.718949609652235, + "grad_norm": 7.110385417938232, + "learning_rate": 7.829027679205111e-05, + "loss": 0.04205000102519989, + "step": 153010 + }, + { + "epoch": 21.720369056068133, + "grad_norm": 0.6613390445709229, + "learning_rate": 7.828885734563521e-05, + "loss": 0.010283558070659638, + "step": 153020 + }, + { + "epoch": 21.72178850248403, + "grad_norm": 2.8529183864593506, + "learning_rate": 7.82874378992193e-05, + "loss": 0.04090785384178162, + "step": 153030 + }, + { 
+ "epoch": 21.72320794889993, + "grad_norm": 1.9200466871261597, + "learning_rate": 7.828601845280342e-05, + "loss": 0.02506008744239807, + "step": 153040 + }, + { + "epoch": 21.724627395315828, + "grad_norm": 0.3326008915901184, + "learning_rate": 7.828459900638751e-05, + "loss": 0.015887264907360078, + "step": 153050 + }, + { + "epoch": 21.726046841731726, + "grad_norm": 3.2874748706817627, + "learning_rate": 7.828317955997162e-05, + "loss": 0.025572916865348815, + "step": 153060 + }, + { + "epoch": 21.72746628814762, + "grad_norm": 5.185192108154297, + "learning_rate": 7.828176011355571e-05, + "loss": 0.03429182767868042, + "step": 153070 + }, + { + "epoch": 21.72888573456352, + "grad_norm": 0.056781888008117676, + "learning_rate": 7.828034066713982e-05, + "loss": 0.004727344214916229, + "step": 153080 + }, + { + "epoch": 21.730305180979418, + "grad_norm": 0.03537728264927864, + "learning_rate": 7.827892122072392e-05, + "loss": 0.021144220232963563, + "step": 153090 + }, + { + "epoch": 21.731724627395316, + "grad_norm": 2.2575128078460693, + "learning_rate": 7.827750177430803e-05, + "loss": 0.04427759349346161, + "step": 153100 + }, + { + "epoch": 21.733144073811214, + "grad_norm": 0.07793278992176056, + "learning_rate": 7.827608232789214e-05, + "loss": 0.0266229510307312, + "step": 153110 + }, + { + "epoch": 21.734563520227113, + "grad_norm": 0.1084359660744667, + "learning_rate": 7.827466288147622e-05, + "loss": 0.013895079493522644, + "step": 153120 + }, + { + "epoch": 21.73598296664301, + "grad_norm": 0.5192191004753113, + "learning_rate": 7.827324343506033e-05, + "loss": 0.021502503752708436, + "step": 153130 + }, + { + "epoch": 21.737402413058906, + "grad_norm": 0.18260230123996735, + "learning_rate": 7.827182398864443e-05, + "loss": 0.019814154505729674, + "step": 153140 + }, + { + "epoch": 21.738821859474804, + "grad_norm": 0.08858510106801987, + "learning_rate": 7.827040454222854e-05, + "loss": 0.008932486921548844, + "step": 153150 + }, + { + "epoch": 
21.740241305890702, + "grad_norm": 9.836788177490234, + "learning_rate": 7.826898509581264e-05, + "loss": 0.011774344742298127, + "step": 153160 + }, + { + "epoch": 21.7416607523066, + "grad_norm": 0.1849929690361023, + "learning_rate": 7.826756564939674e-05, + "loss": 0.0014038391411304474, + "step": 153170 + }, + { + "epoch": 21.7430801987225, + "grad_norm": 0.1347888857126236, + "learning_rate": 7.826614620298083e-05, + "loss": 0.014295504987239837, + "step": 153180 + }, + { + "epoch": 21.744499645138397, + "grad_norm": 0.25591740012168884, + "learning_rate": 7.826472675656494e-05, + "loss": 0.0289584219455719, + "step": 153190 + }, + { + "epoch": 21.745919091554295, + "grad_norm": 0.6944572925567627, + "learning_rate": 7.826330731014904e-05, + "loss": 0.022837814688682557, + "step": 153200 + }, + { + "epoch": 21.74733853797019, + "grad_norm": 0.04656362161040306, + "learning_rate": 7.826188786373315e-05, + "loss": 0.03208479881286621, + "step": 153210 + }, + { + "epoch": 21.74875798438609, + "grad_norm": 0.02454860508441925, + "learning_rate": 7.826046841731725e-05, + "loss": 0.023649747669696807, + "step": 153220 + }, + { + "epoch": 21.750177430801987, + "grad_norm": 0.16942612826824188, + "learning_rate": 7.825904897090135e-05, + "loss": 0.004064076766371727, + "step": 153230 + }, + { + "epoch": 21.751596877217885, + "grad_norm": 3.852083921432495, + "learning_rate": 7.825762952448546e-05, + "loss": 0.008263742923736573, + "step": 153240 + }, + { + "epoch": 21.753016323633783, + "grad_norm": 0.017206136137247086, + "learning_rate": 7.825621007806955e-05, + "loss": 0.0032096982002258303, + "step": 153250 + }, + { + "epoch": 21.75443577004968, + "grad_norm": 0.8438628315925598, + "learning_rate": 7.825479063165367e-05, + "loss": 0.013080710172653198, + "step": 153260 + }, + { + "epoch": 21.75585521646558, + "grad_norm": 0.1992635875940323, + "learning_rate": 7.825337118523775e-05, + "loss": 0.03803298175334931, + "step": 153270 + }, + { + "epoch": 
21.757274662881475, + "grad_norm": 0.5195702910423279, + "learning_rate": 7.825195173882186e-05, + "loss": 0.007099941372871399, + "step": 153280 + }, + { + "epoch": 21.758694109297373, + "grad_norm": 0.12054092437028885, + "learning_rate": 7.825053229240596e-05, + "loss": 0.025510650873184205, + "step": 153290 + }, + { + "epoch": 21.76011355571327, + "grad_norm": 0.06492996215820312, + "learning_rate": 7.824911284599007e-05, + "loss": 0.0024735111743211745, + "step": 153300 + }, + { + "epoch": 21.76153300212917, + "grad_norm": 0.21134309470653534, + "learning_rate": 7.824769339957418e-05, + "loss": 0.01704067438840866, + "step": 153310 + }, + { + "epoch": 21.762952448545068, + "grad_norm": 0.019842656329274178, + "learning_rate": 7.824627395315826e-05, + "loss": 0.02201473116874695, + "step": 153320 + }, + { + "epoch": 21.764371894960966, + "grad_norm": 0.06087081879377365, + "learning_rate": 7.824485450674237e-05, + "loss": 0.0027958892285823824, + "step": 153330 + }, + { + "epoch": 21.765791341376865, + "grad_norm": 0.09023954719305038, + "learning_rate": 7.824343506032647e-05, + "loss": 0.011594323813915253, + "step": 153340 + }, + { + "epoch": 21.76721078779276, + "grad_norm": 0.05745745077729225, + "learning_rate": 7.824201561391058e-05, + "loss": 0.031160730123519897, + "step": 153350 + }, + { + "epoch": 21.768630234208658, + "grad_norm": 5.336764812469482, + "learning_rate": 7.824059616749468e-05, + "loss": 0.055111253261566163, + "step": 153360 + }, + { + "epoch": 21.770049680624556, + "grad_norm": 0.14497631788253784, + "learning_rate": 7.823917672107879e-05, + "loss": 0.028630128502845763, + "step": 153370 + }, + { + "epoch": 21.771469127040454, + "grad_norm": 0.046554479748010635, + "learning_rate": 7.823775727466288e-05, + "loss": 0.022051742672920226, + "step": 153380 + }, + { + "epoch": 21.772888573456353, + "grad_norm": 0.025010576471686363, + "learning_rate": 7.823633782824699e-05, + "loss": 0.013690409064292908, + "step": 153390 + }, + { + 
"epoch": 21.77430801987225, + "grad_norm": 0.5650805234909058, + "learning_rate": 7.82349183818311e-05, + "loss": 0.006429316103458404, + "step": 153400 + }, + { + "epoch": 21.77572746628815, + "grad_norm": 0.007748200558125973, + "learning_rate": 7.82334989354152e-05, + "loss": 0.012419889867305755, + "step": 153410 + }, + { + "epoch": 21.777146912704044, + "grad_norm": 1.962743878364563, + "learning_rate": 7.82320794889993e-05, + "loss": 0.009915488958358764, + "step": 153420 + }, + { + "epoch": 21.778566359119942, + "grad_norm": 0.16568337380886078, + "learning_rate": 7.823066004258339e-05, + "loss": 0.02838837504386902, + "step": 153430 + }, + { + "epoch": 21.77998580553584, + "grad_norm": 2.308868646621704, + "learning_rate": 7.82292405961675e-05, + "loss": 0.004485277086496353, + "step": 153440 + }, + { + "epoch": 21.78140525195174, + "grad_norm": 0.2990691065788269, + "learning_rate": 7.82278211497516e-05, + "loss": 0.0011759202927350998, + "step": 153450 + }, + { + "epoch": 21.782824698367637, + "grad_norm": 6.5357584953308105, + "learning_rate": 7.822640170333571e-05, + "loss": 0.03398991227149963, + "step": 153460 + }, + { + "epoch": 21.784244144783536, + "grad_norm": 14.344533920288086, + "learning_rate": 7.82249822569198e-05, + "loss": 0.06538378596305847, + "step": 153470 + }, + { + "epoch": 21.785663591199434, + "grad_norm": 0.051818374544382095, + "learning_rate": 7.82235628105039e-05, + "loss": 0.0518205463886261, + "step": 153480 + }, + { + "epoch": 21.78708303761533, + "grad_norm": 0.013941051438450813, + "learning_rate": 7.822214336408801e-05, + "loss": 0.03436464667320251, + "step": 153490 + }, + { + "epoch": 21.788502484031227, + "grad_norm": 14.041629791259766, + "learning_rate": 7.822072391767211e-05, + "loss": 0.019104237854480743, + "step": 153500 + }, + { + "epoch": 21.788502484031227, + "eval_accuracy": 0.9867743371272334, + "eval_loss": 0.05121549963951111, + "eval_runtime": 35.8729, + "eval_samples_per_second": 438.409, + 
"eval_steps_per_second": 13.715, + "step": 153500 + }, + { + "epoch": 21.789921930447125, + "grad_norm": 11.241312980651855, + "learning_rate": 7.821930447125622e-05, + "loss": 0.02281576246023178, + "step": 153510 + }, + { + "epoch": 21.791341376863024, + "grad_norm": 8.965463638305664, + "learning_rate": 7.821788502484032e-05, + "loss": 0.027075889706611633, + "step": 153520 + }, + { + "epoch": 21.792760823278922, + "grad_norm": 0.48159852623939514, + "learning_rate": 7.821646557842442e-05, + "loss": 0.002733178809285164, + "step": 153530 + }, + { + "epoch": 21.79418026969482, + "grad_norm": 0.2264951914548874, + "learning_rate": 7.821504613200851e-05, + "loss": 0.023916023969650268, + "step": 153540 + }, + { + "epoch": 21.79559971611072, + "grad_norm": 0.10723818093538284, + "learning_rate": 7.821362668559263e-05, + "loss": 0.0016887500882148744, + "step": 153550 + }, + { + "epoch": 21.797019162526613, + "grad_norm": 0.5163711905479431, + "learning_rate": 7.821220723917672e-05, + "loss": 0.0022620681673288347, + "step": 153560 + }, + { + "epoch": 21.79843860894251, + "grad_norm": 0.25083398818969727, + "learning_rate": 7.821078779276083e-05, + "loss": 0.01218239963054657, + "step": 153570 + }, + { + "epoch": 21.79985805535841, + "grad_norm": 0.05272674188017845, + "learning_rate": 7.820936834634493e-05, + "loss": 0.029647791385650636, + "step": 153580 + }, + { + "epoch": 21.801277501774308, + "grad_norm": 0.1452862173318863, + "learning_rate": 7.820794889992903e-05, + "loss": 0.004344970360398293, + "step": 153590 + }, + { + "epoch": 21.802696948190206, + "grad_norm": 0.5718705058097839, + "learning_rate": 7.820652945351314e-05, + "loss": 0.02483063340187073, + "step": 153600 + }, + { + "epoch": 21.804116394606105, + "grad_norm": 0.11687186360359192, + "learning_rate": 7.820511000709724e-05, + "loss": 0.011093208938837052, + "step": 153610 + }, + { + "epoch": 21.805535841022003, + "grad_norm": 0.17301425337791443, + "learning_rate": 7.820369056068135e-05, + 
"loss": 0.020261986553668974, + "step": 153620 + }, + { + "epoch": 21.806955287437898, + "grad_norm": 4.297011852264404, + "learning_rate": 7.820227111426543e-05, + "loss": 0.011696719378232957, + "step": 153630 + }, + { + "epoch": 21.808374733853796, + "grad_norm": 0.0050639924593269825, + "learning_rate": 7.820085166784954e-05, + "loss": 0.0010669901967048645, + "step": 153640 + }, + { + "epoch": 21.809794180269694, + "grad_norm": 0.01162521168589592, + "learning_rate": 7.819943222143364e-05, + "loss": 0.001954749599099159, + "step": 153650 + }, + { + "epoch": 21.811213626685593, + "grad_norm": 1.3984345197677612, + "learning_rate": 7.819801277501775e-05, + "loss": 0.012833082675933838, + "step": 153660 + }, + { + "epoch": 21.81263307310149, + "grad_norm": 9.12103271484375, + "learning_rate": 7.819659332860185e-05, + "loss": 0.009531941264867783, + "step": 153670 + }, + { + "epoch": 21.81405251951739, + "grad_norm": 0.0724441409111023, + "learning_rate": 7.819517388218595e-05, + "loss": 0.014586853981018066, + "step": 153680 + }, + { + "epoch": 21.815471965933288, + "grad_norm": 1.0554171800613403, + "learning_rate": 7.819375443577006e-05, + "loss": 0.01356469988822937, + "step": 153690 + }, + { + "epoch": 21.816891412349182, + "grad_norm": 0.07517526298761368, + "learning_rate": 7.819233498935415e-05, + "loss": 0.03190800249576568, + "step": 153700 + }, + { + "epoch": 21.81831085876508, + "grad_norm": 6.034252643585205, + "learning_rate": 7.819091554293826e-05, + "loss": 0.027534720301628113, + "step": 153710 + }, + { + "epoch": 21.81973030518098, + "grad_norm": 7.549919128417969, + "learning_rate": 7.818949609652236e-05, + "loss": 0.021649383008480072, + "step": 153720 + }, + { + "epoch": 21.821149751596877, + "grad_norm": 1.6536773443222046, + "learning_rate": 7.818807665010647e-05, + "loss": 0.008604285120964051, + "step": 153730 + }, + { + "epoch": 21.822569198012776, + "grad_norm": 0.340405136346817, + "learning_rate": 7.818665720369056e-05, + "loss": 
0.01318999081850052, + "step": 153740 + }, + { + "epoch": 21.823988644428674, + "grad_norm": 18.807472229003906, + "learning_rate": 7.818523775727467e-05, + "loss": 0.059586310386657716, + "step": 153750 + }, + { + "epoch": 21.825408090844572, + "grad_norm": 2.030266761779785, + "learning_rate": 7.818381831085877e-05, + "loss": 0.04988524913787842, + "step": 153760 + }, + { + "epoch": 21.826827537260467, + "grad_norm": 0.04594357684254646, + "learning_rate": 7.818239886444288e-05, + "loss": 0.004629241675138474, + "step": 153770 + }, + { + "epoch": 21.828246983676365, + "grad_norm": 0.2503243386745453, + "learning_rate": 7.818097941802697e-05, + "loss": 0.0433184415102005, + "step": 153780 + }, + { + "epoch": 21.829666430092264, + "grad_norm": 13.012614250183105, + "learning_rate": 7.817955997161107e-05, + "loss": 0.02149115353822708, + "step": 153790 + }, + { + "epoch": 21.831085876508162, + "grad_norm": 0.09607389569282532, + "learning_rate": 7.817814052519518e-05, + "loss": 0.0733042061328888, + "step": 153800 + }, + { + "epoch": 21.83250532292406, + "grad_norm": 2.308772563934326, + "learning_rate": 7.817672107877928e-05, + "loss": 0.03575766682624817, + "step": 153810 + }, + { + "epoch": 21.83392476933996, + "grad_norm": 2.3713185787200928, + "learning_rate": 7.817530163236339e-05, + "loss": 0.031943559646606445, + "step": 153820 + }, + { + "epoch": 21.835344215755857, + "grad_norm": 0.12006524205207825, + "learning_rate": 7.817388218594749e-05, + "loss": 0.014590749144554138, + "step": 153830 + }, + { + "epoch": 21.83676366217175, + "grad_norm": 0.27648451924324036, + "learning_rate": 7.817246273953158e-05, + "loss": 0.013884636759757995, + "step": 153840 + }, + { + "epoch": 21.83818310858765, + "grad_norm": 0.5197448134422302, + "learning_rate": 7.817104329311568e-05, + "loss": 0.017557638883590698, + "step": 153850 + }, + { + "epoch": 21.839602555003548, + "grad_norm": 0.04726022481918335, + "learning_rate": 7.816962384669979e-05, + "loss": 
0.01442246288061142, + "step": 153860 + }, + { + "epoch": 21.841022001419446, + "grad_norm": 0.04562228173017502, + "learning_rate": 7.816820440028389e-05, + "loss": 0.03814417719841003, + "step": 153870 + }, + { + "epoch": 21.842441447835345, + "grad_norm": 0.03501862660050392, + "learning_rate": 7.8166784953868e-05, + "loss": 0.0066195674240589145, + "step": 153880 + }, + { + "epoch": 21.843860894251243, + "grad_norm": 0.043678659945726395, + "learning_rate": 7.81653655074521e-05, + "loss": 0.012177056074142456, + "step": 153890 + }, + { + "epoch": 21.84528034066714, + "grad_norm": 0.5099119544029236, + "learning_rate": 7.81639460610362e-05, + "loss": 0.02323257625102997, + "step": 153900 + }, + { + "epoch": 21.846699787083036, + "grad_norm": 7.5391364097595215, + "learning_rate": 7.816252661462031e-05, + "loss": 0.018948441743850707, + "step": 153910 + }, + { + "epoch": 21.848119233498934, + "grad_norm": 7.553627014160156, + "learning_rate": 7.81611071682044e-05, + "loss": 0.02902989387512207, + "step": 153920 + }, + { + "epoch": 21.849538679914833, + "grad_norm": 0.03822808340191841, + "learning_rate": 7.815968772178852e-05, + "loss": 0.018215447664260864, + "step": 153930 + }, + { + "epoch": 21.85095812633073, + "grad_norm": 0.015239638276398182, + "learning_rate": 7.81582682753726e-05, + "loss": 0.014152388274669647, + "step": 153940 + }, + { + "epoch": 21.85237757274663, + "grad_norm": 11.15691089630127, + "learning_rate": 7.815684882895671e-05, + "loss": 0.017082099616527558, + "step": 153950 + }, + { + "epoch": 21.853797019162528, + "grad_norm": 0.4507918655872345, + "learning_rate": 7.815542938254081e-05, + "loss": 0.020993798971176147, + "step": 153960 + }, + { + "epoch": 21.855216465578426, + "grad_norm": 0.5891238451004028, + "learning_rate": 7.815400993612492e-05, + "loss": 0.0020621318370103838, + "step": 153970 + }, + { + "epoch": 21.85663591199432, + "grad_norm": 1.1228102445602417, + "learning_rate": 7.815259048970902e-05, + "loss": 
0.020698103308677673, + "step": 153980 + }, + { + "epoch": 21.85805535841022, + "grad_norm": 0.1619359850883484, + "learning_rate": 7.815117104329311e-05, + "loss": 0.00960942655801773, + "step": 153990 + }, + { + "epoch": 21.859474804826117, + "grad_norm": 0.05868132412433624, + "learning_rate": 7.814975159687722e-05, + "loss": 0.04172232151031494, + "step": 154000 + }, + { + "epoch": 21.859474804826117, + "eval_accuracy": 0.9889362243275895, + "eval_loss": 0.03679247573018074, + "eval_runtime": 32.566, + "eval_samples_per_second": 482.926, + "eval_steps_per_second": 15.108, + "step": 154000 + }, + { + "epoch": 21.860894251242016, + "grad_norm": 2.9139366149902344, + "learning_rate": 7.814833215046132e-05, + "loss": 0.024889492988586427, + "step": 154010 + }, + { + "epoch": 21.862313697657914, + "grad_norm": 14.321552276611328, + "learning_rate": 7.814691270404543e-05, + "loss": 0.05515966415405273, + "step": 154020 + }, + { + "epoch": 21.863733144073812, + "grad_norm": 0.34195181727409363, + "learning_rate": 7.814549325762953e-05, + "loss": 0.01406717300415039, + "step": 154030 + }, + { + "epoch": 21.86515259048971, + "grad_norm": 1.4232006072998047, + "learning_rate": 7.814407381121363e-05, + "loss": 0.030384355783462526, + "step": 154040 + }, + { + "epoch": 21.866572036905605, + "grad_norm": 0.09030896425247192, + "learning_rate": 7.814265436479772e-05, + "loss": 0.01863052248954773, + "step": 154050 + }, + { + "epoch": 21.867991483321504, + "grad_norm": 0.04144523665308952, + "learning_rate": 7.814123491838184e-05, + "loss": 0.011323368549346924, + "step": 154060 + }, + { + "epoch": 21.869410929737402, + "grad_norm": 0.37034347653388977, + "learning_rate": 7.813981547196593e-05, + "loss": 0.011937695741653442, + "step": 154070 + }, + { + "epoch": 21.8708303761533, + "grad_norm": 3.7782363891601562, + "learning_rate": 7.813839602555004e-05, + "loss": 0.02044772505760193, + "step": 154080 + }, + { + "epoch": 21.8722498225692, + "grad_norm": 0.13608616590499878, 
+ "learning_rate": 7.813697657913414e-05, + "loss": 0.016649089753627777, + "step": 154090 + }, + { + "epoch": 21.873669268985097, + "grad_norm": 0.12344853579998016, + "learning_rate": 7.813555713271824e-05, + "loss": 0.013016656041145325, + "step": 154100 + }, + { + "epoch": 21.875088715400995, + "grad_norm": 0.27702587842941284, + "learning_rate": 7.813413768630235e-05, + "loss": 0.009413078427314758, + "step": 154110 + }, + { + "epoch": 21.87650816181689, + "grad_norm": 0.018509123474359512, + "learning_rate": 7.813271823988645e-05, + "loss": 0.005292629078030586, + "step": 154120 + }, + { + "epoch": 21.87792760823279, + "grad_norm": 0.5372377038002014, + "learning_rate": 7.813129879347056e-05, + "loss": 0.023788055777549742, + "step": 154130 + }, + { + "epoch": 21.879347054648687, + "grad_norm": 2.963995933532715, + "learning_rate": 7.812987934705466e-05, + "loss": 0.01722568869590759, + "step": 154140 + }, + { + "epoch": 21.880766501064585, + "grad_norm": 1.9832671880722046, + "learning_rate": 7.812845990063875e-05, + "loss": 0.0031067762523889542, + "step": 154150 + }, + { + "epoch": 21.882185947480483, + "grad_norm": 0.6360449194908142, + "learning_rate": 7.812704045422285e-05, + "loss": 0.01643763333559036, + "step": 154160 + }, + { + "epoch": 21.88360539389638, + "grad_norm": 2.5252861976623535, + "learning_rate": 7.812562100780696e-05, + "loss": 0.020189228653907775, + "step": 154170 + }, + { + "epoch": 21.88502484031228, + "grad_norm": 0.014000259339809418, + "learning_rate": 7.812420156139106e-05, + "loss": 0.03599994480609894, + "step": 154180 + }, + { + "epoch": 21.886444286728175, + "grad_norm": 0.9683640003204346, + "learning_rate": 7.812278211497517e-05, + "loss": 0.010131759941577912, + "step": 154190 + }, + { + "epoch": 21.887863733144073, + "grad_norm": 5.502777099609375, + "learning_rate": 7.812136266855927e-05, + "loss": 0.007615327835083008, + "step": 154200 + }, + { + "epoch": 21.88928317955997, + "grad_norm": 0.09901970624923706, + 
"learning_rate": 7.811994322214336e-05, + "loss": 0.05956762433052063, + "step": 154210 + }, + { + "epoch": 21.89070262597587, + "grad_norm": 0.07101402431726456, + "learning_rate": 7.811852377572747e-05, + "loss": 0.03807120621204376, + "step": 154220 + }, + { + "epoch": 21.892122072391768, + "grad_norm": 0.17352159321308136, + "learning_rate": 7.811710432931157e-05, + "loss": 0.037606841325759886, + "step": 154230 + }, + { + "epoch": 21.893541518807666, + "grad_norm": 0.03480452671647072, + "learning_rate": 7.811568488289568e-05, + "loss": 0.02123567759990692, + "step": 154240 + }, + { + "epoch": 21.894960965223564, + "grad_norm": 1.6938343048095703, + "learning_rate": 7.811426543647977e-05, + "loss": 0.009575704485177994, + "step": 154250 + }, + { + "epoch": 21.89638041163946, + "grad_norm": 5.563751220703125, + "learning_rate": 7.811284599006388e-05, + "loss": 0.014619916677474976, + "step": 154260 + }, + { + "epoch": 21.897799858055357, + "grad_norm": 3.6850554943084717, + "learning_rate": 7.811142654364798e-05, + "loss": 0.012736782431602478, + "step": 154270 + }, + { + "epoch": 21.899219304471256, + "grad_norm": 0.18301618099212646, + "learning_rate": 7.811000709723209e-05, + "loss": 0.04421873688697815, + "step": 154280 + }, + { + "epoch": 21.900638750887154, + "grad_norm": 1.3291219472885132, + "learning_rate": 7.810858765081618e-05, + "loss": 0.014394429326057435, + "step": 154290 + }, + { + "epoch": 21.902058197303052, + "grad_norm": 9.68277645111084, + "learning_rate": 7.810716820440028e-05, + "loss": 0.024882644414901733, + "step": 154300 + }, + { + "epoch": 21.90347764371895, + "grad_norm": 2.079035997390747, + "learning_rate": 7.810574875798439e-05, + "loss": 0.0065599560737609865, + "step": 154310 + }, + { + "epoch": 21.90489709013485, + "grad_norm": 0.15813897550106049, + "learning_rate": 7.810432931156849e-05, + "loss": 0.002763701230287552, + "step": 154320 + }, + { + "epoch": 21.906316536550744, + "grad_norm": 15.168010711669922, + 
"learning_rate": 7.81029098651526e-05, + "loss": 0.0380634069442749, + "step": 154330 + }, + { + "epoch": 21.907735982966642, + "grad_norm": 0.3788428008556366, + "learning_rate": 7.81014904187367e-05, + "loss": 0.002792154997587204, + "step": 154340 + }, + { + "epoch": 21.90915542938254, + "grad_norm": 0.1061665341258049, + "learning_rate": 7.81000709723208e-05, + "loss": 0.010237842798233032, + "step": 154350 + }, + { + "epoch": 21.91057487579844, + "grad_norm": 0.2932637631893158, + "learning_rate": 7.809865152590489e-05, + "loss": 0.00483965128660202, + "step": 154360 + }, + { + "epoch": 21.911994322214337, + "grad_norm": 0.6183751225471497, + "learning_rate": 7.8097232079489e-05, + "loss": 0.009147481620311737, + "step": 154370 + }, + { + "epoch": 21.913413768630235, + "grad_norm": 1.8843038082122803, + "learning_rate": 7.80958126330731e-05, + "loss": 0.02588422894477844, + "step": 154380 + }, + { + "epoch": 21.914833215046134, + "grad_norm": 0.3445003926753998, + "learning_rate": 7.809439318665721e-05, + "loss": 0.01821531355381012, + "step": 154390 + }, + { + "epoch": 21.91625266146203, + "grad_norm": 6.810329437255859, + "learning_rate": 7.809297374024131e-05, + "loss": 0.014560402929782867, + "step": 154400 + }, + { + "epoch": 21.917672107877927, + "grad_norm": 0.10481465607881546, + "learning_rate": 7.80915542938254e-05, + "loss": 0.006879754364490509, + "step": 154410 + }, + { + "epoch": 21.919091554293825, + "grad_norm": 0.07877585291862488, + "learning_rate": 7.809013484740952e-05, + "loss": 0.0070834577083587645, + "step": 154420 + }, + { + "epoch": 21.920511000709723, + "grad_norm": 0.13409355282783508, + "learning_rate": 7.808871540099361e-05, + "loss": 0.003646743297576904, + "step": 154430 + }, + { + "epoch": 21.92193044712562, + "grad_norm": 0.06437573581933975, + "learning_rate": 7.808729595457773e-05, + "loss": 0.0024181947112083433, + "step": 154440 + }, + { + "epoch": 21.92334989354152, + "grad_norm": 0.24486714601516724, + "learning_rate": 
7.808587650816182e-05, + "loss": 0.04109184443950653, + "step": 154450 + }, + { + "epoch": 21.924769339957418, + "grad_norm": 1.623465657234192, + "learning_rate": 7.808459900638752e-05, + "loss": 0.043286874890327454, + "step": 154460 + }, + { + "epoch": 21.926188786373313, + "grad_norm": 9.09732723236084, + "learning_rate": 7.808317955997162e-05, + "loss": 0.04276767373085022, + "step": 154470 + }, + { + "epoch": 21.92760823278921, + "grad_norm": 0.1123630627989769, + "learning_rate": 7.808176011355572e-05, + "loss": 0.03018498718738556, + "step": 154480 + }, + { + "epoch": 21.92902767920511, + "grad_norm": 1.2968755960464478, + "learning_rate": 7.808034066713981e-05, + "loss": 0.00842936933040619, + "step": 154490 + }, + { + "epoch": 21.930447125621008, + "grad_norm": 6.410153388977051, + "learning_rate": 7.807892122072392e-05, + "loss": 0.029785120487213136, + "step": 154500 + }, + { + "epoch": 21.930447125621008, + "eval_accuracy": 0.9849939594328225, + "eval_loss": 0.055502839386463165, + "eval_runtime": 30.8404, + "eval_samples_per_second": 509.948, + "eval_steps_per_second": 15.953, + "step": 154500 + }, + { + "epoch": 21.931866572036906, + "grad_norm": 2.214832305908203, + "learning_rate": 7.807750177430802e-05, + "loss": 0.0024904105812311172, + "step": 154510 + }, + { + "epoch": 21.933286018452804, + "grad_norm": 0.6666872501373291, + "learning_rate": 7.807608232789213e-05, + "loss": 0.016865538060665132, + "step": 154520 + }, + { + "epoch": 21.934705464868703, + "grad_norm": 0.034638773649930954, + "learning_rate": 7.807466288147623e-05, + "loss": 0.03732748031616211, + "step": 154530 + }, + { + "epoch": 21.936124911284598, + "grad_norm": 0.013335819356143475, + "learning_rate": 7.807324343506033e-05, + "loss": 0.03298913538455963, + "step": 154540 + }, + { + "epoch": 21.937544357700496, + "grad_norm": 1.7169420719146729, + "learning_rate": 7.807182398864444e-05, + "loss": 0.01729605495929718, + "step": 154550 + }, + { + "epoch": 21.938963804116394, + 
"grad_norm": 2.5404300689697266, + "learning_rate": 7.807040454222854e-05, + "loss": 0.024908363819122314, + "step": 154560 + }, + { + "epoch": 21.940383250532292, + "grad_norm": 0.19617094099521637, + "learning_rate": 7.806898509581265e-05, + "loss": 0.019650378823280336, + "step": 154570 + }, + { + "epoch": 21.94180269694819, + "grad_norm": 0.06800790876150131, + "learning_rate": 7.806756564939673e-05, + "loss": 0.029100710153579713, + "step": 154580 + }, + { + "epoch": 21.94322214336409, + "grad_norm": 0.03311114013195038, + "learning_rate": 7.806614620298084e-05, + "loss": 0.039703932404518125, + "step": 154590 + }, + { + "epoch": 21.944641589779987, + "grad_norm": 1.2995303869247437, + "learning_rate": 7.806472675656494e-05, + "loss": 0.014335697889328003, + "step": 154600 + }, + { + "epoch": 21.946061036195882, + "grad_norm": 0.0780157744884491, + "learning_rate": 7.806330731014905e-05, + "loss": 0.04504201710224152, + "step": 154610 + }, + { + "epoch": 21.94748048261178, + "grad_norm": 0.02289555035531521, + "learning_rate": 7.806188786373315e-05, + "loss": 0.05627798438072205, + "step": 154620 + }, + { + "epoch": 21.94889992902768, + "grad_norm": 2.8841443061828613, + "learning_rate": 7.806046841731724e-05, + "loss": 0.015477654337882996, + "step": 154630 + }, + { + "epoch": 21.950319375443577, + "grad_norm": 0.0059392815455794334, + "learning_rate": 7.805904897090136e-05, + "loss": 0.01016545444726944, + "step": 154640 + }, + { + "epoch": 21.951738821859475, + "grad_norm": 3.1028687953948975, + "learning_rate": 7.805762952448545e-05, + "loss": 0.011103053390979768, + "step": 154650 + }, + { + "epoch": 21.953158268275374, + "grad_norm": 0.3804731070995331, + "learning_rate": 7.805621007806956e-05, + "loss": 0.011945173144340515, + "step": 154660 + }, + { + "epoch": 21.954577714691272, + "grad_norm": 3.268479824066162, + "learning_rate": 7.805479063165366e-05, + "loss": 0.0068371020257472995, + "step": 154670 + }, + { + "epoch": 21.955997161107167, + 
"grad_norm": 0.09896805137395859, + "learning_rate": 7.805337118523776e-05, + "loss": 0.007723100483417511, + "step": 154680 + }, + { + "epoch": 21.957416607523065, + "grad_norm": 13.422646522521973, + "learning_rate": 7.805195173882186e-05, + "loss": 0.05607501268386841, + "step": 154690 + }, + { + "epoch": 21.958836053938963, + "grad_norm": 2.0482535362243652, + "learning_rate": 7.805053229240597e-05, + "loss": 0.009087377041578294, + "step": 154700 + }, + { + "epoch": 21.96025550035486, + "grad_norm": 0.26454684138298035, + "learning_rate": 7.804911284599006e-05, + "loss": 0.010782875120639801, + "step": 154710 + }, + { + "epoch": 21.96167494677076, + "grad_norm": 0.24704919755458832, + "learning_rate": 7.804769339957417e-05, + "loss": 0.016835980117321014, + "step": 154720 + }, + { + "epoch": 21.96309439318666, + "grad_norm": 0.2495969831943512, + "learning_rate": 7.804627395315827e-05, + "loss": 0.024556544423103333, + "step": 154730 + }, + { + "epoch": 21.964513839602557, + "grad_norm": 0.10143609344959259, + "learning_rate": 7.804485450674237e-05, + "loss": 0.002733834460377693, + "step": 154740 + }, + { + "epoch": 21.96593328601845, + "grad_norm": 0.8265405297279358, + "learning_rate": 7.804343506032648e-05, + "loss": 0.021858350932598115, + "step": 154750 + }, + { + "epoch": 21.96735273243435, + "grad_norm": 8.584877014160156, + "learning_rate": 7.804201561391058e-05, + "loss": 0.03559152483940124, + "step": 154760 + }, + { + "epoch": 21.968772178850248, + "grad_norm": 0.08707303553819656, + "learning_rate": 7.804059616749469e-05, + "loss": 0.004393692687153816, + "step": 154770 + }, + { + "epoch": 21.970191625266146, + "grad_norm": 0.012651021592319012, + "learning_rate": 7.803917672107879e-05, + "loss": 0.016604630649089812, + "step": 154780 + }, + { + "epoch": 21.971611071682045, + "grad_norm": 0.004768799990415573, + "learning_rate": 7.803775727466288e-05, + "loss": 0.056801730394363405, + "step": 154790 + }, + { + "epoch": 21.973030518097943, + 
"grad_norm": 0.4480474591255188, + "learning_rate": 7.803633782824698e-05, + "loss": 0.035220798850059507, + "step": 154800 + }, + { + "epoch": 21.97444996451384, + "grad_norm": 0.21288400888442993, + "learning_rate": 7.803491838183109e-05, + "loss": 0.01290290653705597, + "step": 154810 + }, + { + "epoch": 21.975869410929736, + "grad_norm": 0.025511683896183968, + "learning_rate": 7.803349893541519e-05, + "loss": 0.011616971343755722, + "step": 154820 + }, + { + "epoch": 21.977288857345634, + "grad_norm": 0.034911077469587326, + "learning_rate": 7.80320794889993e-05, + "loss": 0.008030341565608978, + "step": 154830 + }, + { + "epoch": 21.978708303761533, + "grad_norm": 1.0137912034988403, + "learning_rate": 7.80306600425834e-05, + "loss": 0.01242779940366745, + "step": 154840 + }, + { + "epoch": 21.98012775017743, + "grad_norm": 7.8683295249938965, + "learning_rate": 7.80292405961675e-05, + "loss": 0.032010632753372195, + "step": 154850 + }, + { + "epoch": 21.98154719659333, + "grad_norm": 0.04715187847614288, + "learning_rate": 7.80278211497516e-05, + "loss": 0.017515525221824646, + "step": 154860 + }, + { + "epoch": 21.982966643009227, + "grad_norm": 0.2354537546634674, + "learning_rate": 7.80264017033357e-05, + "loss": 0.027616971731185914, + "step": 154870 + }, + { + "epoch": 21.984386089425126, + "grad_norm": 0.28635096549987793, + "learning_rate": 7.802498225691981e-05, + "loss": 0.0351090282201767, + "step": 154880 + }, + { + "epoch": 21.98580553584102, + "grad_norm": 0.05818726867437363, + "learning_rate": 7.80235628105039e-05, + "loss": 0.011704829335212708, + "step": 154890 + }, + { + "epoch": 21.98722498225692, + "grad_norm": 11.157441139221191, + "learning_rate": 7.802214336408801e-05, + "loss": 0.02067708820104599, + "step": 154900 + }, + { + "epoch": 21.988644428672817, + "grad_norm": 0.09199775010347366, + "learning_rate": 7.80207239176721e-05, + "loss": 0.02338854968547821, + "step": 154910 + }, + { + "epoch": 21.990063875088715, + "grad_norm": 
0.391520231962204, + "learning_rate": 7.801930447125622e-05, + "loss": 0.030699992179870607, + "step": 154920 + }, + { + "epoch": 21.991483321504614, + "grad_norm": 0.27043333649635315, + "learning_rate": 7.801788502484031e-05, + "loss": 0.011358876526355744, + "step": 154930 + }, + { + "epoch": 21.992902767920512, + "grad_norm": 4.141388416290283, + "learning_rate": 7.801646557842441e-05, + "loss": 0.016875098645687103, + "step": 154940 + }, + { + "epoch": 21.99432221433641, + "grad_norm": 0.01946868561208248, + "learning_rate": 7.801504613200852e-05, + "loss": 0.013694144785404205, + "step": 154950 + }, + { + "epoch": 21.995741660752305, + "grad_norm": 0.2421337515115738, + "learning_rate": 7.801362668559262e-05, + "loss": 0.01427844762802124, + "step": 154960 + }, + { + "epoch": 21.997161107168203, + "grad_norm": 0.4987391531467438, + "learning_rate": 7.801220723917673e-05, + "loss": 0.009556710720062256, + "step": 154970 + }, + { + "epoch": 21.9985805535841, + "grad_norm": 3.287604331970215, + "learning_rate": 7.801078779276083e-05, + "loss": 0.004888007044792175, + "step": 154980 + }, + { + "epoch": 22.0, + "grad_norm": 0.33692628145217896, + "learning_rate": 7.800936834634493e-05, + "loss": 0.0034068193286657333, + "step": 154990 + }, + { + "epoch": 22.0014194464159, + "grad_norm": 0.5378328561782837, + "learning_rate": 7.800794889992902e-05, + "loss": 0.008489087224006653, + "step": 155000 + }, + { + "epoch": 22.0014194464159, + "eval_accuracy": 0.9876009410567813, + "eval_loss": 0.045378126204013824, + "eval_runtime": 31.4507, + "eval_samples_per_second": 500.053, + "eval_steps_per_second": 15.644, + "step": 155000 + }, + { + "epoch": 22.002838892831797, + "grad_norm": 0.517014741897583, + "learning_rate": 7.800652945351313e-05, + "loss": 0.003619857132434845, + "step": 155010 + }, + { + "epoch": 22.004258339247695, + "grad_norm": 0.2467700093984604, + "learning_rate": 7.800511000709723e-05, + "loss": 0.005431044474244117, + "step": 155020 + }, + { + 
"epoch": 22.00567778566359, + "grad_norm": 0.6217114925384521, + "learning_rate": 7.800369056068134e-05, + "loss": 0.004910755157470703, + "step": 155030 + }, + { + "epoch": 22.007097232079488, + "grad_norm": 0.05812761187553406, + "learning_rate": 7.800227111426544e-05, + "loss": 0.0025191348046064376, + "step": 155040 + }, + { + "epoch": 22.008516678495386, + "grad_norm": 0.09940718859434128, + "learning_rate": 7.800085166784954e-05, + "loss": 0.03566046059131622, + "step": 155050 + }, + { + "epoch": 22.009936124911285, + "grad_norm": 0.022134188562631607, + "learning_rate": 7.799943222143365e-05, + "loss": 0.009848525375127792, + "step": 155060 + }, + { + "epoch": 22.011355571327183, + "grad_norm": 1.8984607458114624, + "learning_rate": 7.799801277501775e-05, + "loss": 0.0072865508496761325, + "step": 155070 + }, + { + "epoch": 22.01277501774308, + "grad_norm": 1.6500037908554077, + "learning_rate": 7.799659332860186e-05, + "loss": 0.01307196319103241, + "step": 155080 + }, + { + "epoch": 22.01419446415898, + "grad_norm": 1.2104592323303223, + "learning_rate": 7.799517388218594e-05, + "loss": 0.029790017008781432, + "step": 155090 + }, + { + "epoch": 22.015613910574874, + "grad_norm": 11.868368148803711, + "learning_rate": 7.799375443577005e-05, + "loss": 0.03271047472953796, + "step": 155100 + }, + { + "epoch": 22.017033356990773, + "grad_norm": 0.30061787366867065, + "learning_rate": 7.799233498935415e-05, + "loss": 0.008466323465108871, + "step": 155110 + }, + { + "epoch": 22.01845280340667, + "grad_norm": 0.18652912974357605, + "learning_rate": 7.799091554293826e-05, + "loss": 0.023596185445785522, + "step": 155120 + }, + { + "epoch": 22.01987224982257, + "grad_norm": 0.5794810652732849, + "learning_rate": 7.798949609652236e-05, + "loss": 0.010494596511125564, + "step": 155130 + }, + { + "epoch": 22.021291696238467, + "grad_norm": 11.264876365661621, + "learning_rate": 7.798807665010647e-05, + "loss": 0.026669526100158693, + "step": 155140 + }, + { + 
"epoch": 22.022711142654366, + "grad_norm": 0.9030976295471191, + "learning_rate": 7.798665720369057e-05, + "loss": 0.016747140884399415, + "step": 155150 + }, + { + "epoch": 22.024130589070264, + "grad_norm": 0.01608671434223652, + "learning_rate": 7.798523775727466e-05, + "loss": 0.013013333082199097, + "step": 155160 + }, + { + "epoch": 22.02555003548616, + "grad_norm": 0.04131733253598213, + "learning_rate": 7.798381831085877e-05, + "loss": 0.03603638708591461, + "step": 155170 + }, + { + "epoch": 22.026969481902057, + "grad_norm": 9.654023170471191, + "learning_rate": 7.798239886444287e-05, + "loss": 0.047304791212081906, + "step": 155180 + }, + { + "epoch": 22.028388928317955, + "grad_norm": 0.5198435187339783, + "learning_rate": 7.798097941802698e-05, + "loss": 0.005861113592982292, + "step": 155190 + }, + { + "epoch": 22.029808374733854, + "grad_norm": 0.008671704679727554, + "learning_rate": 7.797955997161107e-05, + "loss": 0.010075002908706665, + "step": 155200 + }, + { + "epoch": 22.031227821149752, + "grad_norm": 0.005082256160676479, + "learning_rate": 7.797814052519518e-05, + "loss": 0.0028138749301433565, + "step": 155210 + }, + { + "epoch": 22.03264726756565, + "grad_norm": 0.27768898010253906, + "learning_rate": 7.797672107877927e-05, + "loss": 0.03913818895816803, + "step": 155220 + }, + { + "epoch": 22.03406671398155, + "grad_norm": 0.01929730363190174, + "learning_rate": 7.797530163236339e-05, + "loss": 0.011073459684848786, + "step": 155230 + }, + { + "epoch": 22.035486160397443, + "grad_norm": 0.4176993668079376, + "learning_rate": 7.797388218594748e-05, + "loss": 0.0042030222713947294, + "step": 155240 + }, + { + "epoch": 22.03690560681334, + "grad_norm": 7.411360263824463, + "learning_rate": 7.797246273953158e-05, + "loss": 0.07728085517883301, + "step": 155250 + }, + { + "epoch": 22.03832505322924, + "grad_norm": 0.8839936852455139, + "learning_rate": 7.797104329311569e-05, + "loss": 0.007268577069044113, + "step": 155260 + }, + { + 
"epoch": 22.03974449964514, + "grad_norm": 0.004229824058711529, + "learning_rate": 7.796962384669979e-05, + "loss": 0.010494904220104217, + "step": 155270 + }, + { + "epoch": 22.041163946061037, + "grad_norm": 0.007509848568588495, + "learning_rate": 7.79682044002839e-05, + "loss": 0.013756307959556579, + "step": 155280 + }, + { + "epoch": 22.042583392476935, + "grad_norm": 2.2406997680664062, + "learning_rate": 7.7966784953868e-05, + "loss": 0.04622211754322052, + "step": 155290 + }, + { + "epoch": 22.044002838892833, + "grad_norm": 0.1742996722459793, + "learning_rate": 7.79653655074521e-05, + "loss": 0.0646554946899414, + "step": 155300 + }, + { + "epoch": 22.045422285308728, + "grad_norm": 0.39729222655296326, + "learning_rate": 7.796394606103619e-05, + "loss": 0.008759084343910217, + "step": 155310 + }, + { + "epoch": 22.046841731724626, + "grad_norm": 11.189043998718262, + "learning_rate": 7.79625266146203e-05, + "loss": 0.09578610062599183, + "step": 155320 + }, + { + "epoch": 22.048261178140525, + "grad_norm": 2.2552695274353027, + "learning_rate": 7.79611071682044e-05, + "loss": 0.03078637719154358, + "step": 155330 + }, + { + "epoch": 22.049680624556423, + "grad_norm": 16.450414657592773, + "learning_rate": 7.795968772178851e-05, + "loss": 0.053510856628417966, + "step": 155340 + }, + { + "epoch": 22.05110007097232, + "grad_norm": 0.1485617756843567, + "learning_rate": 7.795826827537261e-05, + "loss": 0.0057105042040348055, + "step": 155350 + }, + { + "epoch": 22.05251951738822, + "grad_norm": 0.3045009970664978, + "learning_rate": 7.79568488289567e-05, + "loss": 0.004956775158643722, + "step": 155360 + }, + { + "epoch": 22.053938963804118, + "grad_norm": 2.128180980682373, + "learning_rate": 7.795542938254082e-05, + "loss": 0.006717742979526519, + "step": 155370 + }, + { + "epoch": 22.055358410220013, + "grad_norm": 2.804896831512451, + "learning_rate": 7.795400993612491e-05, + "loss": 0.005779685080051422, + "step": 155380 + }, + { + "epoch": 
22.05677785663591, + "grad_norm": 0.546726405620575, + "learning_rate": 7.795259048970902e-05, + "loss": 0.012899559736251832, + "step": 155390 + }, + { + "epoch": 22.05819730305181, + "grad_norm": 0.16220535337924957, + "learning_rate": 7.795117104329311e-05, + "loss": 0.013610082864761352, + "step": 155400 + }, + { + "epoch": 22.059616749467708, + "grad_norm": 10.711543083190918, + "learning_rate": 7.794975159687722e-05, + "loss": 0.018066035211086275, + "step": 155410 + }, + { + "epoch": 22.061036195883606, + "grad_norm": 0.4655384421348572, + "learning_rate": 7.794833215046132e-05, + "loss": 0.006116500124335289, + "step": 155420 + }, + { + "epoch": 22.062455642299504, + "grad_norm": 0.0061499085277318954, + "learning_rate": 7.794691270404543e-05, + "loss": 0.016710303723812103, + "step": 155430 + }, + { + "epoch": 22.063875088715402, + "grad_norm": 0.04304055497050285, + "learning_rate": 7.794549325762952e-05, + "loss": 0.022531065344810485, + "step": 155440 + }, + { + "epoch": 22.065294535131297, + "grad_norm": 2.517754316329956, + "learning_rate": 7.794407381121364e-05, + "loss": 0.020814283192157744, + "step": 155450 + }, + { + "epoch": 22.066713981547196, + "grad_norm": 0.05555732548236847, + "learning_rate": 7.794265436479773e-05, + "loss": 0.010783886909484864, + "step": 155460 + }, + { + "epoch": 22.068133427963094, + "grad_norm": 6.615963935852051, + "learning_rate": 7.794123491838183e-05, + "loss": 0.027560511231422426, + "step": 155470 + }, + { + "epoch": 22.069552874378992, + "grad_norm": 1.3628363609313965, + "learning_rate": 7.793981547196594e-05, + "loss": 0.03230634331703186, + "step": 155480 + }, + { + "epoch": 22.07097232079489, + "grad_norm": 1.6696698665618896, + "learning_rate": 7.793839602555004e-05, + "loss": 0.011623720824718475, + "step": 155490 + }, + { + "epoch": 22.07239176721079, + "grad_norm": 0.768450915813446, + "learning_rate": 7.793697657913415e-05, + "loss": 0.08195879459381103, + "step": 155500 + }, + { + "epoch": 
22.07239176721079, + "eval_accuracy": 0.9865835823742608, + "eval_loss": 0.05170947313308716, + "eval_runtime": 32.4853, + "eval_samples_per_second": 484.127, + "eval_steps_per_second": 15.145, + "step": 155500 + }, + { + "epoch": 22.073811213626687, + "grad_norm": 0.7256195545196533, + "learning_rate": 7.793555713271823e-05, + "loss": 0.002033026143908501, + "step": 155510 + }, + { + "epoch": 22.075230660042582, + "grad_norm": 0.22973446547985077, + "learning_rate": 7.793413768630234e-05, + "loss": 0.02083883583545685, + "step": 155520 + }, + { + "epoch": 22.07665010645848, + "grad_norm": 0.2658345103263855, + "learning_rate": 7.793271823988644e-05, + "loss": 0.003149613365530968, + "step": 155530 + }, + { + "epoch": 22.07806955287438, + "grad_norm": 1.177682876586914, + "learning_rate": 7.793129879347055e-05, + "loss": 0.00700526088476181, + "step": 155540 + }, + { + "epoch": 22.079488999290277, + "grad_norm": 2.328082323074341, + "learning_rate": 7.792987934705466e-05, + "loss": 0.012177440524101257, + "step": 155550 + }, + { + "epoch": 22.080908445706175, + "grad_norm": 0.4193543493747711, + "learning_rate": 7.792845990063875e-05, + "loss": 0.007613614201545715, + "step": 155560 + }, + { + "epoch": 22.082327892122073, + "grad_norm": 0.35491296648979187, + "learning_rate": 7.792704045422286e-05, + "loss": 0.002593757212162018, + "step": 155570 + }, + { + "epoch": 22.08374733853797, + "grad_norm": 0.04715914651751518, + "learning_rate": 7.792562100780696e-05, + "loss": 0.02183646559715271, + "step": 155580 + }, + { + "epoch": 22.085166784953866, + "grad_norm": 0.04513808339834213, + "learning_rate": 7.792420156139107e-05, + "loss": 0.002988416701555252, + "step": 155590 + }, + { + "epoch": 22.086586231369765, + "grad_norm": 8.57950210571289, + "learning_rate": 7.792278211497516e-05, + "loss": 0.029389482736587525, + "step": 155600 + }, + { + "epoch": 22.088005677785663, + "grad_norm": 0.2615174353122711, + "learning_rate": 7.792136266855926e-05, + "loss": 
0.007799801230430603, + "step": 155610 + }, + { + "epoch": 22.08942512420156, + "grad_norm": 0.010170727968215942, + "learning_rate": 7.791994322214336e-05, + "loss": 0.018558862805366515, + "step": 155620 + }, + { + "epoch": 22.09084457061746, + "grad_norm": 19.932592391967773, + "learning_rate": 7.791852377572747e-05, + "loss": 0.02398626655340195, + "step": 155630 + }, + { + "epoch": 22.092264017033358, + "grad_norm": 0.012701639905571938, + "learning_rate": 7.791710432931158e-05, + "loss": 0.008687663078308105, + "step": 155640 + }, + { + "epoch": 22.093683463449256, + "grad_norm": 0.399959921836853, + "learning_rate": 7.791568488289568e-05, + "loss": 0.013000153005123138, + "step": 155650 + }, + { + "epoch": 22.09510290986515, + "grad_norm": 7.813028812408447, + "learning_rate": 7.791426543647978e-05, + "loss": 0.0059307973831892015, + "step": 155660 + }, + { + "epoch": 22.09652235628105, + "grad_norm": 0.09939685463905334, + "learning_rate": 7.791284599006387e-05, + "loss": 0.016990141570568086, + "step": 155670 + }, + { + "epoch": 22.097941802696948, + "grad_norm": 1.6872514486312866, + "learning_rate": 7.791142654364798e-05, + "loss": 0.006230312958359718, + "step": 155680 + }, + { + "epoch": 22.099361249112846, + "grad_norm": 0.01356822531670332, + "learning_rate": 7.791000709723208e-05, + "loss": 0.0019114609807729722, + "step": 155690 + }, + { + "epoch": 22.100780695528744, + "grad_norm": 10.108484268188477, + "learning_rate": 7.790858765081619e-05, + "loss": 0.034482714533805844, + "step": 155700 + }, + { + "epoch": 22.102200141944643, + "grad_norm": 0.4600825905799866, + "learning_rate": 7.790716820440028e-05, + "loss": 0.012743535637855529, + "step": 155710 + }, + { + "epoch": 22.10361958836054, + "grad_norm": 10.901869773864746, + "learning_rate": 7.790574875798439e-05, + "loss": 0.06481307744979858, + "step": 155720 + }, + { + "epoch": 22.105039034776436, + "grad_norm": 9.981513977050781, + "learning_rate": 7.79043293115685e-05, + "loss": 
0.030334275960922242, + "step": 155730 + }, + { + "epoch": 22.106458481192334, + "grad_norm": 0.007742475252598524, + "learning_rate": 7.79029098651526e-05, + "loss": 0.003006871044635773, + "step": 155740 + }, + { + "epoch": 22.107877927608232, + "grad_norm": 0.05650017783045769, + "learning_rate": 7.79014904187367e-05, + "loss": 0.002586229890584946, + "step": 155750 + }, + { + "epoch": 22.10929737402413, + "grad_norm": 0.3237476348876953, + "learning_rate": 7.790007097232079e-05, + "loss": 0.014832794666290283, + "step": 155760 + }, + { + "epoch": 22.11071682044003, + "grad_norm": 0.2647223174571991, + "learning_rate": 7.78986515259049e-05, + "loss": 0.005772634968161583, + "step": 155770 + }, + { + "epoch": 22.112136266855927, + "grad_norm": 0.04670483246445656, + "learning_rate": 7.7897232079489e-05, + "loss": 0.004701429232954979, + "step": 155780 + }, + { + "epoch": 22.113555713271825, + "grad_norm": 0.0132818091660738, + "learning_rate": 7.789581263307311e-05, + "loss": 0.006494057178497314, + "step": 155790 + }, + { + "epoch": 22.11497515968772, + "grad_norm": 0.6307598948478699, + "learning_rate": 7.78943931866572e-05, + "loss": 0.0071422293782234195, + "step": 155800 + }, + { + "epoch": 22.11639460610362, + "grad_norm": 0.6477800011634827, + "learning_rate": 7.789297374024132e-05, + "loss": 0.0558569610118866, + "step": 155810 + }, + { + "epoch": 22.117814052519517, + "grad_norm": 0.05954930931329727, + "learning_rate": 7.789155429382541e-05, + "loss": 0.0054099738597869875, + "step": 155820 + }, + { + "epoch": 22.119233498935415, + "grad_norm": 0.0803491473197937, + "learning_rate": 7.789013484740951e-05, + "loss": 0.0041809286922216415, + "step": 155830 + }, + { + "epoch": 22.120652945351313, + "grad_norm": 0.008594626560807228, + "learning_rate": 7.788871540099362e-05, + "loss": 0.011200736463069915, + "step": 155840 + }, + { + "epoch": 22.12207239176721, + "grad_norm": 9.658167839050293, + "learning_rate": 7.788729595457772e-05, + "loss": 
0.034884396195411685, + "step": 155850 + }, + { + "epoch": 22.12349183818311, + "grad_norm": 0.0076732453890144825, + "learning_rate": 7.788587650816183e-05, + "loss": 0.02350471615791321, + "step": 155860 + }, + { + "epoch": 22.124911284599005, + "grad_norm": 12.060302734375, + "learning_rate": 7.788445706174592e-05, + "loss": 0.017297054827213287, + "step": 155870 + }, + { + "epoch": 22.126330731014903, + "grad_norm": 0.3089055120944977, + "learning_rate": 7.788303761533003e-05, + "loss": 0.016819316148757934, + "step": 155880 + }, + { + "epoch": 22.1277501774308, + "grad_norm": 0.0244551170617342, + "learning_rate": 7.788161816891412e-05, + "loss": 0.026865240931510926, + "step": 155890 + }, + { + "epoch": 22.1291696238467, + "grad_norm": 0.034610018134117126, + "learning_rate": 7.788019872249823e-05, + "loss": 0.009499045461416245, + "step": 155900 + }, + { + "epoch": 22.130589070262598, + "grad_norm": 0.7301345467567444, + "learning_rate": 7.787877927608233e-05, + "loss": 0.0036213181912899016, + "step": 155910 + }, + { + "epoch": 22.132008516678496, + "grad_norm": 0.12266136705875397, + "learning_rate": 7.787735982966643e-05, + "loss": 0.0022346299141645432, + "step": 155920 + }, + { + "epoch": 22.133427963094395, + "grad_norm": 0.01565738581120968, + "learning_rate": 7.787608232789213e-05, + "loss": 0.016079407930374146, + "step": 155930 + }, + { + "epoch": 22.13484740951029, + "grad_norm": 0.14432527124881744, + "learning_rate": 7.787466288147623e-05, + "loss": 0.040336829423904416, + "step": 155940 + }, + { + "epoch": 22.136266855926188, + "grad_norm": 0.11140721291303635, + "learning_rate": 7.787324343506032e-05, + "loss": 0.014967834949493409, + "step": 155950 + }, + { + "epoch": 22.137686302342086, + "grad_norm": 0.1769515872001648, + "learning_rate": 7.787182398864443e-05, + "loss": 0.0019591905176639555, + "step": 155960 + }, + { + "epoch": 22.139105748757984, + "grad_norm": 3.8908329010009766, + "learning_rate": 7.787040454222853e-05, + "loss": 
0.010915882140398025, + "step": 155970 + }, + { + "epoch": 22.140525195173883, + "grad_norm": 0.024419957771897316, + "learning_rate": 7.786898509581264e-05, + "loss": 0.02162826955318451, + "step": 155980 + }, + { + "epoch": 22.14194464158978, + "grad_norm": 18.346527099609375, + "learning_rate": 7.786756564939674e-05, + "loss": 0.016015176475048066, + "step": 155990 + }, + { + "epoch": 22.14336408800568, + "grad_norm": 0.615151047706604, + "learning_rate": 7.786614620298084e-05, + "loss": 0.004118350148200989, + "step": 156000 + }, + { + "epoch": 22.14336408800568, + "eval_accuracy": 0.988046035480384, + "eval_loss": 0.043466608971357346, + "eval_runtime": 31.2886, + "eval_samples_per_second": 502.644, + "eval_steps_per_second": 15.725, + "step": 156000 + }, + { + "epoch": 22.144783534421574, + "grad_norm": 0.07165104150772095, + "learning_rate": 7.786472675656495e-05, + "loss": 0.0008565597236156464, + "step": 156010 + }, + { + "epoch": 22.146202980837472, + "grad_norm": 2.2204015254974365, + "learning_rate": 7.786330731014904e-05, + "loss": 0.02306968718767166, + "step": 156020 + }, + { + "epoch": 22.14762242725337, + "grad_norm": 0.03883130103349686, + "learning_rate": 7.786188786373316e-05, + "loss": 0.024065551161766053, + "step": 156030 + }, + { + "epoch": 22.14904187366927, + "grad_norm": 0.06128864735364914, + "learning_rate": 7.786046841731724e-05, + "loss": 0.01836361885070801, + "step": 156040 + }, + { + "epoch": 22.150461320085167, + "grad_norm": 0.8209577202796936, + "learning_rate": 7.785904897090135e-05, + "loss": 0.03084845244884491, + "step": 156050 + }, + { + "epoch": 22.151880766501066, + "grad_norm": 1.1226838827133179, + "learning_rate": 7.785762952448545e-05, + "loss": 0.027660369873046875, + "step": 156060 + }, + { + "epoch": 22.153300212916964, + "grad_norm": 7.093630313873291, + "learning_rate": 7.785621007806956e-05, + "loss": 0.04921625852584839, + "step": 156070 + }, + { + "epoch": 22.15471965933286, + "grad_norm": 3.578007221221924, + 
"learning_rate": 7.785479063165366e-05, + "loss": 0.05138797163963318, + "step": 156080 + }, + { + "epoch": 22.156139105748757, + "grad_norm": 0.05414951220154762, + "learning_rate": 7.785337118523775e-05, + "loss": 0.015434172749519349, + "step": 156090 + }, + { + "epoch": 22.157558552164655, + "grad_norm": 6.173698902130127, + "learning_rate": 7.785195173882186e-05, + "loss": 0.037377592921257016, + "step": 156100 + }, + { + "epoch": 22.158977998580554, + "grad_norm": 0.24203209578990936, + "learning_rate": 7.785053229240596e-05, + "loss": 0.01127246767282486, + "step": 156110 + }, + { + "epoch": 22.160397444996452, + "grad_norm": 0.042526714503765106, + "learning_rate": 7.784911284599007e-05, + "loss": 0.028720852732658387, + "step": 156120 + }, + { + "epoch": 22.16181689141235, + "grad_norm": 0.2957881689071655, + "learning_rate": 7.784769339957417e-05, + "loss": 0.03476574420928955, + "step": 156130 + }, + { + "epoch": 22.16323633782825, + "grad_norm": 0.34151363372802734, + "learning_rate": 7.784627395315828e-05, + "loss": 0.02009095549583435, + "step": 156140 + }, + { + "epoch": 22.164655784244143, + "grad_norm": 0.34117230772972107, + "learning_rate": 7.784485450674236e-05, + "loss": 0.005833473056554794, + "step": 156150 + }, + { + "epoch": 22.16607523066004, + "grad_norm": 6.286214351654053, + "learning_rate": 7.784343506032648e-05, + "loss": 0.06364951133728028, + "step": 156160 + }, + { + "epoch": 22.16749467707594, + "grad_norm": 0.3828961253166199, + "learning_rate": 7.784201561391057e-05, + "loss": 0.007324223220348358, + "step": 156170 + }, + { + "epoch": 22.168914123491838, + "grad_norm": 0.07064826786518097, + "learning_rate": 7.784059616749468e-05, + "loss": 0.02856385111808777, + "step": 156180 + }, + { + "epoch": 22.170333569907736, + "grad_norm": 0.00878112856298685, + "learning_rate": 7.783917672107878e-05, + "loss": 0.0006358951330184936, + "step": 156190 + }, + { + "epoch": 22.171753016323635, + "grad_norm": 0.23981712758541107, + 
"learning_rate": 7.783775727466288e-05, + "loss": 0.0037559378892183305, + "step": 156200 + }, + { + "epoch": 22.173172462739533, + "grad_norm": 0.040314484387636185, + "learning_rate": 7.783633782824699e-05, + "loss": 0.02502982020378113, + "step": 156210 + }, + { + "epoch": 22.174591909155428, + "grad_norm": 0.09642792493104935, + "learning_rate": 7.783491838183109e-05, + "loss": 0.028772905468940735, + "step": 156220 + }, + { + "epoch": 22.176011355571326, + "grad_norm": 0.07010980695486069, + "learning_rate": 7.78334989354152e-05, + "loss": 0.01995015740394592, + "step": 156230 + }, + { + "epoch": 22.177430801987224, + "grad_norm": 1.133081316947937, + "learning_rate": 7.78320794889993e-05, + "loss": 0.005280789360404015, + "step": 156240 + }, + { + "epoch": 22.178850248403123, + "grad_norm": 0.9036301970481873, + "learning_rate": 7.783066004258339e-05, + "loss": 0.003670858219265938, + "step": 156250 + }, + { + "epoch": 22.18026969481902, + "grad_norm": 0.7835737466812134, + "learning_rate": 7.782924059616749e-05, + "loss": 0.007000745832920074, + "step": 156260 + }, + { + "epoch": 22.18168914123492, + "grad_norm": 0.018337877467274666, + "learning_rate": 7.78278211497516e-05, + "loss": 0.013435213267803192, + "step": 156270 + }, + { + "epoch": 22.183108587650818, + "grad_norm": 6.791632175445557, + "learning_rate": 7.78264017033357e-05, + "loss": 0.00841963365674019, + "step": 156280 + }, + { + "epoch": 22.184528034066712, + "grad_norm": 0.7795776128768921, + "learning_rate": 7.782498225691981e-05, + "loss": 0.027487963438034058, + "step": 156290 + }, + { + "epoch": 22.18594748048261, + "grad_norm": 0.30205202102661133, + "learning_rate": 7.782356281050391e-05, + "loss": 0.025008493661880495, + "step": 156300 + }, + { + "epoch": 22.18736692689851, + "grad_norm": 0.6490525007247925, + "learning_rate": 7.7822143364088e-05, + "loss": 0.004195484891533851, + "step": 156310 + }, + { + "epoch": 22.188786373314407, + "grad_norm": 0.056629300117492676, + 
"learning_rate": 7.782072391767212e-05, + "loss": 0.001991637423634529, + "step": 156320 + }, + { + "epoch": 22.190205819730306, + "grad_norm": 3.0197110176086426, + "learning_rate": 7.781930447125621e-05, + "loss": 0.014416144788265228, + "step": 156330 + }, + { + "epoch": 22.191625266146204, + "grad_norm": 0.0038614629302173853, + "learning_rate": 7.781788502484032e-05, + "loss": 0.004608276858925819, + "step": 156340 + }, + { + "epoch": 22.193044712562102, + "grad_norm": 3.7356653213500977, + "learning_rate": 7.781646557842441e-05, + "loss": 0.021060559153556823, + "step": 156350 + }, + { + "epoch": 22.194464158977997, + "grad_norm": 0.13739293813705444, + "learning_rate": 7.781504613200852e-05, + "loss": 0.02142437398433685, + "step": 156360 + }, + { + "epoch": 22.195883605393895, + "grad_norm": 7.362116813659668, + "learning_rate": 7.781362668559262e-05, + "loss": 0.030367448925971985, + "step": 156370 + }, + { + "epoch": 22.197303051809794, + "grad_norm": 1.7559149265289307, + "learning_rate": 7.781220723917673e-05, + "loss": 0.012588085234165191, + "step": 156380 + }, + { + "epoch": 22.198722498225692, + "grad_norm": 0.055812519043684006, + "learning_rate": 7.781078779276084e-05, + "loss": 0.011686773598194122, + "step": 156390 + }, + { + "epoch": 22.20014194464159, + "grad_norm": 6.098696708679199, + "learning_rate": 7.780936834634492e-05, + "loss": 0.028709763288497926, + "step": 156400 + }, + { + "epoch": 22.20156139105749, + "grad_norm": 8.061178207397461, + "learning_rate": 7.780794889992903e-05, + "loss": 0.06026015281677246, + "step": 156410 + }, + { + "epoch": 22.202980837473387, + "grad_norm": 0.26346340775489807, + "learning_rate": 7.780652945351313e-05, + "loss": 0.03732381761074066, + "step": 156420 + }, + { + "epoch": 22.20440028388928, + "grad_norm": 2.7442028522491455, + "learning_rate": 7.780511000709724e-05, + "loss": 0.015069298446178436, + "step": 156430 + }, + { + "epoch": 22.20581973030518, + "grad_norm": 0.07424886524677277, + 
"learning_rate": 7.780369056068134e-05, + "loss": 0.022237886488437653, + "step": 156440 + }, + { + "epoch": 22.207239176721078, + "grad_norm": 2.513817071914673, + "learning_rate": 7.780227111426544e-05, + "loss": 0.04265256226062775, + "step": 156450 + }, + { + "epoch": 22.208658623136976, + "grad_norm": 0.5560611486434937, + "learning_rate": 7.780085166784953e-05, + "loss": 0.004275476559996605, + "step": 156460 + }, + { + "epoch": 22.210078069552875, + "grad_norm": 0.09412333369255066, + "learning_rate": 7.779943222143364e-05, + "loss": 0.0209195077419281, + "step": 156470 + }, + { + "epoch": 22.211497515968773, + "grad_norm": 0.28312772512435913, + "learning_rate": 7.779801277501775e-05, + "loss": 0.01196388527750969, + "step": 156480 + }, + { + "epoch": 22.21291696238467, + "grad_norm": 0.15546339750289917, + "learning_rate": 7.779659332860185e-05, + "loss": 0.025868341326713562, + "step": 156490 + }, + { + "epoch": 22.214336408800566, + "grad_norm": 1.3942861557006836, + "learning_rate": 7.779517388218596e-05, + "loss": 0.00564596951007843, + "step": 156500 + }, + { + "epoch": 22.214336408800566, + "eval_accuracy": 0.985884148280028, + "eval_loss": 0.056530602276325226, + "eval_runtime": 32.2675, + "eval_samples_per_second": 487.395, + "eval_steps_per_second": 15.248, + "step": 156500 + }, + { + "epoch": 22.215755855216464, + "grad_norm": 0.04614492878317833, + "learning_rate": 7.779375443577005e-05, + "loss": 0.014297233521938324, + "step": 156510 + }, + { + "epoch": 22.217175301632363, + "grad_norm": 0.02491775155067444, + "learning_rate": 7.779233498935416e-05, + "loss": 0.036369362473487855, + "step": 156520 + }, + { + "epoch": 22.21859474804826, + "grad_norm": 0.02053004316985607, + "learning_rate": 7.779091554293825e-05, + "loss": 0.012032422423362731, + "step": 156530 + }, + { + "epoch": 22.22001419446416, + "grad_norm": 0.031289100646972656, + "learning_rate": 7.778949609652237e-05, + "loss": 0.021046262979507447, + "step": 156540 + }, + { + "epoch": 
22.221433640880058, + "grad_norm": 0.0913107842206955, + "learning_rate": 7.778807665010646e-05, + "loss": 0.005926747992634773, + "step": 156550 + }, + { + "epoch": 22.222853087295956, + "grad_norm": 0.11398302018642426, + "learning_rate": 7.778665720369056e-05, + "loss": 0.003598828986287117, + "step": 156560 + }, + { + "epoch": 22.22427253371185, + "grad_norm": 7.798530578613281, + "learning_rate": 7.778523775727467e-05, + "loss": 0.017229855060577393, + "step": 156570 + }, + { + "epoch": 22.22569198012775, + "grad_norm": 0.46363547444343567, + "learning_rate": 7.778381831085877e-05, + "loss": 0.016476720571517944, + "step": 156580 + }, + { + "epoch": 22.227111426543647, + "grad_norm": 0.21871525049209595, + "learning_rate": 7.778239886444288e-05, + "loss": 0.008306494355201722, + "step": 156590 + }, + { + "epoch": 22.228530872959546, + "grad_norm": 0.1471615731716156, + "learning_rate": 7.778097941802698e-05, + "loss": 0.015444383025169373, + "step": 156600 + }, + { + "epoch": 22.229950319375444, + "grad_norm": 3.514227867126465, + "learning_rate": 7.777955997161107e-05, + "loss": 0.017786872386932374, + "step": 156610 + }, + { + "epoch": 22.231369765791342, + "grad_norm": 0.8832190632820129, + "learning_rate": 7.777814052519517e-05, + "loss": 0.002369149774312973, + "step": 156620 + }, + { + "epoch": 22.23278921220724, + "grad_norm": 4.089088439941406, + "learning_rate": 7.777672107877928e-05, + "loss": 0.040671488642692565, + "step": 156630 + }, + { + "epoch": 22.234208658623135, + "grad_norm": 0.008016172796487808, + "learning_rate": 7.777530163236338e-05, + "loss": 0.002869865298271179, + "step": 156640 + }, + { + "epoch": 22.235628105039034, + "grad_norm": 2.2210845947265625, + "learning_rate": 7.777388218594749e-05, + "loss": 0.010424254834651947, + "step": 156650 + }, + { + "epoch": 22.237047551454932, + "grad_norm": 9.416260719299316, + "learning_rate": 7.777246273953159e-05, + "loss": 0.006325635313987732, + "step": 156660 + }, + { + "epoch": 
22.23846699787083, + "grad_norm": 0.022244581952691078, + "learning_rate": 7.777104329311569e-05, + "loss": 0.028625178337097167, + "step": 156670 + }, + { + "epoch": 22.23988644428673, + "grad_norm": 6.181304931640625, + "learning_rate": 7.77696238466998e-05, + "loss": 0.048624515533447266, + "step": 156680 + }, + { + "epoch": 22.241305890702627, + "grad_norm": 4.98421049118042, + "learning_rate": 7.77682044002839e-05, + "loss": 0.026758480072021484, + "step": 156690 + }, + { + "epoch": 22.242725337118525, + "grad_norm": 2.7381484508514404, + "learning_rate": 7.7766784953868e-05, + "loss": 0.010892677307128906, + "step": 156700 + }, + { + "epoch": 22.24414478353442, + "grad_norm": 0.22052429616451263, + "learning_rate": 7.776536550745209e-05, + "loss": 0.018509407341480256, + "step": 156710 + }, + { + "epoch": 22.24556422995032, + "grad_norm": 4.599610805511475, + "learning_rate": 7.77639460610362e-05, + "loss": 0.05486184358596802, + "step": 156720 + }, + { + "epoch": 22.246983676366217, + "grad_norm": 1.6750128269195557, + "learning_rate": 7.77625266146203e-05, + "loss": 0.010962040722370147, + "step": 156730 + }, + { + "epoch": 22.248403122782115, + "grad_norm": 13.25168514251709, + "learning_rate": 7.776110716820441e-05, + "loss": 0.04979659616947174, + "step": 156740 + }, + { + "epoch": 22.249822569198013, + "grad_norm": 6.586043357849121, + "learning_rate": 7.77596877217885e-05, + "loss": 0.025202998518943788, + "step": 156750 + }, + { + "epoch": 22.25124201561391, + "grad_norm": 1.8250092267990112, + "learning_rate": 7.77582682753726e-05, + "loss": 0.0110416442155838, + "step": 156760 + }, + { + "epoch": 22.25266146202981, + "grad_norm": 1.2811121940612793, + "learning_rate": 7.775684882895671e-05, + "loss": 0.005936400219798088, + "step": 156770 + }, + { + "epoch": 22.254080908445705, + "grad_norm": 0.5547705888748169, + "learning_rate": 7.775542938254081e-05, + "loss": 0.02519708573818207, + "step": 156780 + }, + { + "epoch": 22.255500354861603, + 
"grad_norm": 0.986126720905304, + "learning_rate": 7.775400993612492e-05, + "loss": 0.011631923913955688, + "step": 156790 + }, + { + "epoch": 22.2569198012775, + "grad_norm": 0.8930934071540833, + "learning_rate": 7.775259048970902e-05, + "loss": 0.016468805074691773, + "step": 156800 + }, + { + "epoch": 22.2583392476934, + "grad_norm": 16.083873748779297, + "learning_rate": 7.775117104329312e-05, + "loss": 0.03857468366622925, + "step": 156810 + }, + { + "epoch": 22.259758694109298, + "grad_norm": 0.8279505372047424, + "learning_rate": 7.774975159687721e-05, + "loss": 0.004183601215481758, + "step": 156820 + }, + { + "epoch": 22.261178140525196, + "grad_norm": 10.038238525390625, + "learning_rate": 7.774833215046133e-05, + "loss": 0.03234444260597229, + "step": 156830 + }, + { + "epoch": 22.262597586941094, + "grad_norm": 0.49715524911880493, + "learning_rate": 7.774691270404542e-05, + "loss": 0.008418069779872894, + "step": 156840 + }, + { + "epoch": 22.26401703335699, + "grad_norm": 0.032814498990774155, + "learning_rate": 7.774549325762953e-05, + "loss": 0.034330925345420836, + "step": 156850 + }, + { + "epoch": 22.265436479772887, + "grad_norm": 0.41001394391059875, + "learning_rate": 7.774407381121363e-05, + "loss": 0.009199117124080659, + "step": 156860 + }, + { + "epoch": 22.266855926188786, + "grad_norm": 0.03531016409397125, + "learning_rate": 7.774265436479773e-05, + "loss": 0.02186751365661621, + "step": 156870 + }, + { + "epoch": 22.268275372604684, + "grad_norm": 2.2019834518432617, + "learning_rate": 7.774123491838184e-05, + "loss": 0.010834416747093201, + "step": 156880 + }, + { + "epoch": 22.269694819020582, + "grad_norm": 0.13095907866954803, + "learning_rate": 7.773981547196594e-05, + "loss": 0.012883412837982177, + "step": 156890 + }, + { + "epoch": 22.27111426543648, + "grad_norm": 0.22800277173519135, + "learning_rate": 7.773839602555005e-05, + "loss": 0.02260344922542572, + "step": 156900 + }, + { + "epoch": 22.27253371185238, + "grad_norm": 
4.392408847808838, + "learning_rate": 7.773697657913414e-05, + "loss": 0.009460886567831039, + "step": 156910 + }, + { + "epoch": 22.273953158268274, + "grad_norm": 0.1426231414079666, + "learning_rate": 7.773555713271824e-05, + "loss": 0.0036942556500434874, + "step": 156920 + }, + { + "epoch": 22.275372604684172, + "grad_norm": 0.10234623402357101, + "learning_rate": 7.773413768630234e-05, + "loss": 0.008067598938941956, + "step": 156930 + }, + { + "epoch": 22.27679205110007, + "grad_norm": 14.729464530944824, + "learning_rate": 7.773271823988645e-05, + "loss": 0.03420968949794769, + "step": 156940 + }, + { + "epoch": 22.27821149751597, + "grad_norm": 0.6127524375915527, + "learning_rate": 7.773129879347055e-05, + "loss": 0.018050459027290345, + "step": 156950 + }, + { + "epoch": 22.279630943931867, + "grad_norm": 0.00412955554202199, + "learning_rate": 7.772987934705466e-05, + "loss": 0.002604038268327713, + "step": 156960 + }, + { + "epoch": 22.281050390347765, + "grad_norm": 0.31192293763160706, + "learning_rate": 7.772845990063876e-05, + "loss": 0.00800662711262703, + "step": 156970 + }, + { + "epoch": 22.282469836763664, + "grad_norm": 0.3433336019515991, + "learning_rate": 7.772704045422285e-05, + "loss": 0.04620177149772644, + "step": 156980 + }, + { + "epoch": 22.28388928317956, + "grad_norm": 7.759442329406738, + "learning_rate": 7.772562100780696e-05, + "loss": 0.03429055511951447, + "step": 156990 + }, + { + "epoch": 22.285308729595457, + "grad_norm": 0.02789120562374592, + "learning_rate": 7.772420156139106e-05, + "loss": 0.02714965045452118, + "step": 157000 + }, + { + "epoch": 22.285308729595457, + "eval_accuracy": 0.9882367902333566, + "eval_loss": 0.044594656676054, + "eval_runtime": 32.4875, + "eval_samples_per_second": 484.094, + "eval_steps_per_second": 15.144, + "step": 157000 + }, + { + "epoch": 22.286728176011355, + "grad_norm": 2.138810634613037, + "learning_rate": 7.772278211497517e-05, + "loss": 0.0026128679513931275, + "step": 157010 + 
}, + { + "epoch": 22.288147622427253, + "grad_norm": 0.3142348527908325, + "learning_rate": 7.772136266855926e-05, + "loss": 0.0064741730690002445, + "step": 157020 + }, + { + "epoch": 22.28956706884315, + "grad_norm": 0.1558825671672821, + "learning_rate": 7.771994322214337e-05, + "loss": 0.0057801220566034315, + "step": 157030 + }, + { + "epoch": 22.29098651525905, + "grad_norm": 12.042495727539062, + "learning_rate": 7.771852377572746e-05, + "loss": 0.0112741120159626, + "step": 157040 + }, + { + "epoch": 22.292405961674948, + "grad_norm": 7.938665866851807, + "learning_rate": 7.771710432931158e-05, + "loss": 0.03352750539779663, + "step": 157050 + }, + { + "epoch": 22.293825408090843, + "grad_norm": 0.1827085316181183, + "learning_rate": 7.771568488289567e-05, + "loss": 0.012945596873760224, + "step": 157060 + }, + { + "epoch": 22.29524485450674, + "grad_norm": 1.6993259191513062, + "learning_rate": 7.771426543647977e-05, + "loss": 0.002298201620578766, + "step": 157070 + }, + { + "epoch": 22.29666430092264, + "grad_norm": 0.020412249490618706, + "learning_rate": 7.771284599006388e-05, + "loss": 0.009876996278762817, + "step": 157080 + }, + { + "epoch": 22.298083747338538, + "grad_norm": 2.487182855606079, + "learning_rate": 7.771142654364798e-05, + "loss": 0.012551988661289214, + "step": 157090 + }, + { + "epoch": 22.299503193754436, + "grad_norm": 0.1205645427107811, + "learning_rate": 7.771000709723209e-05, + "loss": 0.018575915694236757, + "step": 157100 + }, + { + "epoch": 22.300922640170334, + "grad_norm": 7.773462295532227, + "learning_rate": 7.770858765081619e-05, + "loss": 0.007395397126674652, + "step": 157110 + }, + { + "epoch": 22.302342086586233, + "grad_norm": 6.635310173034668, + "learning_rate": 7.770716820440028e-05, + "loss": 0.007033777236938476, + "step": 157120 + }, + { + "epoch": 22.303761533002127, + "grad_norm": 11.825334548950195, + "learning_rate": 7.770574875798438e-05, + "loss": 0.0424881100654602, + "step": 157130 + }, + { + 
"epoch": 22.305180979418026, + "grad_norm": 0.5748307704925537, + "learning_rate": 7.770432931156849e-05, + "loss": 0.02862425446510315, + "step": 157140 + }, + { + "epoch": 22.306600425833924, + "grad_norm": 27.19774055480957, + "learning_rate": 7.770290986515259e-05, + "loss": 0.0342379629611969, + "step": 157150 + }, + { + "epoch": 22.308019872249822, + "grad_norm": 11.08139419555664, + "learning_rate": 7.77014904187367e-05, + "loss": 0.03405921757221222, + "step": 157160 + }, + { + "epoch": 22.30943931866572, + "grad_norm": 0.1628044843673706, + "learning_rate": 7.77000709723208e-05, + "loss": 0.020260962843894958, + "step": 157170 + }, + { + "epoch": 22.31085876508162, + "grad_norm": 7.082169532775879, + "learning_rate": 7.76986515259049e-05, + "loss": 0.010690602660179137, + "step": 157180 + }, + { + "epoch": 22.312278211497517, + "grad_norm": 0.5308610200881958, + "learning_rate": 7.769723207948901e-05, + "loss": 0.033976799249649046, + "step": 157190 + }, + { + "epoch": 22.313697657913412, + "grad_norm": 8.86551570892334, + "learning_rate": 7.76958126330731e-05, + "loss": 0.02252533882856369, + "step": 157200 + }, + { + "epoch": 22.31511710432931, + "grad_norm": 0.37537357211112976, + "learning_rate": 7.769439318665722e-05, + "loss": 0.01802233010530472, + "step": 157210 + }, + { + "epoch": 22.31653655074521, + "grad_norm": 1.2083526849746704, + "learning_rate": 7.769297374024131e-05, + "loss": 0.02804979383945465, + "step": 157220 + }, + { + "epoch": 22.317955997161107, + "grad_norm": 0.6579700112342834, + "learning_rate": 7.769155429382541e-05, + "loss": 0.008526626229286193, + "step": 157230 + }, + { + "epoch": 22.319375443577005, + "grad_norm": 0.02894330583512783, + "learning_rate": 7.769013484740951e-05, + "loss": 0.0165112167596817, + "step": 157240 + }, + { + "epoch": 22.320794889992904, + "grad_norm": 0.25422602891921997, + "learning_rate": 7.768871540099362e-05, + "loss": 0.003976498916745186, + "step": 157250 + }, + { + "epoch": 
22.322214336408802, + "grad_norm": 0.02831200137734413, + "learning_rate": 7.768729595457772e-05, + "loss": 0.0028068572282791138, + "step": 157260 + }, + { + "epoch": 22.323633782824697, + "grad_norm": 0.289396733045578, + "learning_rate": 7.768587650816183e-05, + "loss": 0.005779454112052917, + "step": 157270 + }, + { + "epoch": 22.325053229240595, + "grad_norm": 0.30526086688041687, + "learning_rate": 7.768445706174592e-05, + "loss": 0.0024255804717540743, + "step": 157280 + }, + { + "epoch": 22.326472675656493, + "grad_norm": 0.6464574933052063, + "learning_rate": 7.768303761533002e-05, + "loss": 0.0026240404695272446, + "step": 157290 + }, + { + "epoch": 22.32789212207239, + "grad_norm": 3.7027275562286377, + "learning_rate": 7.768161816891413e-05, + "loss": 0.02328314483165741, + "step": 157300 + }, + { + "epoch": 22.32931156848829, + "grad_norm": 8.35509967803955, + "learning_rate": 7.768019872249823e-05, + "loss": 0.004795863851904869, + "step": 157310 + }, + { + "epoch": 22.330731014904188, + "grad_norm": 0.04652887582778931, + "learning_rate": 7.767877927608234e-05, + "loss": 0.011951911449432372, + "step": 157320 + }, + { + "epoch": 22.332150461320087, + "grad_norm": 0.06563115119934082, + "learning_rate": 7.767735982966642e-05, + "loss": 0.032347336411476135, + "step": 157330 + }, + { + "epoch": 22.33356990773598, + "grad_norm": 0.3860478699207306, + "learning_rate": 7.767594038325054e-05, + "loss": 0.017612168192863466, + "step": 157340 + }, + { + "epoch": 22.33498935415188, + "grad_norm": 0.36468297243118286, + "learning_rate": 7.767452093683463e-05, + "loss": 0.007859216630458831, + "step": 157350 + }, + { + "epoch": 22.336408800567778, + "grad_norm": 1.994027018547058, + "learning_rate": 7.767310149041874e-05, + "loss": 0.010813948512077332, + "step": 157360 + }, + { + "epoch": 22.337828246983676, + "grad_norm": 0.03414653241634369, + "learning_rate": 7.767168204400284e-05, + "loss": 0.03487716019153595, + "step": 157370 + }, + { + "epoch": 
22.339247693399575, + "grad_norm": 2.9039816856384277, + "learning_rate": 7.767026259758694e-05, + "loss": 0.005640045925974846, + "step": 157380 + }, + { + "epoch": 22.340667139815473, + "grad_norm": 0.10631964355707169, + "learning_rate": 7.766884315117105e-05, + "loss": 0.02011025995016098, + "step": 157390 + }, + { + "epoch": 22.34208658623137, + "grad_norm": 0.14689916372299194, + "learning_rate": 7.766742370475515e-05, + "loss": 0.010036943107843399, + "step": 157400 + }, + { + "epoch": 22.343506032647266, + "grad_norm": 0.014813115820288658, + "learning_rate": 7.766600425833926e-05, + "loss": 0.003897089883685112, + "step": 157410 + }, + { + "epoch": 22.344925479063164, + "grad_norm": 0.11093901842832565, + "learning_rate": 7.766458481192335e-05, + "loss": 0.0010665234178304673, + "step": 157420 + }, + { + "epoch": 22.346344925479062, + "grad_norm": 0.09238677471876144, + "learning_rate": 7.766316536550745e-05, + "loss": 0.007135692238807678, + "step": 157430 + }, + { + "epoch": 22.34776437189496, + "grad_norm": 0.027461502701044083, + "learning_rate": 7.766174591909155e-05, + "loss": 0.00431610681116581, + "step": 157440 + }, + { + "epoch": 22.34918381831086, + "grad_norm": 0.09450242668390274, + "learning_rate": 7.766032647267566e-05, + "loss": 0.017845050990581514, + "step": 157450 + }, + { + "epoch": 22.350603264726757, + "grad_norm": 0.15231925249099731, + "learning_rate": 7.765890702625976e-05, + "loss": 0.012370072305202484, + "step": 157460 + }, + { + "epoch": 22.352022711142656, + "grad_norm": 14.631620407104492, + "learning_rate": 7.765748757984387e-05, + "loss": 0.01407625675201416, + "step": 157470 + }, + { + "epoch": 22.35344215755855, + "grad_norm": 1.3520756959915161, + "learning_rate": 7.765606813342797e-05, + "loss": 0.011169973015785217, + "step": 157480 + }, + { + "epoch": 22.35486160397445, + "grad_norm": 0.09555142372846603, + "learning_rate": 7.765464868701206e-05, + "loss": 0.004119285568594933, + "step": 157490 + }, + { + "epoch": 
22.356281050390347, + "grad_norm": 0.22862669825553894, + "learning_rate": 7.765322924059617e-05, + "loss": 0.0025756161659955977, + "step": 157500 + }, + { + "epoch": 22.356281050390347, + "eval_accuracy": 0.9908437718573154, + "eval_loss": 0.03223145753145218, + "eval_runtime": 31.1412, + "eval_samples_per_second": 505.022, + "eval_steps_per_second": 15.799, + "step": 157500 + }, + { + "epoch": 22.357700496806245, + "grad_norm": 0.05830603837966919, + "learning_rate": 7.765180979418027e-05, + "loss": 0.02554323673248291, + "step": 157510 + }, + { + "epoch": 22.359119943222144, + "grad_norm": 0.013545520603656769, + "learning_rate": 7.765039034776438e-05, + "loss": 0.02105751186609268, + "step": 157520 + }, + { + "epoch": 22.360539389638042, + "grad_norm": 2.4102694988250732, + "learning_rate": 7.764897090134847e-05, + "loss": 0.003028986230492592, + "step": 157530 + }, + { + "epoch": 22.36195883605394, + "grad_norm": 5.8498406410217285, + "learning_rate": 7.764755145493258e-05, + "loss": 0.014929765462875366, + "step": 157540 + }, + { + "epoch": 22.363378282469835, + "grad_norm": 2.054464340209961, + "learning_rate": 7.764613200851668e-05, + "loss": 0.0209028497338295, + "step": 157550 + }, + { + "epoch": 22.364797728885733, + "grad_norm": 0.18177175521850586, + "learning_rate": 7.764471256210079e-05, + "loss": 0.04518730342388153, + "step": 157560 + }, + { + "epoch": 22.36621717530163, + "grad_norm": 1.4414011240005493, + "learning_rate": 7.764329311568488e-05, + "loss": 0.007240471243858337, + "step": 157570 + }, + { + "epoch": 22.36763662171753, + "grad_norm": 7.4974236488342285, + "learning_rate": 7.7641873669269e-05, + "loss": 0.013431793451309204, + "step": 157580 + }, + { + "epoch": 22.36905606813343, + "grad_norm": 3.038242816925049, + "learning_rate": 7.764045422285309e-05, + "loss": 0.032947197556495667, + "step": 157590 + }, + { + "epoch": 22.370475514549327, + "grad_norm": 0.31183478236198425, + "learning_rate": 7.763903477643719e-05, + "loss": 
0.08205429911613464, + "step": 157600 + }, + { + "epoch": 22.371894960965225, + "grad_norm": 0.11748132854700089, + "learning_rate": 7.76376153300213e-05, + "loss": 0.03575382828712463, + "step": 157610 + }, + { + "epoch": 22.37331440738112, + "grad_norm": 0.017881209030747414, + "learning_rate": 7.76361958836054e-05, + "loss": 0.01500861942768097, + "step": 157620 + }, + { + "epoch": 22.374733853797018, + "grad_norm": 0.1347864717245102, + "learning_rate": 7.763477643718951e-05, + "loss": 0.02632550597190857, + "step": 157630 + }, + { + "epoch": 22.376153300212916, + "grad_norm": 19.97641372680664, + "learning_rate": 7.763335699077359e-05, + "loss": 0.01945675313472748, + "step": 157640 + }, + { + "epoch": 22.377572746628815, + "grad_norm": 3.486276149749756, + "learning_rate": 7.76319375443577e-05, + "loss": 0.007763060182332993, + "step": 157650 + }, + { + "epoch": 22.378992193044713, + "grad_norm": 0.051752690225839615, + "learning_rate": 7.76305180979418e-05, + "loss": 0.002775098755955696, + "step": 157660 + }, + { + "epoch": 22.38041163946061, + "grad_norm": 0.06058258190751076, + "learning_rate": 7.762909865152591e-05, + "loss": 0.01329578012228012, + "step": 157670 + }, + { + "epoch": 22.38183108587651, + "grad_norm": 8.484663009643555, + "learning_rate": 7.762767920511001e-05, + "loss": 0.0130478635430336, + "step": 157680 + }, + { + "epoch": 22.383250532292404, + "grad_norm": 0.4817946255207062, + "learning_rate": 7.76262597586941e-05, + "loss": 0.01366025060415268, + "step": 157690 + }, + { + "epoch": 22.384669978708303, + "grad_norm": 16.87750816345215, + "learning_rate": 7.762484031227822e-05, + "loss": 0.017043343186378478, + "step": 157700 + }, + { + "epoch": 22.3860894251242, + "grad_norm": 0.7071067690849304, + "learning_rate": 7.762342086586231e-05, + "loss": 0.0035200439393520354, + "step": 157710 + }, + { + "epoch": 22.3875088715401, + "grad_norm": 0.0037088035605847836, + "learning_rate": 7.762200141944643e-05, + "loss": 0.005088154971599579, 
+ "step": 157720 + }, + { + "epoch": 22.388928317955997, + "grad_norm": 8.053375244140625, + "learning_rate": 7.762058197303052e-05, + "loss": 0.004890743643045425, + "step": 157730 + }, + { + "epoch": 22.390347764371896, + "grad_norm": 9.699647903442383, + "learning_rate": 7.761916252661462e-05, + "loss": 0.015349754691123962, + "step": 157740 + }, + { + "epoch": 22.391767210787794, + "grad_norm": 0.9764191508293152, + "learning_rate": 7.761774308019872e-05, + "loss": 0.005586006492376327, + "step": 157750 + }, + { + "epoch": 22.39318665720369, + "grad_norm": 0.13752713799476624, + "learning_rate": 7.761632363378283e-05, + "loss": 0.027968138456344604, + "step": 157760 + }, + { + "epoch": 22.394606103619587, + "grad_norm": 14.051712036132812, + "learning_rate": 7.761490418736693e-05, + "loss": 0.023042930662631987, + "step": 157770 + }, + { + "epoch": 22.396025550035485, + "grad_norm": 3.199176073074341, + "learning_rate": 7.761348474095104e-05, + "loss": 0.039394164085388185, + "step": 157780 + }, + { + "epoch": 22.397444996451384, + "grad_norm": 0.45714691281318665, + "learning_rate": 7.761206529453513e-05, + "loss": 0.009024067223072052, + "step": 157790 + }, + { + "epoch": 22.398864442867282, + "grad_norm": 1.3256912231445312, + "learning_rate": 7.761064584811923e-05, + "loss": 0.029252198338508607, + "step": 157800 + }, + { + "epoch": 22.40028388928318, + "grad_norm": 0.60904461145401, + "learning_rate": 7.760922640170334e-05, + "loss": 0.039113056659698484, + "step": 157810 + }, + { + "epoch": 22.40170333569908, + "grad_norm": 2.4503774642944336, + "learning_rate": 7.760780695528744e-05, + "loss": 0.009926814585924149, + "step": 157820 + }, + { + "epoch": 22.403122782114973, + "grad_norm": 0.652350902557373, + "learning_rate": 7.760638750887155e-05, + "loss": 0.014002639055252075, + "step": 157830 + }, + { + "epoch": 22.40454222853087, + "grad_norm": 0.039406511932611465, + "learning_rate": 7.760496806245563e-05, + "loss": 0.011849080771207809, + "step": 
157840 + }, + { + "epoch": 22.40596167494677, + "grad_norm": 0.008178537711501122, + "learning_rate": 7.760354861603975e-05, + "loss": 0.011060686409473419, + "step": 157850 + }, + { + "epoch": 22.40738112136267, + "grad_norm": 10.046416282653809, + "learning_rate": 7.760212916962384e-05, + "loss": 0.012067420035600662, + "step": 157860 + }, + { + "epoch": 22.408800567778567, + "grad_norm": 0.021307114511728287, + "learning_rate": 7.760070972320795e-05, + "loss": 0.00683537945151329, + "step": 157870 + }, + { + "epoch": 22.410220014194465, + "grad_norm": 0.02506701648235321, + "learning_rate": 7.759929027679206e-05, + "loss": 0.010639746487140656, + "step": 157880 + }, + { + "epoch": 22.411639460610363, + "grad_norm": 0.010163038969039917, + "learning_rate": 7.759787083037616e-05, + "loss": 0.0031052429229021074, + "step": 157890 + }, + { + "epoch": 22.413058907026258, + "grad_norm": 0.27794021368026733, + "learning_rate": 7.759645138396026e-05, + "loss": 0.02686570882797241, + "step": 157900 + }, + { + "epoch": 22.414478353442156, + "grad_norm": 0.7705673575401306, + "learning_rate": 7.759503193754436e-05, + "loss": 0.002600378543138504, + "step": 157910 + }, + { + "epoch": 22.415897799858055, + "grad_norm": 1.0360749959945679, + "learning_rate": 7.759361249112847e-05, + "loss": 0.01664630025625229, + "step": 157920 + }, + { + "epoch": 22.417317246273953, + "grad_norm": 0.02189263142645359, + "learning_rate": 7.759219304471257e-05, + "loss": 0.011403093487024308, + "step": 157930 + }, + { + "epoch": 22.41873669268985, + "grad_norm": 0.5252323746681213, + "learning_rate": 7.759077359829668e-05, + "loss": 0.01529238373041153, + "step": 157940 + }, + { + "epoch": 22.42015613910575, + "grad_norm": 0.05122009664773941, + "learning_rate": 7.758935415188076e-05, + "loss": 0.03153864443302155, + "step": 157950 + }, + { + "epoch": 22.421575585521648, + "grad_norm": 1.7206672430038452, + "learning_rate": 7.758793470546487e-05, + "loss": 0.03539736866950989, + "step": 157960 
+ }, + { + "epoch": 22.422995031937543, + "grad_norm": 17.414770126342773, + "learning_rate": 7.758651525904898e-05, + "loss": 0.012664547562599182, + "step": 157970 + }, + { + "epoch": 22.42441447835344, + "grad_norm": 0.1499902456998825, + "learning_rate": 7.758509581263308e-05, + "loss": 0.004160892963409424, + "step": 157980 + }, + { + "epoch": 22.42583392476934, + "grad_norm": 1.0237393379211426, + "learning_rate": 7.758367636621719e-05, + "loss": 0.006034663692116737, + "step": 157990 + }, + { + "epoch": 22.427253371185238, + "grad_norm": 0.02525465562939644, + "learning_rate": 7.758225691980127e-05, + "loss": 0.009457568824291229, + "step": 158000 + }, + { + "epoch": 22.427253371185238, + "eval_accuracy": 0.988872639409932, + "eval_loss": 0.04059690237045288, + "eval_runtime": 30.8797, + "eval_samples_per_second": 509.299, + "eval_steps_per_second": 15.933, + "step": 158000 + }, + { + "epoch": 22.428672817601136, + "grad_norm": 1.05144202709198, + "learning_rate": 7.758083747338538e-05, + "loss": 0.020673306286334993, + "step": 158010 + }, + { + "epoch": 22.430092264017034, + "grad_norm": 0.028118697926402092, + "learning_rate": 7.757941802696948e-05, + "loss": 0.02086816281080246, + "step": 158020 + }, + { + "epoch": 22.431511710432932, + "grad_norm": 1.2461590766906738, + "learning_rate": 7.757799858055359e-05, + "loss": 0.0015938576310873032, + "step": 158030 + }, + { + "epoch": 22.432931156848827, + "grad_norm": 3.2972943782806396, + "learning_rate": 7.757657913413769e-05, + "loss": 0.04721449911594391, + "step": 158040 + }, + { + "epoch": 22.434350603264726, + "grad_norm": 0.18750934302806854, + "learning_rate": 7.757515968772179e-05, + "loss": 0.02726680040359497, + "step": 158050 + }, + { + "epoch": 22.435770049680624, + "grad_norm": 9.946572303771973, + "learning_rate": 7.75737402413059e-05, + "loss": 0.015767055749893188, + "step": 158060 + }, + { + "epoch": 22.437189496096522, + "grad_norm": 6.135612964630127, + "learning_rate": 7.757232079489e-05, 
+ "loss": 0.008439029008150101, + "step": 158070 + }, + { + "epoch": 22.43860894251242, + "grad_norm": 8.143952369689941, + "learning_rate": 7.757090134847411e-05, + "loss": 0.02583845555782318, + "step": 158080 + }, + { + "epoch": 22.44002838892832, + "grad_norm": 3.006328821182251, + "learning_rate": 7.75694819020582e-05, + "loss": 0.0034192051738500596, + "step": 158090 + }, + { + "epoch": 22.441447835344217, + "grad_norm": 0.10491666197776794, + "learning_rate": 7.75680624556423e-05, + "loss": 0.04079504311084747, + "step": 158100 + }, + { + "epoch": 22.442867281760112, + "grad_norm": 0.08952518552541733, + "learning_rate": 7.75666430092264e-05, + "loss": 0.04543862044811249, + "step": 158110 + }, + { + "epoch": 22.44428672817601, + "grad_norm": 13.04823112487793, + "learning_rate": 7.756522356281051e-05, + "loss": 0.031036949157714842, + "step": 158120 + }, + { + "epoch": 22.44570617459191, + "grad_norm": 0.027029162272810936, + "learning_rate": 7.756380411639461e-05, + "loss": 0.019752249121665955, + "step": 158130 + }, + { + "epoch": 22.447125621007807, + "grad_norm": 6.343795299530029, + "learning_rate": 7.756238466997872e-05, + "loss": 0.07416347861289978, + "step": 158140 + }, + { + "epoch": 22.448545067423705, + "grad_norm": 0.12407232075929642, + "learning_rate": 7.756096522356282e-05, + "loss": 0.006796523928642273, + "step": 158150 + }, + { + "epoch": 22.449964513839603, + "grad_norm": 4.494736671447754, + "learning_rate": 7.755954577714691e-05, + "loss": 0.00697382390499115, + "step": 158160 + }, + { + "epoch": 22.4513839602555, + "grad_norm": 0.2808956801891327, + "learning_rate": 7.755812633073102e-05, + "loss": 0.02482542395591736, + "step": 158170 + }, + { + "epoch": 22.4528034066714, + "grad_norm": 8.806466102600098, + "learning_rate": 7.755670688431512e-05, + "loss": 0.032122132182121274, + "step": 158180 + }, + { + "epoch": 22.454222853087295, + "grad_norm": 0.6270188093185425, + "learning_rate": 7.755528743789923e-05, + "loss": 
0.020539870858192442, + "step": 158190 + }, + { + "epoch": 22.455642299503193, + "grad_norm": 0.035203717648983, + "learning_rate": 7.755386799148332e-05, + "loss": 0.01865440309047699, + "step": 158200 + }, + { + "epoch": 22.45706174591909, + "grad_norm": 1.313195824623108, + "learning_rate": 7.755244854506743e-05, + "loss": 0.002453989535570145, + "step": 158210 + }, + { + "epoch": 22.45848119233499, + "grad_norm": 0.2515464723110199, + "learning_rate": 7.755102909865152e-05, + "loss": 0.01543634533882141, + "step": 158220 + }, + { + "epoch": 22.459900638750888, + "grad_norm": 5.3734893798828125, + "learning_rate": 7.754960965223564e-05, + "loss": 0.0058571044355630875, + "step": 158230 + }, + { + "epoch": 22.461320085166786, + "grad_norm": 0.010522381402552128, + "learning_rate": 7.754819020581973e-05, + "loss": 0.008202619850635529, + "step": 158240 + }, + { + "epoch": 22.462739531582685, + "grad_norm": 0.12635721266269684, + "learning_rate": 7.754677075940384e-05, + "loss": 0.029473838210105897, + "step": 158250 + }, + { + "epoch": 22.46415897799858, + "grad_norm": 0.1094331294298172, + "learning_rate": 7.754535131298794e-05, + "loss": 0.022599954903125764, + "step": 158260 + }, + { + "epoch": 22.465578424414478, + "grad_norm": 0.1387767344713211, + "learning_rate": 7.754393186657204e-05, + "loss": 0.01524360477924347, + "step": 158270 + }, + { + "epoch": 22.466997870830376, + "grad_norm": 8.003771781921387, + "learning_rate": 7.754251242015615e-05, + "loss": 0.044424769282341, + "step": 158280 + }, + { + "epoch": 22.468417317246274, + "grad_norm": 0.07056516408920288, + "learning_rate": 7.754109297374025e-05, + "loss": 0.003380986303091049, + "step": 158290 + }, + { + "epoch": 22.469836763662173, + "grad_norm": 0.11740966886281967, + "learning_rate": 7.753967352732436e-05, + "loss": 0.021326199173927307, + "step": 158300 + }, + { + "epoch": 22.47125621007807, + "grad_norm": 0.016576427966356277, + "learning_rate": 7.753825408090844e-05, + "loss": 
0.005726790800690651, + "step": 158310 + }, + { + "epoch": 22.47267565649397, + "grad_norm": 0.2914753258228302, + "learning_rate": 7.753683463449255e-05, + "loss": 0.00747789517045021, + "step": 158320 + }, + { + "epoch": 22.474095102909864, + "grad_norm": 0.010632489807903767, + "learning_rate": 7.753541518807665e-05, + "loss": 0.014266845583915711, + "step": 158330 + }, + { + "epoch": 22.475514549325762, + "grad_norm": 0.05623609200119972, + "learning_rate": 7.753399574166076e-05, + "loss": 0.06276005506515503, + "step": 158340 + }, + { + "epoch": 22.47693399574166, + "grad_norm": 1.2466684579849243, + "learning_rate": 7.753257629524486e-05, + "loss": 0.014473459124565125, + "step": 158350 + }, + { + "epoch": 22.47835344215756, + "grad_norm": 1.9889719486236572, + "learning_rate": 7.753115684882896e-05, + "loss": 0.018504013121128083, + "step": 158360 + }, + { + "epoch": 22.479772888573457, + "grad_norm": 0.45456525683403015, + "learning_rate": 7.752973740241307e-05, + "loss": 0.03710986971855164, + "step": 158370 + }, + { + "epoch": 22.481192334989355, + "grad_norm": 0.06952834129333496, + "learning_rate": 7.752831795599716e-05, + "loss": 0.020645791292190553, + "step": 158380 + }, + { + "epoch": 22.482611781405254, + "grad_norm": 0.6986912488937378, + "learning_rate": 7.752689850958127e-05, + "loss": 0.028540158271789552, + "step": 158390 + }, + { + "epoch": 22.48403122782115, + "grad_norm": 0.0470876507461071, + "learning_rate": 7.752547906316537e-05, + "loss": 0.014051660895347595, + "step": 158400 + }, + { + "epoch": 22.485450674237047, + "grad_norm": 0.10415098816156387, + "learning_rate": 7.752405961674947e-05, + "loss": 0.0453540712594986, + "step": 158410 + }, + { + "epoch": 22.486870120652945, + "grad_norm": 0.8781419396400452, + "learning_rate": 7.752264017033357e-05, + "loss": 0.002484041452407837, + "step": 158420 + }, + { + "epoch": 22.488289567068843, + "grad_norm": 0.43081602454185486, + "learning_rate": 7.752122072391768e-05, + "loss": 
0.017442478239536284, + "step": 158430 + }, + { + "epoch": 22.48970901348474, + "grad_norm": 0.5540584325790405, + "learning_rate": 7.751980127750178e-05, + "loss": 0.014503073692321778, + "step": 158440 + }, + { + "epoch": 22.49112845990064, + "grad_norm": 0.01617208868265152, + "learning_rate": 7.751838183108589e-05, + "loss": 0.0037552282214164735, + "step": 158450 + }, + { + "epoch": 22.49254790631654, + "grad_norm": 0.03205499425530434, + "learning_rate": 7.751696238466998e-05, + "loss": 0.0021371353417634965, + "step": 158460 + }, + { + "epoch": 22.493967352732433, + "grad_norm": 9.364788055419922, + "learning_rate": 7.751554293825408e-05, + "loss": 0.007025979459285736, + "step": 158470 + }, + { + "epoch": 22.49538679914833, + "grad_norm": 0.1477038860321045, + "learning_rate": 7.751412349183819e-05, + "loss": 0.013539460301399232, + "step": 158480 + }, + { + "epoch": 22.49680624556423, + "grad_norm": 0.04915747791528702, + "learning_rate": 7.751270404542229e-05, + "loss": 0.03261194825172424, + "step": 158490 + }, + { + "epoch": 22.498225691980128, + "grad_norm": 0.06221115216612816, + "learning_rate": 7.75112845990064e-05, + "loss": 0.01314397007226944, + "step": 158500 + }, + { + "epoch": 22.498225691980128, + "eval_accuracy": 0.9905894321866853, + "eval_loss": 0.03792344406247139, + "eval_runtime": 31.1211, + "eval_samples_per_second": 505.349, + "eval_steps_per_second": 15.809, + "step": 158500 + }, + { + "epoch": 22.499645138396026, + "grad_norm": 0.07033000886440277, + "learning_rate": 7.750986515259048e-05, + "loss": 0.045132437348365785, + "step": 158510 + }, + { + "epoch": 22.501064584811925, + "grad_norm": 0.11388713866472244, + "learning_rate": 7.75084457061746e-05, + "loss": 0.022923552989959718, + "step": 158520 + }, + { + "epoch": 22.502484031227823, + "grad_norm": 1.0214308500289917, + "learning_rate": 7.750702625975869e-05, + "loss": 0.009690790623426437, + "step": 158530 + }, + { + "epoch": 22.503903477643718, + "grad_norm": 
0.7464064359664917, + "learning_rate": 7.75056068133428e-05, + "loss": 0.009011165797710418, + "step": 158540 + }, + { + "epoch": 22.505322924059616, + "grad_norm": 0.040032073855400085, + "learning_rate": 7.75041873669269e-05, + "loss": 0.005155869573354721, + "step": 158550 + }, + { + "epoch": 22.506742370475514, + "grad_norm": 0.19786891341209412, + "learning_rate": 7.7502767920511e-05, + "loss": 0.02071729153394699, + "step": 158560 + }, + { + "epoch": 22.508161816891413, + "grad_norm": 4.149325847625732, + "learning_rate": 7.750134847409511e-05, + "loss": 0.013772135972976685, + "step": 158570 + }, + { + "epoch": 22.50958126330731, + "grad_norm": 1.3576583862304688, + "learning_rate": 7.74999290276792e-05, + "loss": 0.025132346153259277, + "step": 158580 + }, + { + "epoch": 22.51100070972321, + "grad_norm": 7.1251139640808105, + "learning_rate": 7.749850958126332e-05, + "loss": 0.022010709345340728, + "step": 158590 + }, + { + "epoch": 22.512420156139108, + "grad_norm": 0.07827175408601761, + "learning_rate": 7.749709013484741e-05, + "loss": 0.0014567647129297256, + "step": 158600 + }, + { + "epoch": 22.513839602555002, + "grad_norm": 0.11930258572101593, + "learning_rate": 7.749567068843153e-05, + "loss": 0.018433178961277007, + "step": 158610 + }, + { + "epoch": 22.5152590489709, + "grad_norm": 10.317915916442871, + "learning_rate": 7.749425124201561e-05, + "loss": 0.00852893590927124, + "step": 158620 + }, + { + "epoch": 22.5166784953868, + "grad_norm": 0.2808504104614258, + "learning_rate": 7.749283179559972e-05, + "loss": 0.031204766035079955, + "step": 158630 + }, + { + "epoch": 22.518097941802697, + "grad_norm": 1.3872064352035522, + "learning_rate": 7.749141234918382e-05, + "loss": 0.028063207864761353, + "step": 158640 + }, + { + "epoch": 22.519517388218595, + "grad_norm": 0.13982322812080383, + "learning_rate": 7.748999290276793e-05, + "loss": 0.00934479534626007, + "step": 158650 + }, + { + "epoch": 22.520936834634494, + "grad_norm": 
3.612175464630127, + "learning_rate": 7.748857345635203e-05, + "loss": 0.01334807276725769, + "step": 158660 + }, + { + "epoch": 22.522356281050392, + "grad_norm": 11.551177978515625, + "learning_rate": 7.748715400993612e-05, + "loss": 0.03358421921730041, + "step": 158670 + }, + { + "epoch": 22.523775727466287, + "grad_norm": 0.25505468249320984, + "learning_rate": 7.748573456352023e-05, + "loss": 0.020181326568126677, + "step": 158680 + }, + { + "epoch": 22.525195173882185, + "grad_norm": 0.2256714254617691, + "learning_rate": 7.748431511710433e-05, + "loss": 0.02636730968952179, + "step": 158690 + }, + { + "epoch": 22.526614620298083, + "grad_norm": 0.03345772251486778, + "learning_rate": 7.748289567068844e-05, + "loss": 0.001260707899928093, + "step": 158700 + }, + { + "epoch": 22.528034066713982, + "grad_norm": 8.43707275390625, + "learning_rate": 7.748147622427254e-05, + "loss": 0.03548075556755066, + "step": 158710 + }, + { + "epoch": 22.52945351312988, + "grad_norm": 0.5538367629051208, + "learning_rate": 7.748005677785664e-05, + "loss": 0.008879843354225158, + "step": 158720 + }, + { + "epoch": 22.53087295954578, + "grad_norm": 0.04493587091565132, + "learning_rate": 7.747863733144073e-05, + "loss": 0.04076599776744842, + "step": 158730 + }, + { + "epoch": 22.532292405961677, + "grad_norm": 0.19090299308300018, + "learning_rate": 7.747721788502485e-05, + "loss": 0.022210666537284852, + "step": 158740 + }, + { + "epoch": 22.53371185237757, + "grad_norm": 0.11153759062290192, + "learning_rate": 7.747579843860894e-05, + "loss": 0.03516617119312286, + "step": 158750 + }, + { + "epoch": 22.53513129879347, + "grad_norm": 1.7377121448516846, + "learning_rate": 7.747437899219305e-05, + "loss": 0.04072195291519165, + "step": 158760 + }, + { + "epoch": 22.536550745209368, + "grad_norm": 0.06686928123235703, + "learning_rate": 7.747295954577715e-05, + "loss": 0.0030821334570646287, + "step": 158770 + }, + { + "epoch": 22.537970191625266, + "grad_norm": 
0.14707443118095398, + "learning_rate": 7.747154009936125e-05, + "loss": 0.0207765594124794, + "step": 158780 + }, + { + "epoch": 22.539389638041165, + "grad_norm": 0.17583338916301727, + "learning_rate": 7.747012065294536e-05, + "loss": 0.013034404814243316, + "step": 158790 + }, + { + "epoch": 22.540809084457063, + "grad_norm": 1.6991326808929443, + "learning_rate": 7.746870120652946e-05, + "loss": 0.00810532197356224, + "step": 158800 + }, + { + "epoch": 22.54222853087296, + "grad_norm": 0.026322200894355774, + "learning_rate": 7.746728176011357e-05, + "loss": 0.006932845711708069, + "step": 158810 + }, + { + "epoch": 22.543647977288856, + "grad_norm": 1.8408856391906738, + "learning_rate": 7.746586231369765e-05, + "loss": 0.011511996388435364, + "step": 158820 + }, + { + "epoch": 22.545067423704754, + "grad_norm": 0.10930416733026505, + "learning_rate": 7.746444286728176e-05, + "loss": 0.03214123249053955, + "step": 158830 + }, + { + "epoch": 22.546486870120653, + "grad_norm": 0.5288762450218201, + "learning_rate": 7.746302342086586e-05, + "loss": 0.005965733900666237, + "step": 158840 + }, + { + "epoch": 22.54790631653655, + "grad_norm": 1.6479065418243408, + "learning_rate": 7.746160397444997e-05, + "loss": 0.024427339434623718, + "step": 158850 + }, + { + "epoch": 22.54932576295245, + "grad_norm": 0.07508545368909836, + "learning_rate": 7.746018452803407e-05, + "loss": 0.0033032428473234177, + "step": 158860 + }, + { + "epoch": 22.550745209368348, + "grad_norm": 0.6447070837020874, + "learning_rate": 7.745876508161817e-05, + "loss": 0.01718648076057434, + "step": 158870 + }, + { + "epoch": 22.552164655784246, + "grad_norm": 0.5143861770629883, + "learning_rate": 7.745734563520228e-05, + "loss": 0.00394304022192955, + "step": 158880 + }, + { + "epoch": 22.55358410220014, + "grad_norm": 1.8426095247268677, + "learning_rate": 7.745592618878637e-05, + "loss": 0.058056455850601194, + "step": 158890 + }, + { + "epoch": 22.55500354861604, + "grad_norm": 
4.245222568511963, + "learning_rate": 7.745450674237048e-05, + "loss": 0.027254462242126465, + "step": 158900 + }, + { + "epoch": 22.556422995031937, + "grad_norm": 0.11887446790933609, + "learning_rate": 7.745308729595458e-05, + "loss": 0.034078973531723025, + "step": 158910 + }, + { + "epoch": 22.557842441447836, + "grad_norm": 7.1803059577941895, + "learning_rate": 7.745166784953868e-05, + "loss": 0.024916628003120424, + "step": 158920 + }, + { + "epoch": 22.559261887863734, + "grad_norm": 1.6360132694244385, + "learning_rate": 7.745024840312278e-05, + "loss": 0.003317312523722649, + "step": 158930 + }, + { + "epoch": 22.560681334279632, + "grad_norm": 0.07002828270196915, + "learning_rate": 7.744882895670689e-05, + "loss": 0.006323719024658203, + "step": 158940 + }, + { + "epoch": 22.56210078069553, + "grad_norm": 10.601655006408691, + "learning_rate": 7.744740951029099e-05, + "loss": 0.023100095987319946, + "step": 158950 + }, + { + "epoch": 22.563520227111425, + "grad_norm": 0.22745627164840698, + "learning_rate": 7.74459900638751e-05, + "loss": 0.007169928401708603, + "step": 158960 + }, + { + "epoch": 22.564939673527324, + "grad_norm": 0.15357869863510132, + "learning_rate": 7.74445706174592e-05, + "loss": 0.0020546797662973405, + "step": 158970 + }, + { + "epoch": 22.566359119943222, + "grad_norm": 8.447617530822754, + "learning_rate": 7.744315117104329e-05, + "loss": 0.017369568347930908, + "step": 158980 + }, + { + "epoch": 22.56777856635912, + "grad_norm": 13.05282974243164, + "learning_rate": 7.74417317246274e-05, + "loss": 0.025463995337486268, + "step": 158990 + }, + { + "epoch": 22.56919801277502, + "grad_norm": 0.2755369246006012, + "learning_rate": 7.74403122782115e-05, + "loss": 0.011595487594604492, + "step": 159000 + }, + { + "epoch": 22.56919801277502, + "eval_accuracy": 0.9889362243275895, + "eval_loss": 0.03800418600440025, + "eval_runtime": 30.7885, + "eval_samples_per_second": 510.807, + "eval_steps_per_second": 15.98, + "step": 159000 + 
}, + { + "epoch": 22.570617459190917, + "grad_norm": 1.6850870847702026, + "learning_rate": 7.743889283179561e-05, + "loss": 0.025997638702392578, + "step": 159010 + }, + { + "epoch": 22.572036905606815, + "grad_norm": 2.7293012142181396, + "learning_rate": 7.743747338537971e-05, + "loss": 0.003980817273259163, + "step": 159020 + }, + { + "epoch": 22.57345635202271, + "grad_norm": 0.5372725129127502, + "learning_rate": 7.74360539389638e-05, + "loss": 0.01969671994447708, + "step": 159030 + }, + { + "epoch": 22.574875798438608, + "grad_norm": 5.794559001922607, + "learning_rate": 7.74346344925479e-05, + "loss": 0.024230395257472993, + "step": 159040 + }, + { + "epoch": 22.576295244854506, + "grad_norm": 12.547741889953613, + "learning_rate": 7.743321504613201e-05, + "loss": 0.018874749541282654, + "step": 159050 + }, + { + "epoch": 22.577714691270405, + "grad_norm": 6.733235836029053, + "learning_rate": 7.743179559971611e-05, + "loss": 0.03471670150756836, + "step": 159060 + }, + { + "epoch": 22.579134137686303, + "grad_norm": 19.456722259521484, + "learning_rate": 7.743037615330022e-05, + "loss": 0.028678667545318604, + "step": 159070 + }, + { + "epoch": 22.5805535841022, + "grad_norm": 0.05204786732792854, + "learning_rate": 7.742895670688432e-05, + "loss": 0.012357182800769806, + "step": 159080 + }, + { + "epoch": 22.5819730305181, + "grad_norm": 0.012773375026881695, + "learning_rate": 7.742753726046842e-05, + "loss": 0.030895236134529113, + "step": 159090 + }, + { + "epoch": 22.583392476933994, + "grad_norm": 13.81541919708252, + "learning_rate": 7.742611781405253e-05, + "loss": 0.007949428260326385, + "step": 159100 + }, + { + "epoch": 22.584811923349893, + "grad_norm": 6.0529398918151855, + "learning_rate": 7.742469836763662e-05, + "loss": 0.028233027458190917, + "step": 159110 + }, + { + "epoch": 22.58623136976579, + "grad_norm": 11.985567092895508, + "learning_rate": 7.742327892122074e-05, + "loss": 0.04349898099899292, + "step": 159120 + }, + { + "epoch": 
22.58765081618169, + "grad_norm": 0.005835610441863537, + "learning_rate": 7.742185947480482e-05, + "loss": 0.012536582350730897, + "step": 159130 + }, + { + "epoch": 22.589070262597588, + "grad_norm": 3.6895742416381836, + "learning_rate": 7.742044002838893e-05, + "loss": 0.01597345918416977, + "step": 159140 + }, + { + "epoch": 22.590489709013486, + "grad_norm": 1.2924069166183472, + "learning_rate": 7.741902058197303e-05, + "loss": 0.011831177771091462, + "step": 159150 + }, + { + "epoch": 22.591909155429384, + "grad_norm": 1.0199419260025024, + "learning_rate": 7.741760113555714e-05, + "loss": 0.031389302015304564, + "step": 159160 + }, + { + "epoch": 22.59332860184528, + "grad_norm": 0.8326813578605652, + "learning_rate": 7.741618168914125e-05, + "loss": 0.00220700278878212, + "step": 159170 + }, + { + "epoch": 22.594748048261177, + "grad_norm": 0.020369822159409523, + "learning_rate": 7.741476224272533e-05, + "loss": 0.009619846940040588, + "step": 159180 + }, + { + "epoch": 22.596167494677076, + "grad_norm": 0.026509283110499382, + "learning_rate": 7.741334279630944e-05, + "loss": 0.018055228888988493, + "step": 159190 + }, + { + "epoch": 22.597586941092974, + "grad_norm": 0.20580554008483887, + "learning_rate": 7.741192334989354e-05, + "loss": 0.028126612305641174, + "step": 159200 + }, + { + "epoch": 22.599006387508872, + "grad_norm": 0.03719627112150192, + "learning_rate": 7.741050390347765e-05, + "loss": 0.005551586300134659, + "step": 159210 + }, + { + "epoch": 22.60042583392477, + "grad_norm": 0.05747586488723755, + "learning_rate": 7.740908445706175e-05, + "loss": 0.04191165566444397, + "step": 159220 + }, + { + "epoch": 22.60184528034067, + "grad_norm": 0.09290939569473267, + "learning_rate": 7.740766501064585e-05, + "loss": 0.0018793828785419464, + "step": 159230 + }, + { + "epoch": 22.603264726756564, + "grad_norm": 0.5466276407241821, + "learning_rate": 7.740624556422994e-05, + "loss": 0.005200093612074852, + "step": 159240 + }, + { + "epoch": 
22.604684173172462, + "grad_norm": 3.716014862060547, + "learning_rate": 7.740482611781406e-05, + "loss": 0.01997900754213333, + "step": 159250 + }, + { + "epoch": 22.60610361958836, + "grad_norm": 0.8647883534431458, + "learning_rate": 7.740340667139817e-05, + "loss": 0.0025421187281608583, + "step": 159260 + }, + { + "epoch": 22.60752306600426, + "grad_norm": 0.22152212262153625, + "learning_rate": 7.740198722498226e-05, + "loss": 0.0053821496665477754, + "step": 159270 + }, + { + "epoch": 22.608942512420157, + "grad_norm": 1.6946345567703247, + "learning_rate": 7.740056777856636e-05, + "loss": 0.0074778318405151365, + "step": 159280 + }, + { + "epoch": 22.610361958836055, + "grad_norm": 0.10071541368961334, + "learning_rate": 7.739914833215046e-05, + "loss": 0.020376217365264893, + "step": 159290 + }, + { + "epoch": 22.611781405251953, + "grad_norm": 1.932011604309082, + "learning_rate": 7.739772888573457e-05, + "loss": 0.039461395144462584, + "step": 159300 + }, + { + "epoch": 22.613200851667848, + "grad_norm": 0.029205817729234695, + "learning_rate": 7.739630943931867e-05, + "loss": 0.0037381209433078764, + "step": 159310 + }, + { + "epoch": 22.614620298083747, + "grad_norm": 0.16336078941822052, + "learning_rate": 7.739488999290278e-05, + "loss": 0.028498396277427673, + "step": 159320 + }, + { + "epoch": 22.616039744499645, + "grad_norm": 0.4874585270881653, + "learning_rate": 7.739347054648688e-05, + "loss": 0.006881621479988098, + "step": 159330 + }, + { + "epoch": 22.617459190915543, + "grad_norm": 5.073184967041016, + "learning_rate": 7.739205110007097e-05, + "loss": 0.019182351231575013, + "step": 159340 + }, + { + "epoch": 22.61887863733144, + "grad_norm": 0.9553701877593994, + "learning_rate": 7.739063165365508e-05, + "loss": 0.004613106325268746, + "step": 159350 + }, + { + "epoch": 22.62029808374734, + "grad_norm": 0.45802977681159973, + "learning_rate": 7.738921220723918e-05, + "loss": 0.006418612599372864, + "step": 159360 + }, + { + "epoch": 
22.621717530163238, + "grad_norm": 1.099333643913269, + "learning_rate": 7.738779276082329e-05, + "loss": 0.025840193033218384, + "step": 159370 + }, + { + "epoch": 22.623136976579133, + "grad_norm": 0.01409867499023676, + "learning_rate": 7.738637331440739e-05, + "loss": 0.011579766869544983, + "step": 159380 + }, + { + "epoch": 22.62455642299503, + "grad_norm": 0.032323870807886124, + "learning_rate": 7.738495386799149e-05, + "loss": 0.025756067037582396, + "step": 159390 + }, + { + "epoch": 22.62597586941093, + "grad_norm": 11.710440635681152, + "learning_rate": 7.738353442157558e-05, + "loss": 0.030228179693222047, + "step": 159400 + }, + { + "epoch": 22.627395315826828, + "grad_norm": 0.09593667834997177, + "learning_rate": 7.73821149751597e-05, + "loss": 0.016091972589492798, + "step": 159410 + }, + { + "epoch": 22.628814762242726, + "grad_norm": 0.2888541519641876, + "learning_rate": 7.738069552874379e-05, + "loss": 0.013794219493865967, + "step": 159420 + }, + { + "epoch": 22.630234208658624, + "grad_norm": 4.673342704772949, + "learning_rate": 7.73792760823279e-05, + "loss": 0.034199610352516174, + "step": 159430 + }, + { + "epoch": 22.631653655074523, + "grad_norm": 0.113935187458992, + "learning_rate": 7.737785663591199e-05, + "loss": 0.003508252277970314, + "step": 159440 + }, + { + "epoch": 22.633073101490417, + "grad_norm": 0.00997981708496809, + "learning_rate": 7.73764371894961e-05, + "loss": 0.07730802297592163, + "step": 159450 + }, + { + "epoch": 22.634492547906316, + "grad_norm": 0.12752574682235718, + "learning_rate": 7.737501774308021e-05, + "loss": 0.005117392539978028, + "step": 159460 + }, + { + "epoch": 22.635911994322214, + "grad_norm": 0.4262155592441559, + "learning_rate": 7.73735982966643e-05, + "loss": 0.024055181443691252, + "step": 159470 + }, + { + "epoch": 22.637331440738112, + "grad_norm": 0.08212634176015854, + "learning_rate": 7.737217885024842e-05, + "loss": 0.009031610190868377, + "step": 159480 + }, + { + "epoch": 
22.63875088715401, + "grad_norm": 0.1315786987543106, + "learning_rate": 7.73707594038325e-05, + "loss": 0.01506779044866562, + "step": 159490 + }, + { + "epoch": 22.64017033356991, + "grad_norm": 0.3355313837528229, + "learning_rate": 7.736933995741661e-05, + "loss": 0.0398618757724762, + "step": 159500 + }, + { + "epoch": 22.64017033356991, + "eval_accuracy": 0.9896992433394799, + "eval_loss": 0.03425000235438347, + "eval_runtime": 32.0529, + "eval_samples_per_second": 490.658, + "eval_steps_per_second": 15.35, + "step": 159500 + }, + { + "epoch": 22.641589779985807, + "grad_norm": 0.05310266837477684, + "learning_rate": 7.736792051100071e-05, + "loss": 0.02315482646226883, + "step": 159510 + }, + { + "epoch": 22.643009226401702, + "grad_norm": 13.616758346557617, + "learning_rate": 7.736650106458482e-05, + "loss": 0.01896231323480606, + "step": 159520 + }, + { + "epoch": 22.6444286728176, + "grad_norm": 0.3994249403476715, + "learning_rate": 7.736508161816892e-05, + "loss": 0.0033166803419589995, + "step": 159530 + }, + { + "epoch": 22.6458481192335, + "grad_norm": 0.2235090285539627, + "learning_rate": 7.736366217175302e-05, + "loss": 0.014450803399085999, + "step": 159540 + }, + { + "epoch": 22.647267565649397, + "grad_norm": 0.21497444808483124, + "learning_rate": 7.736224272533713e-05, + "loss": 0.056241470575332644, + "step": 159550 + }, + { + "epoch": 22.648687012065295, + "grad_norm": 9.174455642700195, + "learning_rate": 7.736082327892122e-05, + "loss": 0.04912766218185425, + "step": 159560 + }, + { + "epoch": 22.650106458481194, + "grad_norm": 0.056745413690805435, + "learning_rate": 7.735940383250533e-05, + "loss": 0.023115354776382446, + "step": 159570 + }, + { + "epoch": 22.651525904897092, + "grad_norm": 0.023714296519756317, + "learning_rate": 7.735798438608943e-05, + "loss": 0.00983305498957634, + "step": 159580 + }, + { + "epoch": 22.652945351312987, + "grad_norm": 0.7644844055175781, + "learning_rate": 7.735656493967353e-05, + "loss": 
0.01133526861667633, + "step": 159590 + }, + { + "epoch": 22.654364797728885, + "grad_norm": 2.8843109607696533, + "learning_rate": 7.735514549325763e-05, + "loss": 0.010176539421081543, + "step": 159600 + }, + { + "epoch": 22.655784244144783, + "grad_norm": 2.3378593921661377, + "learning_rate": 7.735372604684174e-05, + "loss": 0.0028312966227531434, + "step": 159610 + }, + { + "epoch": 22.65720369056068, + "grad_norm": 0.12521985173225403, + "learning_rate": 7.735230660042583e-05, + "loss": 0.010271473228931427, + "step": 159620 + }, + { + "epoch": 22.65862313697658, + "grad_norm": 0.021498097106814384, + "learning_rate": 7.735088715400995e-05, + "loss": 0.018469969928264617, + "step": 159630 + }, + { + "epoch": 22.660042583392478, + "grad_norm": 0.34795081615448, + "learning_rate": 7.734946770759404e-05, + "loss": 0.0032161388546228407, + "step": 159640 + }, + { + "epoch": 22.661462029808376, + "grad_norm": 0.06621494889259338, + "learning_rate": 7.734804826117814e-05, + "loss": 0.010059913247823715, + "step": 159650 + }, + { + "epoch": 22.66288147622427, + "grad_norm": 15.615278244018555, + "learning_rate": 7.734662881476225e-05, + "loss": 0.04272184371948242, + "step": 159660 + }, + { + "epoch": 22.66430092264017, + "grad_norm": 2.4798178672790527, + "learning_rate": 7.734520936834635e-05, + "loss": 0.016018617153167724, + "step": 159670 + }, + { + "epoch": 22.665720369056068, + "grad_norm": 0.4926806390285492, + "learning_rate": 7.734378992193046e-05, + "loss": 0.026209577918052673, + "step": 159680 + }, + { + "epoch": 22.667139815471966, + "grad_norm": 0.6387996077537537, + "learning_rate": 7.734237047551456e-05, + "loss": 0.016200077533721925, + "step": 159690 + }, + { + "epoch": 22.668559261887864, + "grad_norm": 0.030517011880874634, + "learning_rate": 7.734095102909865e-05, + "loss": 0.03894783556461334, + "step": 159700 + }, + { + "epoch": 22.669978708303763, + "grad_norm": 26.73149299621582, + "learning_rate": 7.733953158268275e-05, + "loss": 
0.04062844216823578, + "step": 159710 + }, + { + "epoch": 22.67139815471966, + "grad_norm": 8.793081283569336, + "learning_rate": 7.733811213626686e-05, + "loss": 0.04902389645576477, + "step": 159720 + }, + { + "epoch": 22.672817601135556, + "grad_norm": 15.691280364990234, + "learning_rate": 7.733669268985096e-05, + "loss": 0.037295737862586976, + "step": 159730 + }, + { + "epoch": 22.674237047551454, + "grad_norm": 1.1893489360809326, + "learning_rate": 7.733527324343507e-05, + "loss": 0.007760918140411377, + "step": 159740 + }, + { + "epoch": 22.675656493967352, + "grad_norm": 10.579549789428711, + "learning_rate": 7.733385379701917e-05, + "loss": 0.03190224170684815, + "step": 159750 + }, + { + "epoch": 22.67707594038325, + "grad_norm": 0.38818302750587463, + "learning_rate": 7.733243435060327e-05, + "loss": 0.031702494621276854, + "step": 159760 + }, + { + "epoch": 22.67849538679915, + "grad_norm": 5.017606258392334, + "learning_rate": 7.733101490418738e-05, + "loss": 0.05229347348213196, + "step": 159770 + }, + { + "epoch": 22.679914833215047, + "grad_norm": 4.5757551193237305, + "learning_rate": 7.732959545777147e-05, + "loss": 0.022170212864875794, + "step": 159780 + }, + { + "epoch": 22.681334279630946, + "grad_norm": 0.004683753475546837, + "learning_rate": 7.732817601135558e-05, + "loss": 0.027418911457061768, + "step": 159790 + }, + { + "epoch": 22.68275372604684, + "grad_norm": 7.092657089233398, + "learning_rate": 7.732675656493967e-05, + "loss": 0.03169282078742981, + "step": 159800 + }, + { + "epoch": 22.68417317246274, + "grad_norm": 0.23998382687568665, + "learning_rate": 7.732533711852378e-05, + "loss": 0.004967937618494034, + "step": 159810 + }, + { + "epoch": 22.685592618878637, + "grad_norm": 1.8321422338485718, + "learning_rate": 7.732391767210788e-05, + "loss": 0.013894245028495789, + "step": 159820 + }, + { + "epoch": 22.687012065294535, + "grad_norm": 5.097275257110596, + "learning_rate": 7.732249822569199e-05, + "loss": 
0.02511633336544037, + "step": 159830 + }, + { + "epoch": 22.688431511710434, + "grad_norm": 7.659380912780762, + "learning_rate": 7.732107877927609e-05, + "loss": 0.00847838968038559, + "step": 159840 + }, + { + "epoch": 22.689850958126332, + "grad_norm": 1.2720708847045898, + "learning_rate": 7.731965933286018e-05, + "loss": 0.018764455616474152, + "step": 159850 + }, + { + "epoch": 22.69127040454223, + "grad_norm": 0.012540793046355247, + "learning_rate": 7.73182398864443e-05, + "loss": 0.01049247831106186, + "step": 159860 + }, + { + "epoch": 22.692689850958125, + "grad_norm": 0.017372870817780495, + "learning_rate": 7.731682044002839e-05, + "loss": 0.058463698625564574, + "step": 159870 + }, + { + "epoch": 22.694109297374023, + "grad_norm": 0.02809220366179943, + "learning_rate": 7.73154009936125e-05, + "loss": 0.014134123921394348, + "step": 159880 + }, + { + "epoch": 22.69552874378992, + "grad_norm": 11.334383010864258, + "learning_rate": 7.73139815471966e-05, + "loss": 0.036473029851913454, + "step": 159890 + }, + { + "epoch": 22.69694819020582, + "grad_norm": 1.9914966821670532, + "learning_rate": 7.73125621007807e-05, + "loss": 0.012525233626365661, + "step": 159900 + }, + { + "epoch": 22.698367636621718, + "grad_norm": 0.021400459110736847, + "learning_rate": 7.73111426543648e-05, + "loss": 0.025075027346611024, + "step": 159910 + }, + { + "epoch": 22.699787083037616, + "grad_norm": 0.09523364156484604, + "learning_rate": 7.73097232079489e-05, + "loss": 0.033366695046424866, + "step": 159920 + }, + { + "epoch": 22.701206529453515, + "grad_norm": 0.08532078564167023, + "learning_rate": 7.7308303761533e-05, + "loss": 0.042099347710609435, + "step": 159930 + }, + { + "epoch": 22.70262597586941, + "grad_norm": 8.521576881408691, + "learning_rate": 7.730688431511711e-05, + "loss": 0.04827213883399963, + "step": 159940 + }, + { + "epoch": 22.704045422285308, + "grad_norm": 2.167962074279785, + "learning_rate": 7.730546486870121e-05, + "loss": 
0.013536466658115387, + "step": 159950 + }, + { + "epoch": 22.705464868701206, + "grad_norm": 5.594449996948242, + "learning_rate": 7.730404542228531e-05, + "loss": 0.012618489563465118, + "step": 159960 + }, + { + "epoch": 22.706884315117104, + "grad_norm": 0.3212679624557495, + "learning_rate": 7.730262597586942e-05, + "loss": 0.015288057923316955, + "step": 159970 + }, + { + "epoch": 22.708303761533003, + "grad_norm": 0.13467688858509064, + "learning_rate": 7.730120652945352e-05, + "loss": 0.008196897804737091, + "step": 159980 + }, + { + "epoch": 22.7097232079489, + "grad_norm": 0.9096996188163757, + "learning_rate": 7.729978708303763e-05, + "loss": 0.06215350031852722, + "step": 159990 + }, + { + "epoch": 22.7111426543648, + "grad_norm": 2.5159287452697754, + "learning_rate": 7.729836763662171e-05, + "loss": 0.0030499514192342757, + "step": 160000 + }, + { + "epoch": 22.7111426543648, + "eval_accuracy": 0.987918865645069, + "eval_loss": 0.0466885082423687, + "eval_runtime": 32.6769, + "eval_samples_per_second": 481.288, + "eval_steps_per_second": 15.057, + "step": 160000 + }, + { + "epoch": 22.712562100780694, + "grad_norm": 7.60798454284668, + "learning_rate": 7.729694819020582e-05, + "loss": 0.006361671537160873, + "step": 160010 + }, + { + "epoch": 22.713981547196592, + "grad_norm": 0.28756624460220337, + "learning_rate": 7.729552874378992e-05, + "loss": 0.005560239776968956, + "step": 160020 + }, + { + "epoch": 22.71540099361249, + "grad_norm": 1.3404912948608398, + "learning_rate": 7.729410929737403e-05, + "loss": 0.002193022519350052, + "step": 160030 + }, + { + "epoch": 22.71682044002839, + "grad_norm": 0.0866628959774971, + "learning_rate": 7.729268985095813e-05, + "loss": 0.0035404741764068605, + "step": 160040 + }, + { + "epoch": 22.718239886444287, + "grad_norm": 0.030149100348353386, + "learning_rate": 7.729127040454224e-05, + "loss": 0.012468218803405762, + "step": 160050 + }, + { + "epoch": 22.719659332860186, + "grad_norm": 1.7158918380737305, + 
"learning_rate": 7.728985095812634e-05, + "loss": 0.0010232627391815186, + "step": 160060 + }, + { + "epoch": 22.721078779276084, + "grad_norm": 0.020403893664479256, + "learning_rate": 7.728843151171043e-05, + "loss": 0.016857782006263734, + "step": 160070 + }, + { + "epoch": 22.72249822569198, + "grad_norm": 2.4056396484375, + "learning_rate": 7.728701206529454e-05, + "loss": 0.029088348150253296, + "step": 160080 + }, + { + "epoch": 22.723917672107877, + "grad_norm": 0.06975699216127396, + "learning_rate": 7.728559261887864e-05, + "loss": 0.004994088783860207, + "step": 160090 + }, + { + "epoch": 22.725337118523775, + "grad_norm": 0.18976786732673645, + "learning_rate": 7.728417317246275e-05, + "loss": 0.0030188210308551787, + "step": 160100 + }, + { + "epoch": 22.726756564939674, + "grad_norm": 2.2341432571411133, + "learning_rate": 7.728275372604684e-05, + "loss": 0.005208659544587135, + "step": 160110 + }, + { + "epoch": 22.728176011355572, + "grad_norm": 0.16124406456947327, + "learning_rate": 7.728133427963095e-05, + "loss": 0.001570281758904457, + "step": 160120 + }, + { + "epoch": 22.72959545777147, + "grad_norm": 0.08787505328655243, + "learning_rate": 7.727991483321504e-05, + "loss": 0.0010072313249111175, + "step": 160130 + }, + { + "epoch": 22.73101490418737, + "grad_norm": 0.0273013636469841, + "learning_rate": 7.727849538679916e-05, + "loss": 0.01428002119064331, + "step": 160140 + }, + { + "epoch": 22.732434350603263, + "grad_norm": 0.01905743032693863, + "learning_rate": 7.727707594038325e-05, + "loss": 0.014527519047260285, + "step": 160150 + }, + { + "epoch": 22.73385379701916, + "grad_norm": 0.17875850200653076, + "learning_rate": 7.727565649396735e-05, + "loss": 0.003752744197845459, + "step": 160160 + }, + { + "epoch": 22.73527324343506, + "grad_norm": 0.13961610198020935, + "learning_rate": 7.727423704755146e-05, + "loss": 0.001430308073759079, + "step": 160170 + }, + { + "epoch": 22.73669268985096, + "grad_norm": 0.28335314989089966, + 
"learning_rate": 7.727281760113556e-05, + "loss": 0.015633009374141693, + "step": 160180 + }, + { + "epoch": 22.738112136266857, + "grad_norm": 0.8718028664588928, + "learning_rate": 7.727139815471967e-05, + "loss": 0.02738465666770935, + "step": 160190 + }, + { + "epoch": 22.739531582682755, + "grad_norm": 0.6878511309623718, + "learning_rate": 7.726997870830377e-05, + "loss": 0.005553923547267914, + "step": 160200 + }, + { + "epoch": 22.740951029098653, + "grad_norm": 0.40028470754623413, + "learning_rate": 7.726855926188786e-05, + "loss": 0.002297991141676903, + "step": 160210 + }, + { + "epoch": 22.742370475514548, + "grad_norm": 0.006008324213325977, + "learning_rate": 7.726713981547196e-05, + "loss": 0.009290139377117156, + "step": 160220 + }, + { + "epoch": 22.743789921930446, + "grad_norm": 0.15749114751815796, + "learning_rate": 7.726572036905607e-05, + "loss": 0.048120349645614624, + "step": 160230 + }, + { + "epoch": 22.745209368346345, + "grad_norm": 0.0845469981431961, + "learning_rate": 7.726430092264017e-05, + "loss": 0.007590672373771668, + "step": 160240 + }, + { + "epoch": 22.746628814762243, + "grad_norm": 7.441592216491699, + "learning_rate": 7.726288147622428e-05, + "loss": 0.007027903944253922, + "step": 160250 + }, + { + "epoch": 22.74804826117814, + "grad_norm": 0.040471382439136505, + "learning_rate": 7.726146202980838e-05, + "loss": 0.014409859478473664, + "step": 160260 + }, + { + "epoch": 22.74946770759404, + "grad_norm": 0.0999637246131897, + "learning_rate": 7.726004258339248e-05, + "loss": 0.008060001581907273, + "step": 160270 + }, + { + "epoch": 22.750887154009938, + "grad_norm": 4.231478691101074, + "learning_rate": 7.725862313697659e-05, + "loss": 0.00417415127158165, + "step": 160280 + }, + { + "epoch": 22.752306600425833, + "grad_norm": 0.15422852337360382, + "learning_rate": 7.725720369056068e-05, + "loss": 0.00941803976893425, + "step": 160290 + }, + { + "epoch": 22.75372604684173, + "grad_norm": 0.1971638798713684, + 
"learning_rate": 7.72557842441448e-05, + "loss": 0.028908944129943846, + "step": 160300 + }, + { + "epoch": 22.75514549325763, + "grad_norm": 1.0679829120635986, + "learning_rate": 7.725436479772888e-05, + "loss": 0.03047631084918976, + "step": 160310 + }, + { + "epoch": 22.756564939673527, + "grad_norm": 0.8671993613243103, + "learning_rate": 7.725294535131299e-05, + "loss": 0.028746581077575682, + "step": 160320 + }, + { + "epoch": 22.757984386089426, + "grad_norm": 2.6174075603485107, + "learning_rate": 7.725152590489709e-05, + "loss": 0.03653534948825836, + "step": 160330 + }, + { + "epoch": 22.759403832505324, + "grad_norm": 0.36605095863342285, + "learning_rate": 7.72501064584812e-05, + "loss": 0.008347028493881225, + "step": 160340 + }, + { + "epoch": 22.760823278921222, + "grad_norm": 2.4115898609161377, + "learning_rate": 7.72486870120653e-05, + "loss": 0.022666722536087036, + "step": 160350 + }, + { + "epoch": 22.762242725337117, + "grad_norm": 3.804072141647339, + "learning_rate": 7.724726756564939e-05, + "loss": 0.02918463945388794, + "step": 160360 + }, + { + "epoch": 22.763662171753015, + "grad_norm": 0.11534058302640915, + "learning_rate": 7.72458481192335e-05, + "loss": 0.0055793851613998415, + "step": 160370 + }, + { + "epoch": 22.765081618168914, + "grad_norm": 0.06028597429394722, + "learning_rate": 7.72444286728176e-05, + "loss": 0.002206745743751526, + "step": 160380 + }, + { + "epoch": 22.766501064584812, + "grad_norm": 3.405935764312744, + "learning_rate": 7.724300922640171e-05, + "loss": 0.03635244369506836, + "step": 160390 + }, + { + "epoch": 22.76792051100071, + "grad_norm": 11.555081367492676, + "learning_rate": 7.724158977998581e-05, + "loss": 0.014376814663410186, + "step": 160400 + }, + { + "epoch": 22.76933995741661, + "grad_norm": 0.7535704970359802, + "learning_rate": 7.724017033356992e-05, + "loss": 0.037115943431854245, + "step": 160410 + }, + { + "epoch": 22.770759403832507, + "grad_norm": 0.6693304777145386, + "learning_rate": 
7.7238750887154e-05, + "loss": 0.005752077326178551, + "step": 160420 + }, + { + "epoch": 22.7721788502484, + "grad_norm": 0.9475473165512085, + "learning_rate": 7.723733144073812e-05, + "loss": 0.07100241780281066, + "step": 160430 + }, + { + "epoch": 22.7735982966643, + "grad_norm": 2.9929866790771484, + "learning_rate": 7.723591199432221e-05, + "loss": 0.015220768749713898, + "step": 160440 + }, + { + "epoch": 22.7750177430802, + "grad_norm": 4.685712814331055, + "learning_rate": 7.723449254790632e-05, + "loss": 0.0264895498752594, + "step": 160450 + }, + { + "epoch": 22.776437189496097, + "grad_norm": 0.021241342648863792, + "learning_rate": 7.723307310149042e-05, + "loss": 0.005869306251406669, + "step": 160460 + }, + { + "epoch": 22.777856635911995, + "grad_norm": 13.127159118652344, + "learning_rate": 7.723165365507452e-05, + "loss": 0.024513253569602968, + "step": 160470 + }, + { + "epoch": 22.779276082327893, + "grad_norm": 0.04513964429497719, + "learning_rate": 7.723023420865863e-05, + "loss": 0.0435330331325531, + "step": 160480 + }, + { + "epoch": 22.78069552874379, + "grad_norm": 2.811532974243164, + "learning_rate": 7.722881476224273e-05, + "loss": 0.006350237876176834, + "step": 160490 + }, + { + "epoch": 22.782114975159686, + "grad_norm": 1.497002363204956, + "learning_rate": 7.722739531582684e-05, + "loss": 0.02161119282245636, + "step": 160500 + }, + { + "epoch": 22.782114975159686, + "eval_accuracy": 0.9857569784447129, + "eval_loss": 0.05360054969787598, + "eval_runtime": 33.288, + "eval_samples_per_second": 472.453, + "eval_steps_per_second": 14.78, + "step": 160500 + }, + { + "epoch": 22.783534421575585, + "grad_norm": 1.6408385038375854, + "learning_rate": 7.722597586941093e-05, + "loss": 0.0038059189915657043, + "step": 160510 + }, + { + "epoch": 22.784953867991483, + "grad_norm": 4.8498454093933105, + "learning_rate": 7.722455642299503e-05, + "loss": 0.039502504467964175, + "step": 160520 + }, + { + "epoch": 22.78637331440738, + 
"grad_norm": 0.5308853387832642, + "learning_rate": 7.722313697657913e-05, + "loss": 0.003257053717970848, + "step": 160530 + }, + { + "epoch": 22.78779276082328, + "grad_norm": 2.2692599296569824, + "learning_rate": 7.722171753016324e-05, + "loss": 0.024766245484352113, + "step": 160540 + }, + { + "epoch": 22.789212207239178, + "grad_norm": 0.3963622748851776, + "learning_rate": 7.722029808374734e-05, + "loss": 0.03809992372989655, + "step": 160550 + }, + { + "epoch": 22.790631653655076, + "grad_norm": 0.16955435276031494, + "learning_rate": 7.721887863733145e-05, + "loss": 0.036906537413597104, + "step": 160560 + }, + { + "epoch": 22.79205110007097, + "grad_norm": 4.779110431671143, + "learning_rate": 7.721745919091555e-05, + "loss": 0.011944958567619323, + "step": 160570 + }, + { + "epoch": 22.79347054648687, + "grad_norm": 0.5937197804450989, + "learning_rate": 7.721603974449964e-05, + "loss": 0.02508275508880615, + "step": 160580 + }, + { + "epoch": 22.794889992902768, + "grad_norm": 0.3013536334037781, + "learning_rate": 7.721462029808375e-05, + "loss": 0.02505006790161133, + "step": 160590 + }, + { + "epoch": 22.796309439318666, + "grad_norm": 0.5222639441490173, + "learning_rate": 7.721320085166785e-05, + "loss": 0.015503109991550445, + "step": 160600 + }, + { + "epoch": 22.797728885734564, + "grad_norm": 10.769311904907227, + "learning_rate": 7.721178140525196e-05, + "loss": 0.01393204927444458, + "step": 160610 + }, + { + "epoch": 22.799148332150462, + "grad_norm": 14.927803039550781, + "learning_rate": 7.721036195883605e-05, + "loss": 0.02686433494091034, + "step": 160620 + }, + { + "epoch": 22.80056777856636, + "grad_norm": 0.31277549266815186, + "learning_rate": 7.720894251242016e-05, + "loss": 0.005339305102825165, + "step": 160630 + }, + { + "epoch": 22.801987224982255, + "grad_norm": 0.7531947493553162, + "learning_rate": 7.720752306600425e-05, + "loss": 0.007539245486259461, + "step": 160640 + }, + { + "epoch": 22.803406671398154, + "grad_norm": 
0.005132125690579414, + "learning_rate": 7.720610361958837e-05, + "loss": 0.026399222016334534, + "step": 160650 + }, + { + "epoch": 22.804826117814052, + "grad_norm": 0.511212944984436, + "learning_rate": 7.720468417317248e-05, + "loss": 0.0097750224173069, + "step": 160660 + }, + { + "epoch": 22.80624556422995, + "grad_norm": 10.932982444763184, + "learning_rate": 7.720326472675656e-05, + "loss": 0.061018633842468264, + "step": 160670 + }, + { + "epoch": 22.80766501064585, + "grad_norm": 0.0058971187099814415, + "learning_rate": 7.720184528034067e-05, + "loss": 0.01053609475493431, + "step": 160680 + }, + { + "epoch": 22.809084457061747, + "grad_norm": 11.420747756958008, + "learning_rate": 7.720042583392477e-05, + "loss": 0.023927433788776396, + "step": 160690 + }, + { + "epoch": 22.810503903477645, + "grad_norm": 3.1432878971099854, + "learning_rate": 7.719900638750888e-05, + "loss": 0.01987508237361908, + "step": 160700 + }, + { + "epoch": 22.81192334989354, + "grad_norm": 5.696770668029785, + "learning_rate": 7.719758694109298e-05, + "loss": 0.02484031617641449, + "step": 160710 + }, + { + "epoch": 22.81334279630944, + "grad_norm": 0.40992504358291626, + "learning_rate": 7.719616749467707e-05, + "loss": 0.008489498496055603, + "step": 160720 + }, + { + "epoch": 22.814762242725337, + "grad_norm": 0.19944070279598236, + "learning_rate": 7.719474804826117e-05, + "loss": 0.017672815918922426, + "step": 160730 + }, + { + "epoch": 22.816181689141235, + "grad_norm": 0.13940846920013428, + "learning_rate": 7.719332860184528e-05, + "loss": 0.006945253163576126, + "step": 160740 + }, + { + "epoch": 22.817601135557133, + "grad_norm": 0.4422439932823181, + "learning_rate": 7.71919091554294e-05, + "loss": 0.027322635054588318, + "step": 160750 + }, + { + "epoch": 22.81902058197303, + "grad_norm": 0.055839937180280685, + "learning_rate": 7.719048970901349e-05, + "loss": 0.022102104127407075, + "step": 160760 + }, + { + "epoch": 22.82044002838893, + "grad_norm": 
0.9106621742248535, + "learning_rate": 7.71890702625976e-05, + "loss": 0.027456504106521607, + "step": 160770 + }, + { + "epoch": 22.821859474804825, + "grad_norm": 0.12872973084449768, + "learning_rate": 7.718765081618169e-05, + "loss": 0.012655019760131836, + "step": 160780 + }, + { + "epoch": 22.823278921220723, + "grad_norm": 2.133991003036499, + "learning_rate": 7.71862313697658e-05, + "loss": 0.028094691038131715, + "step": 160790 + }, + { + "epoch": 22.82469836763662, + "grad_norm": 0.01568033918738365, + "learning_rate": 7.71848119233499e-05, + "loss": 0.01020006239414215, + "step": 160800 + }, + { + "epoch": 22.82611781405252, + "grad_norm": 0.5772684812545776, + "learning_rate": 7.7183392476934e-05, + "loss": 0.009229369461536407, + "step": 160810 + }, + { + "epoch": 22.827537260468418, + "grad_norm": 0.1118927001953125, + "learning_rate": 7.71819730305181e-05, + "loss": 0.007211325317621231, + "step": 160820 + }, + { + "epoch": 22.828956706884316, + "grad_norm": 1.4190748929977417, + "learning_rate": 7.71805535841022e-05, + "loss": 0.008611522614955902, + "step": 160830 + }, + { + "epoch": 22.830376153300215, + "grad_norm": 0.28628435730934143, + "learning_rate": 7.717913413768631e-05, + "loss": 0.03562330007553101, + "step": 160840 + }, + { + "epoch": 22.83179559971611, + "grad_norm": 0.4766266345977783, + "learning_rate": 7.717771469127041e-05, + "loss": 0.056645655632019044, + "step": 160850 + }, + { + "epoch": 22.833215046132008, + "grad_norm": 1.8854352235794067, + "learning_rate": 7.717629524485452e-05, + "loss": 0.039526650309562684, + "step": 160860 + }, + { + "epoch": 22.834634492547906, + "grad_norm": 0.29683372378349304, + "learning_rate": 7.717487579843862e-05, + "loss": 0.028241449594497682, + "step": 160870 + }, + { + "epoch": 22.836053938963804, + "grad_norm": 1.5731925964355469, + "learning_rate": 7.717345635202271e-05, + "loss": 0.005742316320538521, + "step": 160880 + }, + { + "epoch": 22.837473385379703, + "grad_norm": 
7.4947099685668945, + "learning_rate": 7.717203690560681e-05, + "loss": 0.018449863791465758, + "step": 160890 + }, + { + "epoch": 22.8388928317956, + "grad_norm": 12.44045352935791, + "learning_rate": 7.717061745919092e-05, + "loss": 0.01632877588272095, + "step": 160900 + }, + { + "epoch": 22.8403122782115, + "grad_norm": 0.3827916979789734, + "learning_rate": 7.716919801277502e-05, + "loss": 0.008497563004493714, + "step": 160910 + }, + { + "epoch": 22.841731724627394, + "grad_norm": 1.136602520942688, + "learning_rate": 7.716777856635913e-05, + "loss": 0.003709818795323372, + "step": 160920 + }, + { + "epoch": 22.843151171043292, + "grad_norm": 0.6600062847137451, + "learning_rate": 7.716635911994323e-05, + "loss": 0.026395943760871888, + "step": 160930 + }, + { + "epoch": 22.84457061745919, + "grad_norm": 0.07525736838579178, + "learning_rate": 7.716493967352733e-05, + "loss": 0.007931937277317048, + "step": 160940 + }, + { + "epoch": 22.84599006387509, + "grad_norm": 0.017768265679478645, + "learning_rate": 7.716352022711144e-05, + "loss": 0.013794943690299988, + "step": 160950 + }, + { + "epoch": 22.847409510290987, + "grad_norm": 10.234431266784668, + "learning_rate": 7.716210078069553e-05, + "loss": 0.0396212637424469, + "step": 160960 + }, + { + "epoch": 22.848828956706885, + "grad_norm": 0.019011499360203743, + "learning_rate": 7.716068133427964e-05, + "loss": 0.02025754898786545, + "step": 160970 + }, + { + "epoch": 22.850248403122784, + "grad_norm": 0.23502807319164276, + "learning_rate": 7.715926188786373e-05, + "loss": 0.0019734371453523636, + "step": 160980 + }, + { + "epoch": 22.85166784953868, + "grad_norm": 0.2261151671409607, + "learning_rate": 7.715784244144784e-05, + "loss": 0.004238513857126236, + "step": 160990 + }, + { + "epoch": 22.853087295954577, + "grad_norm": 0.6332138180732727, + "learning_rate": 7.715642299503194e-05, + "loss": 0.04215038120746613, + "step": 161000 + }, + { + "epoch": 22.853087295954577, + "eval_accuracy": 
0.987918865645069, + "eval_loss": 0.04423164576292038, + "eval_runtime": 32.5191, + "eval_samples_per_second": 483.623, + "eval_steps_per_second": 15.13, + "step": 161000 + }, + { + "epoch": 22.854506742370475, + "grad_norm": 0.2695649266242981, + "learning_rate": 7.715500354861605e-05, + "loss": 0.01630450189113617, + "step": 161010 + }, + { + "epoch": 22.855926188786373, + "grad_norm": 0.24794624745845795, + "learning_rate": 7.715358410220014e-05, + "loss": 0.024842479825019838, + "step": 161020 + }, + { + "epoch": 22.85734563520227, + "grad_norm": 0.07152845710515976, + "learning_rate": 7.715216465578424e-05, + "loss": 0.014620494842529298, + "step": 161030 + }, + { + "epoch": 22.85876508161817, + "grad_norm": 0.08355807512998581, + "learning_rate": 7.715074520936835e-05, + "loss": 0.01654687374830246, + "step": 161040 + }, + { + "epoch": 22.86018452803407, + "grad_norm": 0.005100735463202, + "learning_rate": 7.714932576295245e-05, + "loss": 0.008904163539409638, + "step": 161050 + }, + { + "epoch": 22.861603974449963, + "grad_norm": 17.39082908630371, + "learning_rate": 7.714790631653656e-05, + "loss": 0.026420870423316957, + "step": 161060 + }, + { + "epoch": 22.86302342086586, + "grad_norm": 0.06105856969952583, + "learning_rate": 7.714648687012066e-05, + "loss": 0.014832265675067902, + "step": 161070 + }, + { + "epoch": 22.86444286728176, + "grad_norm": 0.7735015153884888, + "learning_rate": 7.714506742370477e-05, + "loss": 0.015475049614906311, + "step": 161080 + }, + { + "epoch": 22.865862313697658, + "grad_norm": 3.3862085342407227, + "learning_rate": 7.714364797728885e-05, + "loss": 0.0035722479224205017, + "step": 161090 + }, + { + "epoch": 22.867281760113556, + "grad_norm": 0.3881051242351532, + "learning_rate": 7.714222853087296e-05, + "loss": 0.0014706455171108247, + "step": 161100 + }, + { + "epoch": 22.868701206529455, + "grad_norm": 11.852885246276855, + "learning_rate": 7.714080908445706e-05, + "loss": 0.019852733612060545, + "step": 161110 + }, 
+ { + "epoch": 22.870120652945353, + "grad_norm": 0.03727658838033676, + "learning_rate": 7.713938963804117e-05, + "loss": 0.011802415549755096, + "step": 161120 + }, + { + "epoch": 22.871540099361248, + "grad_norm": 3.470445156097412, + "learning_rate": 7.713797019162527e-05, + "loss": 0.009820803254842757, + "step": 161130 + }, + { + "epoch": 22.872959545777146, + "grad_norm": 0.046736665070056915, + "learning_rate": 7.713655074520937e-05, + "loss": 0.03099876642227173, + "step": 161140 + }, + { + "epoch": 22.874378992193044, + "grad_norm": 0.017452873289585114, + "learning_rate": 7.713513129879348e-05, + "loss": 0.0030446752905845644, + "step": 161150 + }, + { + "epoch": 22.875798438608943, + "grad_norm": 0.43185585737228394, + "learning_rate": 7.713371185237758e-05, + "loss": 0.03341522812843323, + "step": 161160 + }, + { + "epoch": 22.87721788502484, + "grad_norm": 1.2220635414123535, + "learning_rate": 7.713229240596169e-05, + "loss": 0.006367437541484833, + "step": 161170 + }, + { + "epoch": 22.87863733144074, + "grad_norm": 21.658296585083008, + "learning_rate": 7.713087295954578e-05, + "loss": 0.02857770025730133, + "step": 161180 + }, + { + "epoch": 22.880056777856637, + "grad_norm": 3.9350883960723877, + "learning_rate": 7.712945351312988e-05, + "loss": 0.013504944741725922, + "step": 161190 + }, + { + "epoch": 22.881476224272532, + "grad_norm": 0.11397921293973923, + "learning_rate": 7.712803406671398e-05, + "loss": 0.03829925656318665, + "step": 161200 + }, + { + "epoch": 22.88289567068843, + "grad_norm": 0.025861887261271477, + "learning_rate": 7.712661462029809e-05, + "loss": 0.0038186315447092055, + "step": 161210 + }, + { + "epoch": 22.88431511710433, + "grad_norm": 0.03403586149215698, + "learning_rate": 7.712519517388219e-05, + "loss": 0.026607397198677062, + "step": 161220 + }, + { + "epoch": 22.885734563520227, + "grad_norm": 0.7475443482398987, + "learning_rate": 7.71237757274663e-05, + "loss": 0.011042962968349456, + "step": 161230 + }, + { + 
"epoch": 22.887154009936125, + "grad_norm": 1.4000591039657593, + "learning_rate": 7.71223562810504e-05, + "loss": 0.017829933762550355, + "step": 161240 + }, + { + "epoch": 22.888573456352024, + "grad_norm": 14.506952285766602, + "learning_rate": 7.712093683463449e-05, + "loss": 0.022377786040306092, + "step": 161250 + }, + { + "epoch": 22.889992902767922, + "grad_norm": 0.35509392619132996, + "learning_rate": 7.71195173882186e-05, + "loss": 0.014188916981220245, + "step": 161260 + }, + { + "epoch": 22.891412349183817, + "grad_norm": 5.571715831756592, + "learning_rate": 7.711823988644429e-05, + "loss": 0.030032038688659668, + "step": 161270 + }, + { + "epoch": 22.892831795599715, + "grad_norm": 12.064584732055664, + "learning_rate": 7.711682044002839e-05, + "loss": 0.035630783438682555, + "step": 161280 + }, + { + "epoch": 22.894251242015613, + "grad_norm": 3.6158697605133057, + "learning_rate": 7.71154009936125e-05, + "loss": 0.042185822129249574, + "step": 161290 + }, + { + "epoch": 22.89567068843151, + "grad_norm": 0.018797844648361206, + "learning_rate": 7.71139815471966e-05, + "loss": 0.040814349055290224, + "step": 161300 + }, + { + "epoch": 22.89709013484741, + "grad_norm": 0.03753923997282982, + "learning_rate": 7.711256210078069e-05, + "loss": 0.020657944679260253, + "step": 161310 + }, + { + "epoch": 22.89850958126331, + "grad_norm": 0.052408620715141296, + "learning_rate": 7.71111426543648e-05, + "loss": 0.04698234796524048, + "step": 161320 + }, + { + "epoch": 22.899929027679207, + "grad_norm": 4.2034430503845215, + "learning_rate": 7.71097232079489e-05, + "loss": 0.015516915917396545, + "step": 161330 + }, + { + "epoch": 22.9013484740951, + "grad_norm": 0.49068406224250793, + "learning_rate": 7.710830376153301e-05, + "loss": 0.024399620294570924, + "step": 161340 + }, + { + "epoch": 22.902767920511, + "grad_norm": 0.019188227131962776, + "learning_rate": 7.710688431511711e-05, + "loss": 0.033277001976966855, + "step": 161350 + }, + { + "epoch": 
22.904187366926898, + "grad_norm": 6.740253925323486, + "learning_rate": 7.71054648687012e-05, + "loss": 0.01604182571172714, + "step": 161360 + }, + { + "epoch": 22.905606813342796, + "grad_norm": 0.24971330165863037, + "learning_rate": 7.71040454222853e-05, + "loss": 0.037403085827827455, + "step": 161370 + }, + { + "epoch": 22.907026259758695, + "grad_norm": 0.09529177844524384, + "learning_rate": 7.710262597586941e-05, + "loss": 0.02608555555343628, + "step": 161380 + }, + { + "epoch": 22.908445706174593, + "grad_norm": 0.28843408823013306, + "learning_rate": 7.710120652945351e-05, + "loss": 0.04028843939304352, + "step": 161390 + }, + { + "epoch": 22.90986515259049, + "grad_norm": 0.18110573291778564, + "learning_rate": 7.709978708303762e-05, + "loss": 0.03038540482521057, + "step": 161400 + }, + { + "epoch": 22.911284599006386, + "grad_norm": 1.9561200141906738, + "learning_rate": 7.709836763662173e-05, + "loss": 0.041900652647018435, + "step": 161410 + }, + { + "epoch": 22.912704045422284, + "grad_norm": 2.2963578701019287, + "learning_rate": 7.709694819020582e-05, + "loss": 0.016660454869270324, + "step": 161420 + }, + { + "epoch": 22.914123491838183, + "grad_norm": 0.5842369794845581, + "learning_rate": 7.709552874378993e-05, + "loss": 0.01031760573387146, + "step": 161430 + }, + { + "epoch": 22.91554293825408, + "grad_norm": 0.3290294110774994, + "learning_rate": 7.709410929737403e-05, + "loss": 0.003792167827486992, + "step": 161440 + }, + { + "epoch": 22.91696238466998, + "grad_norm": 0.03769886493682861, + "learning_rate": 7.709268985095814e-05, + "loss": 0.03237307965755463, + "step": 161450 + }, + { + "epoch": 22.918381831085878, + "grad_norm": 0.1274748295545578, + "learning_rate": 7.709127040454223e-05, + "loss": 0.027212321758270264, + "step": 161460 + }, + { + "epoch": 22.919801277501776, + "grad_norm": 0.05297774076461792, + "learning_rate": 7.708985095812633e-05, + "loss": 0.01669422537088394, + "step": 161470 + }, + { + "epoch": 
22.92122072391767, + "grad_norm": 0.24771833419799805, + "learning_rate": 7.708843151171043e-05, + "loss": 0.020847928524017335, + "step": 161480 + }, + { + "epoch": 22.92264017033357, + "grad_norm": 0.06459509581327438, + "learning_rate": 7.708701206529454e-05, + "loss": 0.029286113381385804, + "step": 161490 + }, + { + "epoch": 22.924059616749467, + "grad_norm": 1.6945854425430298, + "learning_rate": 7.708559261887865e-05, + "loss": 0.03703280985355377, + "step": 161500 + }, + { + "epoch": 22.924059616749467, + "eval_accuracy": 0.9884275449863292, + "eval_loss": 0.044185709208250046, + "eval_runtime": 32.5495, + "eval_samples_per_second": 483.171, + "eval_steps_per_second": 15.115, + "step": 161500 + }, + { + "epoch": 22.925479063165366, + "grad_norm": 0.19346709549427032, + "learning_rate": 7.708417317246275e-05, + "loss": 0.025829434394836426, + "step": 161510 + }, + { + "epoch": 22.926898509581264, + "grad_norm": 0.10766290873289108, + "learning_rate": 7.708275372604685e-05, + "loss": 0.021949516236782075, + "step": 161520 + }, + { + "epoch": 22.928317955997162, + "grad_norm": 0.030968334525823593, + "learning_rate": 7.708133427963094e-05, + "loss": 0.021751122176647188, + "step": 161530 + }, + { + "epoch": 22.92973740241306, + "grad_norm": 0.5014197826385498, + "learning_rate": 7.707991483321505e-05, + "loss": 0.019297340512275697, + "step": 161540 + }, + { + "epoch": 22.931156848828955, + "grad_norm": 0.19232438504695892, + "learning_rate": 7.707849538679915e-05, + "loss": 0.003909531608223915, + "step": 161550 + }, + { + "epoch": 22.932576295244854, + "grad_norm": 0.5488612055778503, + "learning_rate": 7.707707594038326e-05, + "loss": 0.02149796187877655, + "step": 161560 + }, + { + "epoch": 22.933995741660752, + "grad_norm": 13.509936332702637, + "learning_rate": 7.707565649396735e-05, + "loss": 0.019596312940120698, + "step": 161570 + }, + { + "epoch": 22.93541518807665, + "grad_norm": 0.9531795382499695, + "learning_rate": 7.707423704755146e-05, + 
"loss": 0.011289587616920472, + "step": 161580 + }, + { + "epoch": 22.93683463449255, + "grad_norm": 0.09487307816743851, + "learning_rate": 7.707281760113555e-05, + "loss": 0.02423248291015625, + "step": 161590 + }, + { + "epoch": 22.938254080908447, + "grad_norm": 7.086032867431641, + "learning_rate": 7.707139815471966e-05, + "loss": 0.012223917245864867, + "step": 161600 + }, + { + "epoch": 22.939673527324345, + "grad_norm": 0.203185573220253, + "learning_rate": 7.706997870830378e-05, + "loss": 0.025282692909240723, + "step": 161610 + }, + { + "epoch": 22.94109297374024, + "grad_norm": 0.01871105097234249, + "learning_rate": 7.706855926188786e-05, + "loss": 0.007927915453910828, + "step": 161620 + }, + { + "epoch": 22.942512420156138, + "grad_norm": 0.09195920825004578, + "learning_rate": 7.706713981547197e-05, + "loss": 0.016610655188560485, + "step": 161630 + }, + { + "epoch": 22.943931866572036, + "grad_norm": 0.6392912864685059, + "learning_rate": 7.706572036905607e-05, + "loss": 0.03601295650005341, + "step": 161640 + }, + { + "epoch": 22.945351312987935, + "grad_norm": 5.183906555175781, + "learning_rate": 7.706430092264018e-05, + "loss": 0.015701791644096373, + "step": 161650 + }, + { + "epoch": 22.946770759403833, + "grad_norm": 0.01105321105569601, + "learning_rate": 7.706288147622428e-05, + "loss": 0.017326465249061583, + "step": 161660 + }, + { + "epoch": 22.94819020581973, + "grad_norm": 0.05637863278388977, + "learning_rate": 7.706146202980837e-05, + "loss": 0.012118382751941681, + "step": 161670 + }, + { + "epoch": 22.94960965223563, + "grad_norm": 3.900541305541992, + "learning_rate": 7.706004258339247e-05, + "loss": 0.0038341082632541656, + "step": 161680 + }, + { + "epoch": 22.951029098651524, + "grad_norm": 13.3785400390625, + "learning_rate": 7.705862313697658e-05, + "loss": 0.024840718507766722, + "step": 161690 + }, + { + "epoch": 22.952448545067423, + "grad_norm": 2.403290271759033, + "learning_rate": 7.705720369056069e-05, + "loss": 
0.012386296689510346, + "step": 161700 + }, + { + "epoch": 22.95386799148332, + "grad_norm": 1.8122022151947021, + "learning_rate": 7.705578424414479e-05, + "loss": 0.010960541665554047, + "step": 161710 + }, + { + "epoch": 22.95528743789922, + "grad_norm": 0.015222480520606041, + "learning_rate": 7.705436479772889e-05, + "loss": 0.0024630904197692873, + "step": 161720 + }, + { + "epoch": 22.956706884315118, + "grad_norm": 0.721798837184906, + "learning_rate": 7.705294535131298e-05, + "loss": 0.0015426710247993468, + "step": 161730 + }, + { + "epoch": 22.958126330731016, + "grad_norm": 0.014370634220540524, + "learning_rate": 7.70515259048971e-05, + "loss": 0.023647612333297728, + "step": 161740 + }, + { + "epoch": 22.959545777146914, + "grad_norm": 0.7748165130615234, + "learning_rate": 7.70501064584812e-05, + "loss": 0.0206304207444191, + "step": 161750 + }, + { + "epoch": 22.96096522356281, + "grad_norm": 0.21551628410816193, + "learning_rate": 7.70486870120653e-05, + "loss": 0.00360206738114357, + "step": 161760 + }, + { + "epoch": 22.962384669978707, + "grad_norm": 9.349844932556152, + "learning_rate": 7.70472675656494e-05, + "loss": 0.053832334280014035, + "step": 161770 + }, + { + "epoch": 22.963804116394606, + "grad_norm": 0.005291808396577835, + "learning_rate": 7.70458481192335e-05, + "loss": 0.03611416220664978, + "step": 161780 + }, + { + "epoch": 22.965223562810504, + "grad_norm": 0.043515536934137344, + "learning_rate": 7.704442867281761e-05, + "loss": 0.003078308328986168, + "step": 161790 + }, + { + "epoch": 22.966643009226402, + "grad_norm": 6.0235490798950195, + "learning_rate": 7.704300922640171e-05, + "loss": 0.00577174685895443, + "step": 161800 + }, + { + "epoch": 22.9680624556423, + "grad_norm": 0.5617413520812988, + "learning_rate": 7.704158977998582e-05, + "loss": 0.009858866035938264, + "step": 161810 + }, + { + "epoch": 22.9694819020582, + "grad_norm": 0.09254857152700424, + "learning_rate": 7.704017033356992e-05, + "loss": 
0.004753181338310241, + "step": 161820 + }, + { + "epoch": 22.970901348474094, + "grad_norm": 0.03090120293200016, + "learning_rate": 7.703875088715401e-05, + "loss": 0.02552323341369629, + "step": 161830 + }, + { + "epoch": 22.972320794889992, + "grad_norm": 0.12373021990060806, + "learning_rate": 7.703733144073811e-05, + "loss": 0.019504737854003907, + "step": 161840 + }, + { + "epoch": 22.97374024130589, + "grad_norm": 0.17300604283809662, + "learning_rate": 7.703591199432222e-05, + "loss": 0.015018416941165924, + "step": 161850 + }, + { + "epoch": 22.97515968772179, + "grad_norm": 10.098119735717773, + "learning_rate": 7.703449254790632e-05, + "loss": 0.03669427633285523, + "step": 161860 + }, + { + "epoch": 22.976579134137687, + "grad_norm": 0.07114504277706146, + "learning_rate": 7.703307310149043e-05, + "loss": 0.0029472272843122484, + "step": 161870 + }, + { + "epoch": 22.977998580553585, + "grad_norm": 1.0454742908477783, + "learning_rate": 7.703165365507453e-05, + "loss": 0.03925280570983887, + "step": 161880 + }, + { + "epoch": 22.979418026969483, + "grad_norm": 0.008811729960143566, + "learning_rate": 7.703023420865862e-05, + "loss": 0.04343777596950531, + "step": 161890 + }, + { + "epoch": 22.980837473385378, + "grad_norm": 7.293917179107666, + "learning_rate": 7.702881476224274e-05, + "loss": 0.05511009097099304, + "step": 161900 + }, + { + "epoch": 22.982256919801276, + "grad_norm": 4.575845718383789, + "learning_rate": 7.702739531582683e-05, + "loss": 0.008168426901102066, + "step": 161910 + }, + { + "epoch": 22.983676366217175, + "grad_norm": 0.12320123612880707, + "learning_rate": 7.702597586941094e-05, + "loss": 0.021623848378658293, + "step": 161920 + }, + { + "epoch": 22.985095812633073, + "grad_norm": 0.02295038104057312, + "learning_rate": 7.702455642299503e-05, + "loss": 0.051585721969604495, + "step": 161930 + }, + { + "epoch": 22.98651525904897, + "grad_norm": 1.7926585674285889, + "learning_rate": 7.702313697657914e-05, + "loss": 
0.00455988422036171, + "step": 161940 + }, + { + "epoch": 22.98793470546487, + "grad_norm": 0.5792025923728943, + "learning_rate": 7.702171753016324e-05, + "loss": 0.0050077404826879505, + "step": 161950 + }, + { + "epoch": 22.989354151880768, + "grad_norm": 0.34394052624702454, + "learning_rate": 7.702029808374735e-05, + "loss": 0.014922839403152467, + "step": 161960 + }, + { + "epoch": 22.990773598296663, + "grad_norm": 0.13150949776172638, + "learning_rate": 7.701887863733144e-05, + "loss": 0.023162418603897096, + "step": 161970 + }, + { + "epoch": 22.99219304471256, + "grad_norm": 5.206557273864746, + "learning_rate": 7.701745919091554e-05, + "loss": 0.036036941409111026, + "step": 161980 + }, + { + "epoch": 22.99361249112846, + "grad_norm": 0.6861659288406372, + "learning_rate": 7.701603974449965e-05, + "loss": 0.0187500923871994, + "step": 161990 + }, + { + "epoch": 22.995031937544358, + "grad_norm": 0.07208126038312912, + "learning_rate": 7.701462029808375e-05, + "loss": 0.005242104828357697, + "step": 162000 + }, + { + "epoch": 22.995031937544358, + "eval_accuracy": 0.9890633941629046, + "eval_loss": 0.04525516927242279, + "eval_runtime": 31.4712, + "eval_samples_per_second": 499.727, + "eval_steps_per_second": 15.633, + "step": 162000 + }, + { + "epoch": 22.996451383960256, + "grad_norm": 1.5645928382873535, + "learning_rate": 7.701320085166786e-05, + "loss": 0.015884467959403993, + "step": 162010 + }, + { + "epoch": 22.997870830376154, + "grad_norm": 14.743457794189453, + "learning_rate": 7.701178140525196e-05, + "loss": 0.012643532454967498, + "step": 162020 + }, + { + "epoch": 22.999290276792053, + "grad_norm": 0.4143930971622467, + "learning_rate": 7.701036195883606e-05, + "loss": 0.015752191841602325, + "step": 162030 + }, + { + "epoch": 23.000709723207947, + "grad_norm": 0.004221033304929733, + "learning_rate": 7.700894251242015e-05, + "loss": 0.004766692221164703, + "step": 162040 + }, + { + "epoch": 23.002129169623846, + "grad_norm": 
0.01289429608732462, + "learning_rate": 7.700752306600426e-05, + "loss": 0.005274784937500954, + "step": 162050 + }, + { + "epoch": 23.003548616039744, + "grad_norm": 0.012014439329504967, + "learning_rate": 7.700610361958836e-05, + "loss": 0.0007916856557130814, + "step": 162060 + }, + { + "epoch": 23.004968062455642, + "grad_norm": 7.144986629486084, + "learning_rate": 7.700468417317247e-05, + "loss": 0.006299991905689239, + "step": 162070 + }, + { + "epoch": 23.00638750887154, + "grad_norm": 0.02035887911915779, + "learning_rate": 7.700326472675657e-05, + "loss": 0.0017514165490865707, + "step": 162080 + }, + { + "epoch": 23.00780695528744, + "grad_norm": 11.122802734375, + "learning_rate": 7.700184528034067e-05, + "loss": 0.01793658435344696, + "step": 162090 + }, + { + "epoch": 23.009226401703337, + "grad_norm": 0.08542089909315109, + "learning_rate": 7.700042583392478e-05, + "loss": 0.015630367398262023, + "step": 162100 + }, + { + "epoch": 23.010645848119232, + "grad_norm": 10.555177688598633, + "learning_rate": 7.699900638750887e-05, + "loss": 0.04705832302570343, + "step": 162110 + }, + { + "epoch": 23.01206529453513, + "grad_norm": 0.032844752073287964, + "learning_rate": 7.699758694109299e-05, + "loss": 0.008681935071945191, + "step": 162120 + }, + { + "epoch": 23.01348474095103, + "grad_norm": 3.011885404586792, + "learning_rate": 7.699616749467708e-05, + "loss": 0.019360101222991942, + "step": 162130 + }, + { + "epoch": 23.014904187366927, + "grad_norm": 0.04727676510810852, + "learning_rate": 7.699474804826118e-05, + "loss": 0.004417017847299576, + "step": 162140 + }, + { + "epoch": 23.016323633782825, + "grad_norm": 0.060292523354291916, + "learning_rate": 7.699332860184528e-05, + "loss": 0.018852721154689788, + "step": 162150 + }, + { + "epoch": 23.017743080198724, + "grad_norm": 1.6079437732696533, + "learning_rate": 7.699190915542939e-05, + "loss": 0.027467569708824156, + "step": 162160 + }, + { + "epoch": 23.019162526614622, + "grad_norm": 
0.8061238527297974, + "learning_rate": 7.699048970901349e-05, + "loss": 0.014040300250053405, + "step": 162170 + }, + { + "epoch": 23.020581973030517, + "grad_norm": 0.08499953150749207, + "learning_rate": 7.69890702625976e-05, + "loss": 0.00402054451406002, + "step": 162180 + }, + { + "epoch": 23.022001419446415, + "grad_norm": 0.07407218962907791, + "learning_rate": 7.69876508161817e-05, + "loss": 0.0020604208111763, + "step": 162190 + }, + { + "epoch": 23.023420865862313, + "grad_norm": 7.858417987823486, + "learning_rate": 7.698623136976579e-05, + "loss": 0.005858126282691956, + "step": 162200 + }, + { + "epoch": 23.02484031227821, + "grad_norm": 0.41011783480644226, + "learning_rate": 7.69848119233499e-05, + "loss": 0.002301470935344696, + "step": 162210 + }, + { + "epoch": 23.02625975869411, + "grad_norm": 0.031270187348127365, + "learning_rate": 7.6983392476934e-05, + "loss": 0.005888228118419647, + "step": 162220 + }, + { + "epoch": 23.027679205110008, + "grad_norm": 3.0655300617218018, + "learning_rate": 7.698197303051811e-05, + "loss": 0.003980183228850365, + "step": 162230 + }, + { + "epoch": 23.029098651525906, + "grad_norm": 0.004664251580834389, + "learning_rate": 7.69805535841022e-05, + "loss": 0.011663904786109925, + "step": 162240 + }, + { + "epoch": 23.0305180979418, + "grad_norm": 0.138235405087471, + "learning_rate": 7.69791341376863e-05, + "loss": 0.0008032497018575669, + "step": 162250 + }, + { + "epoch": 23.0319375443577, + "grad_norm": 2.050046682357788, + "learning_rate": 7.69777146912704e-05, + "loss": 0.005337257310748101, + "step": 162260 + }, + { + "epoch": 23.033356990773598, + "grad_norm": 0.05537521839141846, + "learning_rate": 7.697629524485451e-05, + "loss": 0.0019353412091732026, + "step": 162270 + }, + { + "epoch": 23.034776437189496, + "grad_norm": 0.017545649781823158, + "learning_rate": 7.697487579843861e-05, + "loss": 0.02282845228910446, + "step": 162280 + }, + { + "epoch": 23.036195883605394, + "grad_norm": 
0.003601963398978114, + "learning_rate": 7.697345635202271e-05, + "loss": 0.03403696715831757, + "step": 162290 + }, + { + "epoch": 23.037615330021293, + "grad_norm": 20.0732421875, + "learning_rate": 7.697203690560682e-05, + "loss": 0.05239020586013794, + "step": 162300 + }, + { + "epoch": 23.03903477643719, + "grad_norm": 0.022700130939483643, + "learning_rate": 7.697061745919092e-05, + "loss": 0.02013615667819977, + "step": 162310 + }, + { + "epoch": 23.040454222853086, + "grad_norm": 0.4408855140209198, + "learning_rate": 7.696919801277503e-05, + "loss": 0.012053713202476501, + "step": 162320 + }, + { + "epoch": 23.041873669268984, + "grad_norm": 3.9482064247131348, + "learning_rate": 7.696777856635913e-05, + "loss": 0.0048280622810125354, + "step": 162330 + }, + { + "epoch": 23.043293115684882, + "grad_norm": 0.06381677836179733, + "learning_rate": 7.696635911994322e-05, + "loss": 0.02394731342792511, + "step": 162340 + }, + { + "epoch": 23.04471256210078, + "grad_norm": 0.014467685483396053, + "learning_rate": 7.696493967352732e-05, + "loss": 0.03404964804649353, + "step": 162350 + }, + { + "epoch": 23.04613200851668, + "grad_norm": 0.5630199909210205, + "learning_rate": 7.696352022711143e-05, + "loss": 0.02165971100330353, + "step": 162360 + }, + { + "epoch": 23.047551454932577, + "grad_norm": 0.054986339062452316, + "learning_rate": 7.696210078069553e-05, + "loss": 0.0033312324434518815, + "step": 162370 + }, + { + "epoch": 23.048970901348476, + "grad_norm": 0.04998296871781349, + "learning_rate": 7.696068133427964e-05, + "loss": 0.024840201437473296, + "step": 162380 + }, + { + "epoch": 23.05039034776437, + "grad_norm": 2.9335968494415283, + "learning_rate": 7.695926188786374e-05, + "loss": 0.02612728774547577, + "step": 162390 + }, + { + "epoch": 23.05180979418027, + "grad_norm": 0.08934736996889114, + "learning_rate": 7.695784244144783e-05, + "loss": 0.01079886332154274, + "step": 162400 + }, + { + "epoch": 23.053229240596167, + "grad_norm": 
1.4009144306182861, + "learning_rate": 7.695642299503195e-05, + "loss": 0.012840324640274048, + "step": 162410 + }, + { + "epoch": 23.054648687012065, + "grad_norm": 5.246031284332275, + "learning_rate": 7.695500354861604e-05, + "loss": 0.020674696564674376, + "step": 162420 + }, + { + "epoch": 23.056068133427964, + "grad_norm": 1.0353832244873047, + "learning_rate": 7.695358410220015e-05, + "loss": 0.032555806636810306, + "step": 162430 + }, + { + "epoch": 23.057487579843862, + "grad_norm": 22.06109046936035, + "learning_rate": 7.695216465578424e-05, + "loss": 0.02898041307926178, + "step": 162440 + }, + { + "epoch": 23.05890702625976, + "grad_norm": 0.00831094104796648, + "learning_rate": 7.695074520936835e-05, + "loss": 0.0040160447359085085, + "step": 162450 + }, + { + "epoch": 23.060326472675655, + "grad_norm": 0.12358088791370392, + "learning_rate": 7.694932576295245e-05, + "loss": 0.007175853848457337, + "step": 162460 + }, + { + "epoch": 23.061745919091553, + "grad_norm": 0.3178024888038635, + "learning_rate": 7.694790631653656e-05, + "loss": 0.005930808559060097, + "step": 162470 + }, + { + "epoch": 23.06316536550745, + "grad_norm": 1.1642258167266846, + "learning_rate": 7.694648687012065e-05, + "loss": 0.047300207614898684, + "step": 162480 + }, + { + "epoch": 23.06458481192335, + "grad_norm": 2.028388023376465, + "learning_rate": 7.694506742370476e-05, + "loss": 0.05117456912994385, + "step": 162490 + }, + { + "epoch": 23.066004258339248, + "grad_norm": 0.44468292593955994, + "learning_rate": 7.694364797728886e-05, + "loss": 0.02594589293003082, + "step": 162500 + }, + { + "epoch": 23.066004258339248, + "eval_accuracy": 0.9844852800915623, + "eval_loss": 0.06441562622785568, + "eval_runtime": 31.8142, + "eval_samples_per_second": 494.339, + "eval_steps_per_second": 15.465, + "step": 162500 + }, + { + "epoch": 23.067423704755146, + "grad_norm": 7.526379108428955, + "learning_rate": 7.694222853087296e-05, + "loss": 0.016502782702445984, + "step": 162510 + 
}, + { + "epoch": 23.068843151171045, + "grad_norm": 0.027306029573082924, + "learning_rate": 7.694080908445707e-05, + "loss": 0.029854172468185426, + "step": 162520 + }, + { + "epoch": 23.07026259758694, + "grad_norm": 0.06151280924677849, + "learning_rate": 7.693938963804117e-05, + "loss": 0.02120264172554016, + "step": 162530 + }, + { + "epoch": 23.071682044002838, + "grad_norm": 0.17482632398605347, + "learning_rate": 7.693797019162528e-05, + "loss": 0.02518077790737152, + "step": 162540 + }, + { + "epoch": 23.073101490418736, + "grad_norm": 0.005407204385846853, + "learning_rate": 7.693655074520936e-05, + "loss": 0.03239222466945648, + "step": 162550 + }, + { + "epoch": 23.074520936834634, + "grad_norm": 9.93742847442627, + "learning_rate": 7.693513129879347e-05, + "loss": 0.01886531859636307, + "step": 162560 + }, + { + "epoch": 23.075940383250533, + "grad_norm": 0.013357577845454216, + "learning_rate": 7.693371185237757e-05, + "loss": 0.02723831832408905, + "step": 162570 + }, + { + "epoch": 23.07735982966643, + "grad_norm": 0.01795533485710621, + "learning_rate": 7.693229240596168e-05, + "loss": 0.006658503413200378, + "step": 162580 + }, + { + "epoch": 23.07877927608233, + "grad_norm": 0.34461337327957153, + "learning_rate": 7.693087295954578e-05, + "loss": 0.03218346834182739, + "step": 162590 + }, + { + "epoch": 23.080198722498224, + "grad_norm": 0.15731771290302277, + "learning_rate": 7.692945351312988e-05, + "loss": 0.007884921133518219, + "step": 162600 + }, + { + "epoch": 23.081618168914122, + "grad_norm": 0.16855670511722565, + "learning_rate": 7.692803406671399e-05, + "loss": 0.020282313227653503, + "step": 162610 + }, + { + "epoch": 23.08303761533002, + "grad_norm": 0.1348881870508194, + "learning_rate": 7.692661462029809e-05, + "loss": 0.010228237509727478, + "step": 162620 + }, + { + "epoch": 23.08445706174592, + "grad_norm": 7.965588569641113, + "learning_rate": 7.69251951738822e-05, + "loss": 0.07130510807037353, + "step": 162630 + }, + { + 
"epoch": 23.085876508161817, + "grad_norm": 1.8469702005386353, + "learning_rate": 7.69237757274663e-05, + "loss": 0.009843124449253083, + "step": 162640 + }, + { + "epoch": 23.087295954577716, + "grad_norm": 0.20647244155406952, + "learning_rate": 7.692235628105039e-05, + "loss": 0.0038670334964990617, + "step": 162650 + }, + { + "epoch": 23.088715400993614, + "grad_norm": 22.169654846191406, + "learning_rate": 7.692093683463449e-05, + "loss": 0.029027491807937622, + "step": 162660 + }, + { + "epoch": 23.09013484740951, + "grad_norm": 0.011130492202937603, + "learning_rate": 7.69195173882186e-05, + "loss": 0.008509107679128648, + "step": 162670 + }, + { + "epoch": 23.091554293825407, + "grad_norm": 0.01648932509124279, + "learning_rate": 7.69180979418027e-05, + "loss": 0.009814728796482087, + "step": 162680 + }, + { + "epoch": 23.092973740241305, + "grad_norm": 7.4081244468688965, + "learning_rate": 7.691667849538681e-05, + "loss": 0.015613070130348206, + "step": 162690 + }, + { + "epoch": 23.094393186657204, + "grad_norm": 0.009029856882989407, + "learning_rate": 7.69152590489709e-05, + "loss": 0.0029690850526094435, + "step": 162700 + }, + { + "epoch": 23.095812633073102, + "grad_norm": 5.916153430938721, + "learning_rate": 7.6913839602555e-05, + "loss": 0.007691198587417602, + "step": 162710 + }, + { + "epoch": 23.097232079489, + "grad_norm": 4.533042907714844, + "learning_rate": 7.691242015613911e-05, + "loss": 0.010851771384477616, + "step": 162720 + }, + { + "epoch": 23.0986515259049, + "grad_norm": 0.02013479173183441, + "learning_rate": 7.691100070972321e-05, + "loss": 0.01569564938545227, + "step": 162730 + }, + { + "epoch": 23.100070972320793, + "grad_norm": 0.04012364149093628, + "learning_rate": 7.690958126330732e-05, + "loss": 0.007517506182193756, + "step": 162740 + }, + { + "epoch": 23.10149041873669, + "grad_norm": 0.13479460775852203, + "learning_rate": 7.69081618168914e-05, + "loss": 0.004010143503546715, + "step": 162750 + }, + { + "epoch": 
23.10290986515259, + "grad_norm": 0.013275690376758575, + "learning_rate": 7.690674237047552e-05, + "loss": 0.0025931593030691148, + "step": 162760 + }, + { + "epoch": 23.10432931156849, + "grad_norm": 1.252179741859436, + "learning_rate": 7.690532292405961e-05, + "loss": 0.01159205734729767, + "step": 162770 + }, + { + "epoch": 23.105748757984387, + "grad_norm": 0.39506641030311584, + "learning_rate": 7.690390347764372e-05, + "loss": 0.0015432383865118028, + "step": 162780 + }, + { + "epoch": 23.107168204400285, + "grad_norm": 0.22367994487285614, + "learning_rate": 7.690248403122782e-05, + "loss": 0.007768946886062622, + "step": 162790 + }, + { + "epoch": 23.108587650816183, + "grad_norm": 0.037619736045598984, + "learning_rate": 7.690106458481192e-05, + "loss": 0.008029165863990783, + "step": 162800 + }, + { + "epoch": 23.110007097232078, + "grad_norm": 0.030249066650867462, + "learning_rate": 7.689964513839603e-05, + "loss": 0.008502288162708283, + "step": 162810 + }, + { + "epoch": 23.111426543647976, + "grad_norm": 0.26394662261009216, + "learning_rate": 7.689822569198013e-05, + "loss": 0.011885827779769898, + "step": 162820 + }, + { + "epoch": 23.112845990063875, + "grad_norm": 0.060594383627176285, + "learning_rate": 7.689680624556424e-05, + "loss": 0.01973457485437393, + "step": 162830 + }, + { + "epoch": 23.114265436479773, + "grad_norm": 0.44695061445236206, + "learning_rate": 7.689538679914834e-05, + "loss": 0.021900515258312225, + "step": 162840 + }, + { + "epoch": 23.11568488289567, + "grad_norm": 0.07750909775495529, + "learning_rate": 7.689396735273245e-05, + "loss": 0.004007786512374878, + "step": 162850 + }, + { + "epoch": 23.11710432931157, + "grad_norm": 0.0719253346323967, + "learning_rate": 7.689254790631653e-05, + "loss": 0.002799773961305618, + "step": 162860 + }, + { + "epoch": 23.118523775727468, + "grad_norm": 0.26183587312698364, + "learning_rate": 7.689112845990064e-05, + "loss": 0.010252837836742402, + "step": 162870 + }, + { + 
"epoch": 23.119943222143363, + "grad_norm": 4.122483730316162, + "learning_rate": 7.688970901348474e-05, + "loss": 0.00818869099020958, + "step": 162880 + }, + { + "epoch": 23.12136266855926, + "grad_norm": 0.04787770286202431, + "learning_rate": 7.688828956706885e-05, + "loss": 0.008643164485692977, + "step": 162890 + }, + { + "epoch": 23.12278211497516, + "grad_norm": 4.087941646575928, + "learning_rate": 7.688687012065296e-05, + "loss": 0.02280993163585663, + "step": 162900 + }, + { + "epoch": 23.124201561391057, + "grad_norm": 10.27031135559082, + "learning_rate": 7.688545067423704e-05, + "loss": 0.02956539988517761, + "step": 162910 + }, + { + "epoch": 23.125621007806956, + "grad_norm": 4.465502738952637, + "learning_rate": 7.688403122782116e-05, + "loss": 0.036733183264732364, + "step": 162920 + }, + { + "epoch": 23.127040454222854, + "grad_norm": 0.08674945682287216, + "learning_rate": 7.688261178140525e-05, + "loss": 0.007174272835254669, + "step": 162930 + }, + { + "epoch": 23.128459900638752, + "grad_norm": 0.015028636902570724, + "learning_rate": 7.688119233498936e-05, + "loss": 0.06274649500846863, + "step": 162940 + }, + { + "epoch": 23.129879347054647, + "grad_norm": 0.07752390950918198, + "learning_rate": 7.687977288857346e-05, + "loss": 0.017393530905246736, + "step": 162950 + }, + { + "epoch": 23.131298793470545, + "grad_norm": 0.5470381379127502, + "learning_rate": 7.687835344215756e-05, + "loss": 0.00829293429851532, + "step": 162960 + }, + { + "epoch": 23.132718239886444, + "grad_norm": 0.1407025307416916, + "learning_rate": 7.687693399574166e-05, + "loss": 0.0008098684251308441, + "step": 162970 + }, + { + "epoch": 23.134137686302342, + "grad_norm": 0.006651765666902065, + "learning_rate": 7.687551454932577e-05, + "loss": 0.0172958642244339, + "step": 162980 + }, + { + "epoch": 23.13555713271824, + "grad_norm": 4.5103583335876465, + "learning_rate": 7.687409510290988e-05, + "loss": 0.010490042716264724, + "step": 162990 + }, + { + "epoch": 
23.13697657913414, + "grad_norm": 0.8626647591590881, + "learning_rate": 7.687267565649398e-05, + "loss": 0.0034983612596988676, + "step": 163000 + }, + { + "epoch": 23.13697657913414, + "eval_accuracy": 0.9889362243275895, + "eval_loss": 0.038272157311439514, + "eval_runtime": 31.3147, + "eval_samples_per_second": 502.223, + "eval_steps_per_second": 15.711, + "step": 163000 + }, + { + "epoch": 23.138396025550037, + "grad_norm": 0.050362344831228256, + "learning_rate": 7.687125621007807e-05, + "loss": 0.002429485321044922, + "step": 163010 + }, + { + "epoch": 23.13981547196593, + "grad_norm": 11.521379470825195, + "learning_rate": 7.686983676366217e-05, + "loss": 0.019295407831668852, + "step": 163020 + }, + { + "epoch": 23.14123491838183, + "grad_norm": 0.7864540815353394, + "learning_rate": 7.686841731724628e-05, + "loss": 0.01803356856107712, + "step": 163030 + }, + { + "epoch": 23.14265436479773, + "grad_norm": 0.008088598027825356, + "learning_rate": 7.686699787083038e-05, + "loss": 0.0020820554345846176, + "step": 163040 + }, + { + "epoch": 23.144073811213627, + "grad_norm": 0.10877585411071777, + "learning_rate": 7.686557842441449e-05, + "loss": 0.014271177351474762, + "step": 163050 + }, + { + "epoch": 23.145493257629525, + "grad_norm": 0.3781072795391083, + "learning_rate": 7.686415897799857e-05, + "loss": 0.01383303701877594, + "step": 163060 + }, + { + "epoch": 23.146912704045423, + "grad_norm": 0.019399583339691162, + "learning_rate": 7.686273953158268e-05, + "loss": 0.011244687438011169, + "step": 163070 + }, + { + "epoch": 23.14833215046132, + "grad_norm": 0.030331028625369072, + "learning_rate": 7.68613200851668e-05, + "loss": 0.007428871095180511, + "step": 163080 + }, + { + "epoch": 23.149751596877216, + "grad_norm": 0.07057445496320724, + "learning_rate": 7.685990063875089e-05, + "loss": 0.003156041353940964, + "step": 163090 + }, + { + "epoch": 23.151171043293115, + "grad_norm": 0.277256041765213, + "learning_rate": 7.6858481192335e-05, + "loss": 
0.01384088397026062, + "step": 163100 + }, + { + "epoch": 23.152590489709013, + "grad_norm": 0.025087472051382065, + "learning_rate": 7.685706174591909e-05, + "loss": 0.053059864044189456, + "step": 163110 + }, + { + "epoch": 23.15400993612491, + "grad_norm": 1.3452047109603882, + "learning_rate": 7.68556422995032e-05, + "loss": 0.013987116515636444, + "step": 163120 + }, + { + "epoch": 23.15542938254081, + "grad_norm": 0.04178168624639511, + "learning_rate": 7.68542228530873e-05, + "loss": 0.030617043375968933, + "step": 163130 + }, + { + "epoch": 23.156848828956708, + "grad_norm": 0.3115779161453247, + "learning_rate": 7.68528034066714e-05, + "loss": 0.006028669700026512, + "step": 163140 + }, + { + "epoch": 23.158268275372606, + "grad_norm": 0.0654100626707077, + "learning_rate": 7.68513839602555e-05, + "loss": 0.005169058963656425, + "step": 163150 + }, + { + "epoch": 23.1596877217885, + "grad_norm": 0.36675235629081726, + "learning_rate": 7.68499645138396e-05, + "loss": 0.005789804458618164, + "step": 163160 + }, + { + "epoch": 23.1611071682044, + "grad_norm": 0.10254102945327759, + "learning_rate": 7.684854506742371e-05, + "loss": 0.012728314101696014, + "step": 163170 + }, + { + "epoch": 23.162526614620297, + "grad_norm": 0.05589598789811134, + "learning_rate": 7.684712562100781e-05, + "loss": 0.009516747295856475, + "step": 163180 + }, + { + "epoch": 23.163946061036196, + "grad_norm": 9.000589370727539, + "learning_rate": 7.684570617459192e-05, + "loss": 0.049416130781173705, + "step": 163190 + }, + { + "epoch": 23.165365507452094, + "grad_norm": 15.124249458312988, + "learning_rate": 7.684428672817602e-05, + "loss": 0.03094829022884369, + "step": 163200 + }, + { + "epoch": 23.166784953867992, + "grad_norm": 1.535739779472351, + "learning_rate": 7.684286728176013e-05, + "loss": 0.03364900648593903, + "step": 163210 + }, + { + "epoch": 23.16820440028389, + "grad_norm": 0.06094123423099518, + "learning_rate": 7.684144783534421e-05, + "loss": 
0.022162599861621855, + "step": 163220 + }, + { + "epoch": 23.169623846699785, + "grad_norm": 11.120777130126953, + "learning_rate": 7.684002838892832e-05, + "loss": 0.009362877905368805, + "step": 163230 + }, + { + "epoch": 23.171043293115684, + "grad_norm": 10.148634910583496, + "learning_rate": 7.683860894251242e-05, + "loss": 0.026524320244789124, + "step": 163240 + }, + { + "epoch": 23.172462739531582, + "grad_norm": 0.2080085426568985, + "learning_rate": 7.683718949609653e-05, + "loss": 0.025483694672584534, + "step": 163250 + }, + { + "epoch": 23.17388218594748, + "grad_norm": 1.2418477535247803, + "learning_rate": 7.683577004968063e-05, + "loss": 0.00922916829586029, + "step": 163260 + }, + { + "epoch": 23.17530163236338, + "grad_norm": 9.026187896728516, + "learning_rate": 7.683435060326473e-05, + "loss": 0.017989969253540038, + "step": 163270 + }, + { + "epoch": 23.176721078779277, + "grad_norm": 0.28982868790626526, + "learning_rate": 7.683293115684884e-05, + "loss": 0.02058362364768982, + "step": 163280 + }, + { + "epoch": 23.178140525195175, + "grad_norm": 0.10903431475162506, + "learning_rate": 7.683151171043293e-05, + "loss": 0.018497779965400696, + "step": 163290 + }, + { + "epoch": 23.17955997161107, + "grad_norm": 4.881477355957031, + "learning_rate": 7.683009226401705e-05, + "loss": 0.00865916907787323, + "step": 163300 + }, + { + "epoch": 23.18097941802697, + "grad_norm": 0.2669900059700012, + "learning_rate": 7.682867281760114e-05, + "loss": 0.02375580221414566, + "step": 163310 + }, + { + "epoch": 23.182398864442867, + "grad_norm": 0.11022119224071503, + "learning_rate": 7.682739531582683e-05, + "loss": 0.083653324842453, + "step": 163320 + }, + { + "epoch": 23.183818310858765, + "grad_norm": 5.147243499755859, + "learning_rate": 7.682597586941094e-05, + "loss": 0.031257224082946775, + "step": 163330 + }, + { + "epoch": 23.185237757274663, + "grad_norm": 3.593852996826172, + "learning_rate": 7.682455642299504e-05, + "loss": 
0.006379434466361999, + "step": 163340 + }, + { + "epoch": 23.18665720369056, + "grad_norm": 0.6422228813171387, + "learning_rate": 7.682313697657913e-05, + "loss": 0.005741833150386811, + "step": 163350 + }, + { + "epoch": 23.18807665010646, + "grad_norm": 0.015536986291408539, + "learning_rate": 7.682171753016324e-05, + "loss": 0.00452861487865448, + "step": 163360 + }, + { + "epoch": 23.189496096522355, + "grad_norm": 7.10488224029541, + "learning_rate": 7.682029808374734e-05, + "loss": 0.037220853567123416, + "step": 163370 + }, + { + "epoch": 23.190915542938253, + "grad_norm": 0.06295428425073624, + "learning_rate": 7.681887863733145e-05, + "loss": 0.01894667446613312, + "step": 163380 + }, + { + "epoch": 23.19233498935415, + "grad_norm": 0.06888831406831741, + "learning_rate": 7.681745919091554e-05, + "loss": 0.016139568388462068, + "step": 163390 + }, + { + "epoch": 23.19375443577005, + "grad_norm": 0.006295394152402878, + "learning_rate": 7.681603974449965e-05, + "loss": 0.004946483299136162, + "step": 163400 + }, + { + "epoch": 23.195173882185948, + "grad_norm": 1.4388620853424072, + "learning_rate": 7.681462029808374e-05, + "loss": 0.006145629659295082, + "step": 163410 + }, + { + "epoch": 23.196593328601846, + "grad_norm": 9.541016578674316, + "learning_rate": 7.681320085166786e-05, + "loss": 0.058392131328582765, + "step": 163420 + }, + { + "epoch": 23.198012775017745, + "grad_norm": 1.7744998931884766, + "learning_rate": 7.681178140525195e-05, + "loss": 0.003867834433913231, + "step": 163430 + }, + { + "epoch": 23.19943222143364, + "grad_norm": 0.10151354223489761, + "learning_rate": 7.681036195883605e-05, + "loss": 0.043567624688148496, + "step": 163440 + }, + { + "epoch": 23.200851667849538, + "grad_norm": 4.099494934082031, + "learning_rate": 7.680894251242016e-05, + "loss": 0.006441095471382141, + "step": 163450 + }, + { + "epoch": 23.202271114265436, + "grad_norm": 4.0720295906066895, + "learning_rate": 7.680752306600426e-05, + "loss": 
0.03324064612388611, + "step": 163460 + }, + { + "epoch": 23.203690560681334, + "grad_norm": 0.14392825961112976, + "learning_rate": 7.680610361958837e-05, + "loss": 0.02270908057689667, + "step": 163470 + }, + { + "epoch": 23.205110007097232, + "grad_norm": 7.81494140625, + "learning_rate": 7.680468417317247e-05, + "loss": 0.07168574333190918, + "step": 163480 + }, + { + "epoch": 23.20652945351313, + "grad_norm": 13.527397155761719, + "learning_rate": 7.680326472675658e-05, + "loss": 0.033021238446235654, + "step": 163490 + }, + { + "epoch": 23.20794889992903, + "grad_norm": 0.418710857629776, + "learning_rate": 7.680184528034066e-05, + "loss": 0.00859561413526535, + "step": 163500 + }, + { + "epoch": 23.20794889992903, + "eval_accuracy": 0.9851211292681376, + "eval_loss": 0.05818798393011093, + "eval_runtime": 35.8878, + "eval_samples_per_second": 438.227, + "eval_steps_per_second": 13.709, + "step": 163500 + }, + { + "epoch": 23.209368346344924, + "grad_norm": 0.03371967002749443, + "learning_rate": 7.680042583392477e-05, + "loss": 0.02113671749830246, + "step": 163510 + }, + { + "epoch": 23.210787792760822, + "grad_norm": 10.847223281860352, + "learning_rate": 7.679900638750887e-05, + "loss": 0.02217719703912735, + "step": 163520 + }, + { + "epoch": 23.21220723917672, + "grad_norm": 0.15189655125141144, + "learning_rate": 7.679758694109298e-05, + "loss": 0.02578548192977905, + "step": 163530 + }, + { + "epoch": 23.21362668559262, + "grad_norm": 0.07837851345539093, + "learning_rate": 7.679616749467708e-05, + "loss": 0.00646902546286583, + "step": 163540 + }, + { + "epoch": 23.215046132008517, + "grad_norm": 0.6939084529876709, + "learning_rate": 7.679474804826118e-05, + "loss": 0.027848750352859497, + "step": 163550 + }, + { + "epoch": 23.216465578424415, + "grad_norm": 12.830351829528809, + "learning_rate": 7.679332860184529e-05, + "loss": 0.011741799861192703, + "step": 163560 + }, + { + "epoch": 23.217885024840314, + "grad_norm": 13.902604103088379, + 
"learning_rate": 7.679190915542938e-05, + "loss": 0.018804095685482025, + "step": 163570 + }, + { + "epoch": 23.21930447125621, + "grad_norm": 0.33358854055404663, + "learning_rate": 7.67904897090135e-05, + "loss": 0.07884200215339661, + "step": 163580 + }, + { + "epoch": 23.220723917672107, + "grad_norm": 2.8247289657592773, + "learning_rate": 7.678907026259759e-05, + "loss": 0.021544642746448517, + "step": 163590 + }, + { + "epoch": 23.222143364088005, + "grad_norm": 1.3561774492263794, + "learning_rate": 7.678765081618169e-05, + "loss": 0.06348315477371216, + "step": 163600 + }, + { + "epoch": 23.223562810503903, + "grad_norm": 0.21089696884155273, + "learning_rate": 7.678623136976579e-05, + "loss": 0.03758191168308258, + "step": 163610 + }, + { + "epoch": 23.2249822569198, + "grad_norm": 1.717174768447876, + "learning_rate": 7.67848119233499e-05, + "loss": 0.02656695246696472, + "step": 163620 + }, + { + "epoch": 23.2264017033357, + "grad_norm": 0.08262735605239868, + "learning_rate": 7.6783392476934e-05, + "loss": 0.0337538480758667, + "step": 163630 + }, + { + "epoch": 23.2278211497516, + "grad_norm": 0.0836467370390892, + "learning_rate": 7.67819730305181e-05, + "loss": 0.01760019510984421, + "step": 163640 + }, + { + "epoch": 23.229240596167493, + "grad_norm": 0.012955832295119762, + "learning_rate": 7.67805535841022e-05, + "loss": 0.019087645411491393, + "step": 163650 + }, + { + "epoch": 23.23066004258339, + "grad_norm": 0.10591467469930649, + "learning_rate": 7.67791341376863e-05, + "loss": 0.015456712245941162, + "step": 163660 + }, + { + "epoch": 23.23207948899929, + "grad_norm": 0.08486371487379074, + "learning_rate": 7.677771469127041e-05, + "loss": 0.03375494778156281, + "step": 163670 + }, + { + "epoch": 23.233498935415188, + "grad_norm": 0.4872633218765259, + "learning_rate": 7.677629524485451e-05, + "loss": 0.019985070824623107, + "step": 163680 + }, + { + "epoch": 23.234918381831086, + "grad_norm": 1.2168809175491333, + "learning_rate": 
7.677487579843862e-05, + "loss": 0.012876510620117188, + "step": 163690 + }, + { + "epoch": 23.236337828246985, + "grad_norm": 2.1401267051696777, + "learning_rate": 7.67734563520227e-05, + "loss": 0.02096586674451828, + "step": 163700 + }, + { + "epoch": 23.237757274662883, + "grad_norm": 3.3611226081848145, + "learning_rate": 7.677203690560682e-05, + "loss": 0.012545591592788697, + "step": 163710 + }, + { + "epoch": 23.239176721078778, + "grad_norm": 0.045655641704797745, + "learning_rate": 7.677061745919091e-05, + "loss": 0.003518717736005783, + "step": 163720 + }, + { + "epoch": 23.240596167494676, + "grad_norm": 2.3230082988739014, + "learning_rate": 7.676919801277502e-05, + "loss": 0.009554924070835113, + "step": 163730 + }, + { + "epoch": 23.242015613910574, + "grad_norm": 0.05728672072291374, + "learning_rate": 7.676777856635913e-05, + "loss": 0.010943768918514252, + "step": 163740 + }, + { + "epoch": 23.243435060326473, + "grad_norm": 15.546445846557617, + "learning_rate": 7.676635911994322e-05, + "loss": 0.04270048439502716, + "step": 163750 + }, + { + "epoch": 23.24485450674237, + "grad_norm": 0.0464971549808979, + "learning_rate": 7.676493967352733e-05, + "loss": 0.017297746241092683, + "step": 163760 + }, + { + "epoch": 23.24627395315827, + "grad_norm": 0.10044872015714645, + "learning_rate": 7.676352022711143e-05, + "loss": 0.013265913724899292, + "step": 163770 + }, + { + "epoch": 23.247693399574167, + "grad_norm": 0.24115735292434692, + "learning_rate": 7.676210078069554e-05, + "loss": 0.008149499446153641, + "step": 163780 + }, + { + "epoch": 23.249112845990062, + "grad_norm": 0.16820739209651947, + "learning_rate": 7.676068133427963e-05, + "loss": 0.009171213209629058, + "step": 163790 + }, + { + "epoch": 23.25053229240596, + "grad_norm": 3.914043426513672, + "learning_rate": 7.675926188786373e-05, + "loss": 0.041691988706588745, + "step": 163800 + }, + { + "epoch": 23.25195173882186, + "grad_norm": 0.008680049329996109, + "learning_rate": 
7.675784244144783e-05, + "loss": 0.007538994401693344, + "step": 163810 + }, + { + "epoch": 23.253371185237757, + "grad_norm": 0.23187461495399475, + "learning_rate": 7.675642299503194e-05, + "loss": 0.027346861362457276, + "step": 163820 + }, + { + "epoch": 23.254790631653655, + "grad_norm": 15.285842895507812, + "learning_rate": 7.675500354861604e-05, + "loss": 0.017390260100364686, + "step": 163830 + }, + { + "epoch": 23.256210078069554, + "grad_norm": 2.259439468383789, + "learning_rate": 7.675358410220015e-05, + "loss": 0.00849878042936325, + "step": 163840 + }, + { + "epoch": 23.257629524485452, + "grad_norm": 1.5351204872131348, + "learning_rate": 7.675216465578426e-05, + "loss": 0.014755818247795104, + "step": 163850 + }, + { + "epoch": 23.259048970901347, + "grad_norm": 0.6266843676567078, + "learning_rate": 7.675074520936834e-05, + "loss": 0.014989227056503296, + "step": 163860 + }, + { + "epoch": 23.260468417317245, + "grad_norm": 1.9740090370178223, + "learning_rate": 7.674932576295245e-05, + "loss": 0.02225850969552994, + "step": 163870 + }, + { + "epoch": 23.261887863733143, + "grad_norm": 0.5121137499809265, + "learning_rate": 7.674790631653655e-05, + "loss": 0.016324050724506378, + "step": 163880 + }, + { + "epoch": 23.26330731014904, + "grad_norm": 0.4973202645778656, + "learning_rate": 7.674648687012066e-05, + "loss": 0.02932792007923126, + "step": 163890 + }, + { + "epoch": 23.26472675656494, + "grad_norm": 0.12655329704284668, + "learning_rate": 7.674506742370476e-05, + "loss": 0.024949759244918823, + "step": 163900 + }, + { + "epoch": 23.26614620298084, + "grad_norm": 0.14284615218639374, + "learning_rate": 7.674364797728886e-05, + "loss": 0.013745367527008057, + "step": 163910 + }, + { + "epoch": 23.267565649396737, + "grad_norm": 0.07328206300735474, + "learning_rate": 7.674222853087295e-05, + "loss": 0.05794970989227295, + "step": 163920 + }, + { + "epoch": 23.26898509581263, + "grad_norm": 0.17918673157691956, + "learning_rate": 
7.674080908445707e-05, + "loss": 0.00791948065161705, + "step": 163930 + }, + { + "epoch": 23.27040454222853, + "grad_norm": 2.714103937149048, + "learning_rate": 7.673938963804118e-05, + "loss": 0.00900263786315918, + "step": 163940 + }, + { + "epoch": 23.271823988644428, + "grad_norm": 0.2439054697751999, + "learning_rate": 7.673797019162527e-05, + "loss": 0.0031405165791511536, + "step": 163950 + }, + { + "epoch": 23.273243435060326, + "grad_norm": 0.05772057920694351, + "learning_rate": 7.673655074520937e-05, + "loss": 0.0015002239495515823, + "step": 163960 + }, + { + "epoch": 23.274662881476225, + "grad_norm": 0.11124888807535172, + "learning_rate": 7.673513129879347e-05, + "loss": 0.05028444528579712, + "step": 163970 + }, + { + "epoch": 23.276082327892123, + "grad_norm": 0.015601775608956814, + "learning_rate": 7.673371185237758e-05, + "loss": 0.004856827855110169, + "step": 163980 + }, + { + "epoch": 23.27750177430802, + "grad_norm": 7.011933326721191, + "learning_rate": 7.673229240596168e-05, + "loss": 0.01472586691379547, + "step": 163990 + }, + { + "epoch": 23.278921220723916, + "grad_norm": 0.2503751814365387, + "learning_rate": 7.673087295954579e-05, + "loss": 0.017151153087615965, + "step": 164000 + }, + { + "epoch": 23.278921220723916, + "eval_accuracy": 0.9893813187511922, + "eval_loss": 0.04085047170519829, + "eval_runtime": 35.6052, + "eval_samples_per_second": 441.706, + "eval_steps_per_second": 13.818, + "step": 164000 + }, + { + "epoch": 23.280340667139814, + "grad_norm": 0.019486384466290474, + "learning_rate": 7.672945351312987e-05, + "loss": 0.02349367141723633, + "step": 164010 + }, + { + "epoch": 23.281760113555713, + "grad_norm": 0.025149967521429062, + "learning_rate": 7.672803406671398e-05, + "loss": 0.02428726702928543, + "step": 164020 + }, + { + "epoch": 23.28317955997161, + "grad_norm": 0.5827234387397766, + "learning_rate": 7.67266146202981e-05, + "loss": 0.037998273968696594, + "step": 164030 + }, + { + "epoch": 
23.28459900638751, + "grad_norm": 0.3379354476928711, + "learning_rate": 7.672519517388219e-05, + "loss": 0.003741409257054329, + "step": 164040 + }, + { + "epoch": 23.286018452803408, + "grad_norm": 0.9978327751159668, + "learning_rate": 7.67237757274663e-05, + "loss": 0.04062702059745789, + "step": 164050 + }, + { + "epoch": 23.287437899219306, + "grad_norm": 0.011829227209091187, + "learning_rate": 7.672235628105039e-05, + "loss": 0.0036276292055845262, + "step": 164060 + }, + { + "epoch": 23.2888573456352, + "grad_norm": 7.7990641593933105, + "learning_rate": 7.67209368346345e-05, + "loss": 0.003731155022978783, + "step": 164070 + }, + { + "epoch": 23.2902767920511, + "grad_norm": 0.01671607978641987, + "learning_rate": 7.67195173882186e-05, + "loss": 0.016301195323467254, + "step": 164080 + }, + { + "epoch": 23.291696238466997, + "grad_norm": 0.18641042709350586, + "learning_rate": 7.67180979418027e-05, + "loss": 0.011274157464504242, + "step": 164090 + }, + { + "epoch": 23.293115684882896, + "grad_norm": 1.1734553575515747, + "learning_rate": 7.67166784953868e-05, + "loss": 0.0033438004553318023, + "step": 164100 + }, + { + "epoch": 23.294535131298794, + "grad_norm": 0.4467449188232422, + "learning_rate": 7.67152590489709e-05, + "loss": 0.08818975687026978, + "step": 164110 + }, + { + "epoch": 23.295954577714692, + "grad_norm": 13.7846097946167, + "learning_rate": 7.671383960255501e-05, + "loss": 0.02947819232940674, + "step": 164120 + }, + { + "epoch": 23.29737402413059, + "grad_norm": 4.7052459716796875, + "learning_rate": 7.671242015613911e-05, + "loss": 0.002590373158454895, + "step": 164130 + }, + { + "epoch": 23.298793470546485, + "grad_norm": 6.38729190826416, + "learning_rate": 7.671100070972322e-05, + "loss": 0.03963978290557861, + "step": 164140 + }, + { + "epoch": 23.300212916962384, + "grad_norm": 11.49255084991455, + "learning_rate": 7.670958126330732e-05, + "loss": 0.03352076411247253, + "step": 164150 + }, + { + "epoch": 23.301632363378282, + 
"grad_norm": 0.04543439298868179, + "learning_rate": 7.670816181689141e-05, + "loss": 0.007700487971305847, + "step": 164160 + }, + { + "epoch": 23.30305180979418, + "grad_norm": 0.36027446389198303, + "learning_rate": 7.670674237047551e-05, + "loss": 0.010143016278743745, + "step": 164170 + }, + { + "epoch": 23.30447125621008, + "grad_norm": 0.13264481723308563, + "learning_rate": 7.670532292405962e-05, + "loss": 0.007651815563440323, + "step": 164180 + }, + { + "epoch": 23.305890702625977, + "grad_norm": 1.2894998788833618, + "learning_rate": 7.670390347764372e-05, + "loss": 0.010458014905452728, + "step": 164190 + }, + { + "epoch": 23.307310149041875, + "grad_norm": 0.028102636337280273, + "learning_rate": 7.670248403122783e-05, + "loss": 0.0012863524258136748, + "step": 164200 + }, + { + "epoch": 23.30872959545777, + "grad_norm": 5.462471961975098, + "learning_rate": 7.670106458481193e-05, + "loss": 0.014058217406272888, + "step": 164210 + }, + { + "epoch": 23.310149041873668, + "grad_norm": 0.18911345303058624, + "learning_rate": 7.669964513839603e-05, + "loss": 0.0027625951915979385, + "step": 164220 + }, + { + "epoch": 23.311568488289566, + "grad_norm": 0.05472433567047119, + "learning_rate": 7.669822569198014e-05, + "loss": 0.04935111105442047, + "step": 164230 + }, + { + "epoch": 23.312987934705465, + "grad_norm": 0.8976383209228516, + "learning_rate": 7.669680624556423e-05, + "loss": 0.009190114587545395, + "step": 164240 + }, + { + "epoch": 23.314407381121363, + "grad_norm": 0.12445452064275742, + "learning_rate": 7.669538679914834e-05, + "loss": 0.022233882546424867, + "step": 164250 + }, + { + "epoch": 23.31582682753726, + "grad_norm": 0.20108696818351746, + "learning_rate": 7.669396735273244e-05, + "loss": 0.003787750005722046, + "step": 164260 + }, + { + "epoch": 23.31724627395316, + "grad_norm": 0.011542386375367641, + "learning_rate": 7.669254790631654e-05, + "loss": 0.0074545606970787045, + "step": 164270 + }, + { + "epoch": 23.318665720369054, + 
"grad_norm": 0.022063687443733215, + "learning_rate": 7.669112845990064e-05, + "loss": 0.003003543987870216, + "step": 164280 + }, + { + "epoch": 23.320085166784953, + "grad_norm": 2.4177870750427246, + "learning_rate": 7.668970901348475e-05, + "loss": 0.010840468108654022, + "step": 164290 + }, + { + "epoch": 23.32150461320085, + "grad_norm": 0.382374107837677, + "learning_rate": 7.668828956706884e-05, + "loss": 0.013091279566287995, + "step": 164300 + }, + { + "epoch": 23.32292405961675, + "grad_norm": 6.0184831619262695, + "learning_rate": 7.668687012065296e-05, + "loss": 0.02513148486614227, + "step": 164310 + }, + { + "epoch": 23.324343506032648, + "grad_norm": 7.632693290710449, + "learning_rate": 7.668545067423705e-05, + "loss": 0.023075708746910097, + "step": 164320 + }, + { + "epoch": 23.325762952448546, + "grad_norm": 0.20893056690692902, + "learning_rate": 7.668403122782115e-05, + "loss": 0.003418174386024475, + "step": 164330 + }, + { + "epoch": 23.327182398864444, + "grad_norm": 7.626595973968506, + "learning_rate": 7.668261178140526e-05, + "loss": 0.007526922225952149, + "step": 164340 + }, + { + "epoch": 23.32860184528034, + "grad_norm": 13.876319885253906, + "learning_rate": 7.668119233498936e-05, + "loss": 0.016262876987457275, + "step": 164350 + }, + { + "epoch": 23.330021291696237, + "grad_norm": 1.2191271781921387, + "learning_rate": 7.667977288857347e-05, + "loss": 0.004137857630848884, + "step": 164360 + }, + { + "epoch": 23.331440738112136, + "grad_norm": 0.2824001610279083, + "learning_rate": 7.667835344215755e-05, + "loss": 0.03549357652664185, + "step": 164370 + }, + { + "epoch": 23.332860184528034, + "grad_norm": 0.04719771072268486, + "learning_rate": 7.667693399574166e-05, + "loss": 0.012347467988729478, + "step": 164380 + }, + { + "epoch": 23.334279630943932, + "grad_norm": 5.488483428955078, + "learning_rate": 7.667551454932576e-05, + "loss": 0.040853378176689145, + "step": 164390 + }, + { + "epoch": 23.33569907735983, + "grad_norm": 
0.19062750041484833, + "learning_rate": 7.667409510290987e-05, + "loss": 0.01288812756538391, + "step": 164400 + }, + { + "epoch": 23.33711852377573, + "grad_norm": 0.12844382226467133, + "learning_rate": 7.667267565649397e-05, + "loss": 0.017960503697395325, + "step": 164410 + }, + { + "epoch": 23.338537970191624, + "grad_norm": 0.07746586948633194, + "learning_rate": 7.667125621007807e-05, + "loss": 0.014731019735336304, + "step": 164420 + }, + { + "epoch": 23.339957416607522, + "grad_norm": 0.22059093415737152, + "learning_rate": 7.666983676366218e-05, + "loss": 0.007802623510360718, + "step": 164430 + }, + { + "epoch": 23.34137686302342, + "grad_norm": 11.91578483581543, + "learning_rate": 7.666841731724628e-05, + "loss": 0.013570009171962738, + "step": 164440 + }, + { + "epoch": 23.34279630943932, + "grad_norm": 0.16380976140499115, + "learning_rate": 7.666699787083039e-05, + "loss": 0.0757303535938263, + "step": 164450 + }, + { + "epoch": 23.344215755855217, + "grad_norm": 0.40290507674217224, + "learning_rate": 7.666557842441448e-05, + "loss": 0.0031037993729114534, + "step": 164460 + }, + { + "epoch": 23.345635202271115, + "grad_norm": 2.590711832046509, + "learning_rate": 7.666415897799858e-05, + "loss": 0.03262744247913361, + "step": 164470 + }, + { + "epoch": 23.347054648687013, + "grad_norm": 0.00724292965605855, + "learning_rate": 7.666273953158268e-05, + "loss": 0.007898370921611785, + "step": 164480 + }, + { + "epoch": 23.348474095102908, + "grad_norm": 3.1076743602752686, + "learning_rate": 7.666132008516679e-05, + "loss": 0.007815198600292205, + "step": 164490 + }, + { + "epoch": 23.349893541518806, + "grad_norm": 0.11026152223348618, + "learning_rate": 7.665990063875089e-05, + "loss": 0.02721671760082245, + "step": 164500 + }, + { + "epoch": 23.349893541518806, + "eval_accuracy": 0.9834679214090418, + "eval_loss": 0.07727793604135513, + "eval_runtime": 32.1437, + "eval_samples_per_second": 489.271, + "eval_steps_per_second": 15.306, + "step": 
164500 + }, + { + "epoch": 23.351312987934705, + "grad_norm": 0.5263890027999878, + "learning_rate": 7.6658481192335e-05, + "loss": 0.010638675093650818, + "step": 164510 + }, + { + "epoch": 23.352732434350603, + "grad_norm": 0.6094750165939331, + "learning_rate": 7.66570617459191e-05, + "loss": 0.057137519121170044, + "step": 164520 + }, + { + "epoch": 23.3541518807665, + "grad_norm": 0.24372835457324982, + "learning_rate": 7.665564229950319e-05, + "loss": 0.02085036039352417, + "step": 164530 + }, + { + "epoch": 23.3555713271824, + "grad_norm": 0.1645321398973465, + "learning_rate": 7.66542228530873e-05, + "loss": 0.0228232204914093, + "step": 164540 + }, + { + "epoch": 23.356990773598298, + "grad_norm": 7.412874698638916, + "learning_rate": 7.66528034066714e-05, + "loss": 0.030020755529403687, + "step": 164550 + }, + { + "epoch": 23.358410220014193, + "grad_norm": 0.029567338526248932, + "learning_rate": 7.665138396025551e-05, + "loss": 0.034139391779899594, + "step": 164560 + }, + { + "epoch": 23.35982966643009, + "grad_norm": 1.683417558670044, + "learning_rate": 7.664996451383961e-05, + "loss": 0.031986591219902036, + "step": 164570 + }, + { + "epoch": 23.36124911284599, + "grad_norm": 0.15090076625347137, + "learning_rate": 7.664854506742371e-05, + "loss": 0.003864661231637001, + "step": 164580 + }, + { + "epoch": 23.362668559261888, + "grad_norm": 1.5969091653823853, + "learning_rate": 7.66471256210078e-05, + "loss": 0.010417811572551727, + "step": 164590 + }, + { + "epoch": 23.364088005677786, + "grad_norm": 1.521074652671814, + "learning_rate": 7.664570617459192e-05, + "loss": 0.016457566618919374, + "step": 164600 + }, + { + "epoch": 23.365507452093684, + "grad_norm": 0.6433447003364563, + "learning_rate": 7.664428672817601e-05, + "loss": 0.0059282127767801285, + "step": 164610 + }, + { + "epoch": 23.366926898509583, + "grad_norm": 0.15938644111156464, + "learning_rate": 7.664286728176012e-05, + "loss": 0.024022677540779115, + "step": 164620 + }, + { + 
"epoch": 23.368346344925477, + "grad_norm": 7.223662853240967, + "learning_rate": 7.664144783534422e-05, + "loss": 0.019348344206809996, + "step": 164630 + }, + { + "epoch": 23.369765791341376, + "grad_norm": 0.07555852830410004, + "learning_rate": 7.664002838892832e-05, + "loss": 0.017076675593852998, + "step": 164640 + }, + { + "epoch": 23.371185237757274, + "grad_norm": 0.031056849285960197, + "learning_rate": 7.663860894251243e-05, + "loss": 0.0026843443512916563, + "step": 164650 + }, + { + "epoch": 23.372604684173172, + "grad_norm": 1.3230938911437988, + "learning_rate": 7.663718949609653e-05, + "loss": 0.0034713804721832275, + "step": 164660 + }, + { + "epoch": 23.37402413058907, + "grad_norm": 0.005016915034502745, + "learning_rate": 7.663577004968064e-05, + "loss": 0.0034542202949523924, + "step": 164670 + }, + { + "epoch": 23.37544357700497, + "grad_norm": 0.031200548633933067, + "learning_rate": 7.663435060326472e-05, + "loss": 0.03490687608718872, + "step": 164680 + }, + { + "epoch": 23.376863023420867, + "grad_norm": 0.11020562052726746, + "learning_rate": 7.663293115684883e-05, + "loss": 0.02072673738002777, + "step": 164690 + }, + { + "epoch": 23.378282469836762, + "grad_norm": 2.4811577796936035, + "learning_rate": 7.663151171043293e-05, + "loss": 0.010047583281993866, + "step": 164700 + }, + { + "epoch": 23.37970191625266, + "grad_norm": 2.5246899127960205, + "learning_rate": 7.663009226401704e-05, + "loss": 0.007637768983840942, + "step": 164710 + }, + { + "epoch": 23.38112136266856, + "grad_norm": 0.3505830764770508, + "learning_rate": 7.662867281760114e-05, + "loss": 0.02963155210018158, + "step": 164720 + }, + { + "epoch": 23.382540809084457, + "grad_norm": 0.5208035111427307, + "learning_rate": 7.662725337118524e-05, + "loss": 0.004340691119432449, + "step": 164730 + }, + { + "epoch": 23.383960255500355, + "grad_norm": 0.3988662660121918, + "learning_rate": 7.662583392476935e-05, + "loss": 0.00736311674118042, + "step": 164740 + }, + { + 
"epoch": 23.385379701916253, + "grad_norm": 1.5569313764572144, + "learning_rate": 7.662441447835344e-05, + "loss": 0.024629420042037962, + "step": 164750 + }, + { + "epoch": 23.386799148332152, + "grad_norm": 0.12053435295820236, + "learning_rate": 7.662299503193755e-05, + "loss": 0.03522132337093353, + "step": 164760 + }, + { + "epoch": 23.388218594748047, + "grad_norm": 0.41866621375083923, + "learning_rate": 7.662157558552165e-05, + "loss": 0.021464604139328002, + "step": 164770 + }, + { + "epoch": 23.389638041163945, + "grad_norm": 2.6294496059417725, + "learning_rate": 7.662015613910575e-05, + "loss": 0.0067822933197021484, + "step": 164780 + }, + { + "epoch": 23.391057487579843, + "grad_norm": 0.066832534968853, + "learning_rate": 7.661873669268985e-05, + "loss": 0.028071784973144533, + "step": 164790 + }, + { + "epoch": 23.39247693399574, + "grad_norm": 0.07968316972255707, + "learning_rate": 7.661731724627396e-05, + "loss": 0.02232668250799179, + "step": 164800 + }, + { + "epoch": 23.39389638041164, + "grad_norm": 0.06821835041046143, + "learning_rate": 7.661589779985805e-05, + "loss": 0.01089501827955246, + "step": 164810 + }, + { + "epoch": 23.395315826827538, + "grad_norm": 0.014848613180220127, + "learning_rate": 7.661447835344217e-05, + "loss": 0.005894585326313972, + "step": 164820 + }, + { + "epoch": 23.396735273243436, + "grad_norm": 0.03290534019470215, + "learning_rate": 7.661305890702626e-05, + "loss": 0.02245619148015976, + "step": 164830 + }, + { + "epoch": 23.39815471965933, + "grad_norm": 0.08413399010896683, + "learning_rate": 7.661163946061036e-05, + "loss": 0.013038022816181183, + "step": 164840 + }, + { + "epoch": 23.39957416607523, + "grad_norm": 0.7405422329902649, + "learning_rate": 7.661022001419447e-05, + "loss": 0.00945512279868126, + "step": 164850 + }, + { + "epoch": 23.400993612491128, + "grad_norm": 0.03713800385594368, + "learning_rate": 7.660880056777857e-05, + "loss": 0.020492833852767945, + "step": 164860 + }, + { + 
"epoch": 23.402413058907026, + "grad_norm": 0.21996015310287476, + "learning_rate": 7.660738112136268e-05, + "loss": 0.0015836004167795181, + "step": 164870 + }, + { + "epoch": 23.403832505322924, + "grad_norm": 0.06999274343252182, + "learning_rate": 7.660596167494676e-05, + "loss": 0.011954693496227265, + "step": 164880 + }, + { + "epoch": 23.405251951738823, + "grad_norm": 0.7203047275543213, + "learning_rate": 7.660454222853087e-05, + "loss": 0.00869842916727066, + "step": 164890 + }, + { + "epoch": 23.40667139815472, + "grad_norm": 0.2783105969429016, + "learning_rate": 7.660312278211497e-05, + "loss": 0.013655006885528564, + "step": 164900 + }, + { + "epoch": 23.408090844570616, + "grad_norm": 0.0999186560511589, + "learning_rate": 7.660170333569908e-05, + "loss": 0.0021539811044931413, + "step": 164910 + }, + { + "epoch": 23.409510290986514, + "grad_norm": 1.2562990188598633, + "learning_rate": 7.660028388928318e-05, + "loss": 0.016599693894386293, + "step": 164920 + }, + { + "epoch": 23.410929737402412, + "grad_norm": 0.02801164612174034, + "learning_rate": 7.659886444286729e-05, + "loss": 0.012262866646051408, + "step": 164930 + }, + { + "epoch": 23.41234918381831, + "grad_norm": 4.535494804382324, + "learning_rate": 7.659744499645139e-05, + "loss": 0.02802823781967163, + "step": 164940 + }, + { + "epoch": 23.41376863023421, + "grad_norm": 0.10716408491134644, + "learning_rate": 7.659602555003549e-05, + "loss": 0.007634977996349335, + "step": 164950 + }, + { + "epoch": 23.415188076650107, + "grad_norm": 0.25433796644210815, + "learning_rate": 7.65946061036196e-05, + "loss": 0.015354809165000916, + "step": 164960 + }, + { + "epoch": 23.416607523066006, + "grad_norm": 0.883215606212616, + "learning_rate": 7.65931866572037e-05, + "loss": 0.024400827288627625, + "step": 164970 + }, + { + "epoch": 23.4180269694819, + "grad_norm": 1.2165991067886353, + "learning_rate": 7.65917672107878e-05, + "loss": 0.011247232556343079, + "step": 164980 + }, + { + "epoch": 
23.4194464158978, + "grad_norm": 2.535158157348633, + "learning_rate": 7.659034776437189e-05, + "loss": 0.010126005113124847, + "step": 164990 + }, + { + "epoch": 23.420865862313697, + "grad_norm": 0.7211308479309082, + "learning_rate": 7.6588928317956e-05, + "loss": 0.006162597611546516, + "step": 165000 + }, + { + "epoch": 23.420865862313697, + "eval_accuracy": 0.9864564125389458, + "eval_loss": 0.05240930989384651, + "eval_runtime": 32.4128, + "eval_samples_per_second": 485.21, + "eval_steps_per_second": 15.179, + "step": 165000 + }, + { + "epoch": 23.422285308729595, + "grad_norm": 0.021497100591659546, + "learning_rate": 7.65875088715401e-05, + "loss": 0.019372935593128204, + "step": 165010 + }, + { + "epoch": 23.423704755145494, + "grad_norm": 0.31521075963974, + "learning_rate": 7.658608942512421e-05, + "loss": 0.010481907427310944, + "step": 165020 + }, + { + "epoch": 23.425124201561392, + "grad_norm": 0.2185920923948288, + "learning_rate": 7.65846699787083e-05, + "loss": 0.023853006958961486, + "step": 165030 + }, + { + "epoch": 23.42654364797729, + "grad_norm": 0.04154974967241287, + "learning_rate": 7.65832505322924e-05, + "loss": 0.016747835278511047, + "step": 165040 + }, + { + "epoch": 23.427963094393185, + "grad_norm": 0.011894523166120052, + "learning_rate": 7.658183108587651e-05, + "loss": 0.020564086735248566, + "step": 165050 + }, + { + "epoch": 23.429382540809083, + "grad_norm": 5.230328559875488, + "learning_rate": 7.658041163946061e-05, + "loss": 0.014585936069488525, + "step": 165060 + }, + { + "epoch": 23.43080198722498, + "grad_norm": 10.862504959106445, + "learning_rate": 7.657899219304472e-05, + "loss": 0.04045327603816986, + "step": 165070 + }, + { + "epoch": 23.43222143364088, + "grad_norm": 2.1711983680725098, + "learning_rate": 7.657757274662882e-05, + "loss": 0.02016182094812393, + "step": 165080 + }, + { + "epoch": 23.433640880056778, + "grad_norm": 0.22810782492160797, + "learning_rate": 7.657615330021292e-05, + "loss": 
0.03608691096305847, + "step": 165090 + }, + { + "epoch": 23.435060326472676, + "grad_norm": 0.06499504297971725, + "learning_rate": 7.657473385379701e-05, + "loss": 0.006804431974887848, + "step": 165100 + }, + { + "epoch": 23.436479772888575, + "grad_norm": 7.876749515533447, + "learning_rate": 7.657331440738113e-05, + "loss": 0.05112998485565186, + "step": 165110 + }, + { + "epoch": 23.43789921930447, + "grad_norm": 0.7119622230529785, + "learning_rate": 7.657189496096522e-05, + "loss": 0.006392943859100342, + "step": 165120 + }, + { + "epoch": 23.439318665720368, + "grad_norm": 0.020981527864933014, + "learning_rate": 7.657047551454933e-05, + "loss": 0.0031373929232358932, + "step": 165130 + }, + { + "epoch": 23.440738112136266, + "grad_norm": 1.9192702770233154, + "learning_rate": 7.656905606813343e-05, + "loss": 0.004862849786877632, + "step": 165140 + }, + { + "epoch": 23.442157558552164, + "grad_norm": 0.026989160105586052, + "learning_rate": 7.656763662171753e-05, + "loss": 0.030451101064682008, + "step": 165150 + }, + { + "epoch": 23.443577004968063, + "grad_norm": 0.18356965482234955, + "learning_rate": 7.656621717530164e-05, + "loss": 0.0008653681725263596, + "step": 165160 + }, + { + "epoch": 23.44499645138396, + "grad_norm": 0.45307213068008423, + "learning_rate": 7.656479772888574e-05, + "loss": 0.003996932879090309, + "step": 165170 + }, + { + "epoch": 23.44641589779986, + "grad_norm": 0.1124897301197052, + "learning_rate": 7.656337828246985e-05, + "loss": 0.00297844335436821, + "step": 165180 + }, + { + "epoch": 23.447835344215754, + "grad_norm": 3.4592669010162354, + "learning_rate": 7.656195883605393e-05, + "loss": 0.010568429529666901, + "step": 165190 + }, + { + "epoch": 23.449254790631652, + "grad_norm": 0.29757747054100037, + "learning_rate": 7.656053938963804e-05, + "loss": 0.028656786680221556, + "step": 165200 + }, + { + "epoch": 23.45067423704755, + "grad_norm": 0.03736260160803795, + "learning_rate": 7.655911994322214e-05, + "loss": 
0.005677218362689018, + "step": 165210 + }, + { + "epoch": 23.45209368346345, + "grad_norm": 0.12168633937835693, + "learning_rate": 7.655770049680625e-05, + "loss": 0.004092790558934212, + "step": 165220 + }, + { + "epoch": 23.453513129879347, + "grad_norm": 1.347800374031067, + "learning_rate": 7.655628105039036e-05, + "loss": 0.01336227059364319, + "step": 165230 + }, + { + "epoch": 23.454932576295246, + "grad_norm": 0.12720844149589539, + "learning_rate": 7.655486160397445e-05, + "loss": 0.007681816816329956, + "step": 165240 + }, + { + "epoch": 23.456352022711144, + "grad_norm": 0.01899314858019352, + "learning_rate": 7.655344215755856e-05, + "loss": 0.02640007734298706, + "step": 165250 + }, + { + "epoch": 23.45777146912704, + "grad_norm": 2.134843587875366, + "learning_rate": 7.655202271114265e-05, + "loss": 0.024336129426956177, + "step": 165260 + }, + { + "epoch": 23.459190915542937, + "grad_norm": 0.060599781572818756, + "learning_rate": 7.655060326472676e-05, + "loss": 0.029612934589385985, + "step": 165270 + }, + { + "epoch": 23.460610361958835, + "grad_norm": 2.5477445125579834, + "learning_rate": 7.654918381831086e-05, + "loss": 0.030779799818992613, + "step": 165280 + }, + { + "epoch": 23.462029808374734, + "grad_norm": 0.47447869181632996, + "learning_rate": 7.654776437189497e-05, + "loss": 0.04414839148521423, + "step": 165290 + }, + { + "epoch": 23.463449254790632, + "grad_norm": 14.569575309753418, + "learning_rate": 7.654634492547906e-05, + "loss": 0.06934483051300049, + "step": 165300 + }, + { + "epoch": 23.46486870120653, + "grad_norm": 0.04383723810315132, + "learning_rate": 7.654492547906317e-05, + "loss": 0.002575576677918434, + "step": 165310 + }, + { + "epoch": 23.46628814762243, + "grad_norm": 0.12279897183179855, + "learning_rate": 7.654350603264728e-05, + "loss": 0.021285238862037658, + "step": 165320 + }, + { + "epoch": 23.467707594038323, + "grad_norm": 0.03639863803982735, + "learning_rate": 7.654208658623138e-05, + "loss": 
0.007353863120079041, + "step": 165330 + }, + { + "epoch": 23.46912704045422, + "grad_norm": 2.0143277645111084, + "learning_rate": 7.654066713981549e-05, + "loss": 0.019287930428981782, + "step": 165340 + }, + { + "epoch": 23.47054648687012, + "grad_norm": 15.339903831481934, + "learning_rate": 7.653924769339957e-05, + "loss": 0.03109239935874939, + "step": 165350 + }, + { + "epoch": 23.471965933286018, + "grad_norm": 0.19022344052791595, + "learning_rate": 7.653782824698368e-05, + "loss": 0.00262887068092823, + "step": 165360 + }, + { + "epoch": 23.473385379701917, + "grad_norm": 18.087942123413086, + "learning_rate": 7.653640880056778e-05, + "loss": 0.00841383934020996, + "step": 165370 + }, + { + "epoch": 23.474804826117815, + "grad_norm": 2.532810688018799, + "learning_rate": 7.653498935415189e-05, + "loss": 0.02143785059452057, + "step": 165380 + }, + { + "epoch": 23.476224272533713, + "grad_norm": 0.44489768147468567, + "learning_rate": 7.653356990773599e-05, + "loss": 0.029723387956619263, + "step": 165390 + }, + { + "epoch": 23.477643718949608, + "grad_norm": 0.06287723034620285, + "learning_rate": 7.653215046132008e-05, + "loss": 0.016013221442699434, + "step": 165400 + }, + { + "epoch": 23.479063165365506, + "grad_norm": 6.326075077056885, + "learning_rate": 7.65307310149042e-05, + "loss": 0.006591279804706573, + "step": 165410 + }, + { + "epoch": 23.480482611781405, + "grad_norm": 0.006258894223719835, + "learning_rate": 7.652931156848829e-05, + "loss": 0.010085760056972504, + "step": 165420 + }, + { + "epoch": 23.481902058197303, + "grad_norm": 0.006108805071562529, + "learning_rate": 7.65278921220724e-05, + "loss": 0.02563619613647461, + "step": 165430 + }, + { + "epoch": 23.4833215046132, + "grad_norm": 1.6585707664489746, + "learning_rate": 7.65264726756565e-05, + "loss": 0.018295219540596007, + "step": 165440 + }, + { + "epoch": 23.4847409510291, + "grad_norm": 0.5287357568740845, + "learning_rate": 7.65250532292406e-05, + "loss": 
0.0030645165592432023, + "step": 165450 + }, + { + "epoch": 23.486160397444998, + "grad_norm": 0.28559088706970215, + "learning_rate": 7.65236337828247e-05, + "loss": 0.0019525133073329925, + "step": 165460 + }, + { + "epoch": 23.487579843860892, + "grad_norm": 0.16137036681175232, + "learning_rate": 7.652221433640881e-05, + "loss": 0.028801512718200684, + "step": 165470 + }, + { + "epoch": 23.48899929027679, + "grad_norm": 0.01263955608010292, + "learning_rate": 7.65207948899929e-05, + "loss": 0.0014247901737689972, + "step": 165480 + }, + { + "epoch": 23.49041873669269, + "grad_norm": 0.6919429302215576, + "learning_rate": 7.651937544357702e-05, + "loss": 0.0018203221261501312, + "step": 165490 + }, + { + "epoch": 23.491838183108587, + "grad_norm": 3.8670976161956787, + "learning_rate": 7.651795599716111e-05, + "loss": 0.00261000394821167, + "step": 165500 + }, + { + "epoch": 23.491838183108587, + "eval_accuracy": 0.9933871685636168, + "eval_loss": 0.024964477866888046, + "eval_runtime": 33.0021, + "eval_samples_per_second": 476.546, + "eval_steps_per_second": 14.908, + "step": 165500 + }, + { + "epoch": 23.493257629524486, + "grad_norm": 0.005244300235062838, + "learning_rate": 7.651653655074521e-05, + "loss": 0.023367512226104736, + "step": 165510 + }, + { + "epoch": 23.494677075940384, + "grad_norm": 0.38279181718826294, + "learning_rate": 7.651511710432932e-05, + "loss": 0.0034342389553785324, + "step": 165520 + }, + { + "epoch": 23.496096522356282, + "grad_norm": 0.15812750160694122, + "learning_rate": 7.651369765791342e-05, + "loss": 0.0017393436282873154, + "step": 165530 + }, + { + "epoch": 23.497515968772177, + "grad_norm": 0.11679746955633163, + "learning_rate": 7.651227821149753e-05, + "loss": 0.0022747047245502473, + "step": 165540 + }, + { + "epoch": 23.498935415188075, + "grad_norm": 0.34129610657691956, + "learning_rate": 7.651085876508161e-05, + "loss": 0.02715761661529541, + "step": 165550 + }, + { + "epoch": 23.500354861603974, + "grad_norm": 
0.009584181942045689, + "learning_rate": 7.650943931866572e-05, + "loss": 0.01600191295146942, + "step": 165560 + }, + { + "epoch": 23.501774308019872, + "grad_norm": 7.015402793884277, + "learning_rate": 7.650801987224982e-05, + "loss": 0.010221995413303375, + "step": 165570 + }, + { + "epoch": 23.50319375443577, + "grad_norm": 0.17514468729496002, + "learning_rate": 7.650660042583393e-05, + "loss": 0.0058190785348415375, + "step": 165580 + }, + { + "epoch": 23.50461320085167, + "grad_norm": 0.49383804202079773, + "learning_rate": 7.650518097941803e-05, + "loss": 0.0041070360690355304, + "step": 165590 + }, + { + "epoch": 23.506032647267567, + "grad_norm": 0.011708893813192844, + "learning_rate": 7.650376153300213e-05, + "loss": 0.004030989855527878, + "step": 165600 + }, + { + "epoch": 23.50745209368346, + "grad_norm": 0.4135341942310333, + "learning_rate": 7.650234208658624e-05, + "loss": 0.021747976541519165, + "step": 165610 + }, + { + "epoch": 23.50887154009936, + "grad_norm": 0.05635320022702217, + "learning_rate": 7.650092264017034e-05, + "loss": 0.00957869291305542, + "step": 165620 + }, + { + "epoch": 23.51029098651526, + "grad_norm": 8.286080360412598, + "learning_rate": 7.649950319375445e-05, + "loss": 0.04734171628952026, + "step": 165630 + }, + { + "epoch": 23.511710432931157, + "grad_norm": 0.4656079113483429, + "learning_rate": 7.649808374733854e-05, + "loss": 0.028931576013565063, + "step": 165640 + }, + { + "epoch": 23.513129879347055, + "grad_norm": 0.12943938374519348, + "learning_rate": 7.649666430092265e-05, + "loss": 0.017710545659065248, + "step": 165650 + }, + { + "epoch": 23.514549325762953, + "grad_norm": 10.261246681213379, + "learning_rate": 7.649524485450674e-05, + "loss": 0.02387801855802536, + "step": 165660 + }, + { + "epoch": 23.51596877217885, + "grad_norm": 0.0336395688354969, + "learning_rate": 7.649382540809085e-05, + "loss": 0.04128268361091614, + "step": 165670 + }, + { + "epoch": 23.517388218594746, + "grad_norm": 
0.8177370429039001, + "learning_rate": 7.649240596167495e-05, + "loss": 0.03893501758575439, + "step": 165680 + }, + { + "epoch": 23.518807665010645, + "grad_norm": 0.046147655695676804, + "learning_rate": 7.649098651525906e-05, + "loss": 0.0054889082908630375, + "step": 165690 + }, + { + "epoch": 23.520227111426543, + "grad_norm": 0.01702147163450718, + "learning_rate": 7.648956706884316e-05, + "loss": 0.004857656732201576, + "step": 165700 + }, + { + "epoch": 23.52164655784244, + "grad_norm": 1.578279733657837, + "learning_rate": 7.648814762242725e-05, + "loss": 0.01722201704978943, + "step": 165710 + }, + { + "epoch": 23.52306600425834, + "grad_norm": 0.15221214294433594, + "learning_rate": 7.648672817601136e-05, + "loss": 0.002542218193411827, + "step": 165720 + }, + { + "epoch": 23.524485450674238, + "grad_norm": 0.11896581202745438, + "learning_rate": 7.648530872959546e-05, + "loss": 0.016068589687347413, + "step": 165730 + }, + { + "epoch": 23.525904897090136, + "grad_norm": 0.18789708614349365, + "learning_rate": 7.648388928317957e-05, + "loss": 0.00824701189994812, + "step": 165740 + }, + { + "epoch": 23.52732434350603, + "grad_norm": 0.0275257620960474, + "learning_rate": 7.648246983676367e-05, + "loss": 0.07363538146018982, + "step": 165750 + }, + { + "epoch": 23.52874378992193, + "grad_norm": 2.3536603450775146, + "learning_rate": 7.648105039034777e-05, + "loss": 0.012419167906045914, + "step": 165760 + }, + { + "epoch": 23.530163236337827, + "grad_norm": 0.023220986127853394, + "learning_rate": 7.647963094393186e-05, + "loss": 0.054289400577545166, + "step": 165770 + }, + { + "epoch": 23.531582682753726, + "grad_norm": 0.02333774045109749, + "learning_rate": 7.647821149751597e-05, + "loss": 0.026269999146461488, + "step": 165780 + }, + { + "epoch": 23.533002129169624, + "grad_norm": 0.07132261246442795, + "learning_rate": 7.647679205110007e-05, + "loss": 0.012915019690990449, + "step": 165790 + }, + { + "epoch": 23.534421575585522, + "grad_norm": 
0.10892273485660553, + "learning_rate": 7.647537260468418e-05, + "loss": 0.04277798235416412, + "step": 165800 + }, + { + "epoch": 23.53584102200142, + "grad_norm": 0.04926268383860588, + "learning_rate": 7.647395315826828e-05, + "loss": 0.0077508240938186646, + "step": 165810 + }, + { + "epoch": 23.537260468417315, + "grad_norm": 0.9743296504020691, + "learning_rate": 7.647253371185238e-05, + "loss": 0.004492961242794991, + "step": 165820 + }, + { + "epoch": 23.538679914833214, + "grad_norm": 0.09544076025485992, + "learning_rate": 7.647111426543649e-05, + "loss": 0.01040782779455185, + "step": 165830 + }, + { + "epoch": 23.540099361249112, + "grad_norm": 1.8101491928100586, + "learning_rate": 7.646969481902059e-05, + "loss": 0.020885756611824034, + "step": 165840 + }, + { + "epoch": 23.54151880766501, + "grad_norm": 0.16607189178466797, + "learning_rate": 7.64682753726047e-05, + "loss": 0.02012229859828949, + "step": 165850 + }, + { + "epoch": 23.54293825408091, + "grad_norm": 1.280266284942627, + "learning_rate": 7.646685592618878e-05, + "loss": 0.01627992242574692, + "step": 165860 + }, + { + "epoch": 23.544357700496807, + "grad_norm": 0.40139979124069214, + "learning_rate": 7.646543647977289e-05, + "loss": 0.00830095037817955, + "step": 165870 + }, + { + "epoch": 23.545777146912705, + "grad_norm": 0.8383504152297974, + "learning_rate": 7.646401703335699e-05, + "loss": 0.016970571875572205, + "step": 165880 + }, + { + "epoch": 23.5471965933286, + "grad_norm": 0.050403181463479996, + "learning_rate": 7.64625975869411e-05, + "loss": 0.004748716205358505, + "step": 165890 + }, + { + "epoch": 23.5486160397445, + "grad_norm": 0.07861621677875519, + "learning_rate": 7.64611781405252e-05, + "loss": 0.02165157496929169, + "step": 165900 + }, + { + "epoch": 23.550035486160397, + "grad_norm": 0.8995758891105652, + "learning_rate": 7.64597586941093e-05, + "loss": 0.020719289779663086, + "step": 165910 + }, + { + "epoch": 23.551454932576295, + "grad_norm": 
1.4618560075759888, + "learning_rate": 7.64583392476934e-05, + "loss": 0.015445740520954132, + "step": 165920 + }, + { + "epoch": 23.552874378992193, + "grad_norm": 0.438198983669281, + "learning_rate": 7.64569198012775e-05, + "loss": 0.013957774639129639, + "step": 165930 + }, + { + "epoch": 23.55429382540809, + "grad_norm": 2.3302628993988037, + "learning_rate": 7.645550035486161e-05, + "loss": 0.014175288379192352, + "step": 165940 + }, + { + "epoch": 23.55571327182399, + "grad_norm": 0.32241666316986084, + "learning_rate": 7.645408090844571e-05, + "loss": 0.02702936828136444, + "step": 165950 + }, + { + "epoch": 23.557132718239885, + "grad_norm": 0.03670497611165047, + "learning_rate": 7.645266146202981e-05, + "loss": 0.0038024861365556716, + "step": 165960 + }, + { + "epoch": 23.558552164655783, + "grad_norm": 1.0309785604476929, + "learning_rate": 7.64512420156139e-05, + "loss": 0.02002967894077301, + "step": 165970 + }, + { + "epoch": 23.55997161107168, + "grad_norm": 13.818063735961914, + "learning_rate": 7.644982256919802e-05, + "loss": 0.03794377446174622, + "step": 165980 + }, + { + "epoch": 23.56139105748758, + "grad_norm": 0.17573407292366028, + "learning_rate": 7.644840312278211e-05, + "loss": 0.052223318815231325, + "step": 165990 + }, + { + "epoch": 23.562810503903478, + "grad_norm": 0.735020101070404, + "learning_rate": 7.644698367636623e-05, + "loss": 0.0120499387383461, + "step": 166000 + }, + { + "epoch": 23.562810503903478, + "eval_accuracy": 0.986837922044891, + "eval_loss": 0.04923011735081673, + "eval_runtime": 32.9408, + "eval_samples_per_second": 477.432, + "eval_steps_per_second": 14.936, + "step": 166000 + }, + { + "epoch": 23.564229950319376, + "grad_norm": 0.0702325776219368, + "learning_rate": 7.644556422995032e-05, + "loss": 0.0042747646570205685, + "step": 166010 + }, + { + "epoch": 23.565649396735274, + "grad_norm": 2.7291347980499268, + "learning_rate": 7.644414478353442e-05, + "loss": 0.013952228426933288, + "step": 166020 + }, + 
{ + "epoch": 23.56706884315117, + "grad_norm": 0.2719586193561554, + "learning_rate": 7.644272533711853e-05, + "loss": 0.003962843492627144, + "step": 166030 + }, + { + "epoch": 23.568488289567068, + "grad_norm": 1.523728609085083, + "learning_rate": 7.644130589070263e-05, + "loss": 0.027393418550491332, + "step": 166040 + }, + { + "epoch": 23.569907735982966, + "grad_norm": 11.334314346313477, + "learning_rate": 7.643988644428674e-05, + "loss": 0.016920295357704163, + "step": 166050 + }, + { + "epoch": 23.571327182398864, + "grad_norm": 2.312696695327759, + "learning_rate": 7.643846699787084e-05, + "loss": 0.045561695098876955, + "step": 166060 + }, + { + "epoch": 23.572746628814762, + "grad_norm": 0.5361663699150085, + "learning_rate": 7.643704755145493e-05, + "loss": 0.015583740174770355, + "step": 166070 + }, + { + "epoch": 23.57416607523066, + "grad_norm": 11.523843765258789, + "learning_rate": 7.643562810503903e-05, + "loss": 0.026504126191139222, + "step": 166080 + }, + { + "epoch": 23.57558552164656, + "grad_norm": 10.987065315246582, + "learning_rate": 7.643420865862314e-05, + "loss": 0.0330510288476944, + "step": 166090 + }, + { + "epoch": 23.577004968062454, + "grad_norm": 0.049131739884614944, + "learning_rate": 7.643278921220724e-05, + "loss": 0.05242431163787842, + "step": 166100 + }, + { + "epoch": 23.578424414478352, + "grad_norm": 3.2354979515075684, + "learning_rate": 7.643136976579135e-05, + "loss": 0.019730955362319946, + "step": 166110 + }, + { + "epoch": 23.57984386089425, + "grad_norm": 0.09812629222869873, + "learning_rate": 7.642995031937545e-05, + "loss": 0.0051271352916955944, + "step": 166120 + }, + { + "epoch": 23.58126330731015, + "grad_norm": 9.233469009399414, + "learning_rate": 7.642853087295955e-05, + "loss": 0.0144666388630867, + "step": 166130 + }, + { + "epoch": 23.582682753726047, + "grad_norm": 0.015829313546419144, + "learning_rate": 7.642711142654366e-05, + "loss": 0.02698013186454773, + "step": 166140 + }, + { + "epoch": 
23.584102200141945, + "grad_norm": 3.8716418743133545, + "learning_rate": 7.642569198012775e-05, + "loss": 0.024173861742019652, + "step": 166150 + }, + { + "epoch": 23.585521646557844, + "grad_norm": 13.690027236938477, + "learning_rate": 7.642427253371186e-05, + "loss": 0.008826808631420135, + "step": 166160 + }, + { + "epoch": 23.58694109297374, + "grad_norm": 3.2683138847351074, + "learning_rate": 7.642285308729595e-05, + "loss": 0.027902087569236754, + "step": 166170 + }, + { + "epoch": 23.588360539389637, + "grad_norm": 0.06893163919448853, + "learning_rate": 7.642143364088006e-05, + "loss": 0.012665122747421265, + "step": 166180 + }, + { + "epoch": 23.589779985805535, + "grad_norm": 0.07805333286523819, + "learning_rate": 7.642001419446416e-05, + "loss": 0.004722482338547706, + "step": 166190 + }, + { + "epoch": 23.591199432221433, + "grad_norm": 0.2086969017982483, + "learning_rate": 7.641859474804827e-05, + "loss": 0.03157641887664795, + "step": 166200 + }, + { + "epoch": 23.59261887863733, + "grad_norm": 0.018695466220378876, + "learning_rate": 7.641717530163237e-05, + "loss": 0.0029503095895051954, + "step": 166210 + }, + { + "epoch": 23.59403832505323, + "grad_norm": 0.5905424356460571, + "learning_rate": 7.641575585521646e-05, + "loss": 0.008354425430297852, + "step": 166220 + }, + { + "epoch": 23.59545777146913, + "grad_norm": 8.275030136108398, + "learning_rate": 7.641433640880057e-05, + "loss": 0.015690505504608154, + "step": 166230 + }, + { + "epoch": 23.596877217885023, + "grad_norm": 0.13027335703372955, + "learning_rate": 7.641291696238467e-05, + "loss": 0.017104052007198334, + "step": 166240 + }, + { + "epoch": 23.59829666430092, + "grad_norm": 0.272606760263443, + "learning_rate": 7.641149751596878e-05, + "loss": 0.02595866918563843, + "step": 166250 + }, + { + "epoch": 23.59971611071682, + "grad_norm": 0.022492557764053345, + "learning_rate": 7.641007806955288e-05, + "loss": 0.029835450649261474, + "step": 166260 + }, + { + "epoch": 
23.601135557132718, + "grad_norm": 0.10061958432197571, + "learning_rate": 7.640865862313698e-05, + "loss": 0.0011877808719873428, + "step": 166270 + }, + { + "epoch": 23.602555003548616, + "grad_norm": 0.9486027956008911, + "learning_rate": 7.640723917672107e-05, + "loss": 0.015597468614578247, + "step": 166280 + }, + { + "epoch": 23.603974449964515, + "grad_norm": 0.033375877887010574, + "learning_rate": 7.640581973030518e-05, + "loss": 0.009247278422117233, + "step": 166290 + }, + { + "epoch": 23.605393896380413, + "grad_norm": 0.09446623176336288, + "learning_rate": 7.640440028388928e-05, + "loss": 0.020402249693870545, + "step": 166300 + }, + { + "epoch": 23.606813342796308, + "grad_norm": 6.851724147796631, + "learning_rate": 7.640298083747339e-05, + "loss": 0.011106319725513458, + "step": 166310 + }, + { + "epoch": 23.608232789212206, + "grad_norm": 0.12238011509180069, + "learning_rate": 7.640156139105749e-05, + "loss": 0.015091103315353394, + "step": 166320 + }, + { + "epoch": 23.609652235628104, + "grad_norm": 14.826149940490723, + "learning_rate": 7.640014194464159e-05, + "loss": 0.033084309101104735, + "step": 166330 + }, + { + "epoch": 23.611071682044003, + "grad_norm": 1.0483496189117432, + "learning_rate": 7.63987224982257e-05, + "loss": 0.016786381602287292, + "step": 166340 + }, + { + "epoch": 23.6124911284599, + "grad_norm": 1.5369335412979126, + "learning_rate": 7.63973030518098e-05, + "loss": 0.015024296939373016, + "step": 166350 + }, + { + "epoch": 23.6139105748758, + "grad_norm": 0.015181516297161579, + "learning_rate": 7.639588360539391e-05, + "loss": 0.002642759308218956, + "step": 166360 + }, + { + "epoch": 23.615330021291697, + "grad_norm": 5.026410102844238, + "learning_rate": 7.6394464158978e-05, + "loss": 0.011071635037660598, + "step": 166370 + }, + { + "epoch": 23.616749467707596, + "grad_norm": 0.064101941883564, + "learning_rate": 7.63930447125621e-05, + "loss": 0.012611904740333557, + "step": 166380 + }, + { + "epoch": 
23.61816891412349, + "grad_norm": 0.04930287227034569, + "learning_rate": 7.63916252661462e-05, + "loss": 0.0034982305020093917, + "step": 166390 + }, + { + "epoch": 23.61958836053939, + "grad_norm": 1.126294493675232, + "learning_rate": 7.639020581973031e-05, + "loss": 0.005223702639341354, + "step": 166400 + }, + { + "epoch": 23.621007806955287, + "grad_norm": 0.27248871326446533, + "learning_rate": 7.638878637331441e-05, + "loss": 0.0010296527296304702, + "step": 166410 + }, + { + "epoch": 23.622427253371185, + "grad_norm": 0.3924824893474579, + "learning_rate": 7.638736692689852e-05, + "loss": 0.029463472962379455, + "step": 166420 + }, + { + "epoch": 23.623846699787084, + "grad_norm": 0.041560400277376175, + "learning_rate": 7.638594748048262e-05, + "loss": 0.0017043210566043854, + "step": 166430 + }, + { + "epoch": 23.625266146202982, + "grad_norm": 0.039127908647060394, + "learning_rate": 7.638452803406671e-05, + "loss": 0.030438715219497682, + "step": 166440 + }, + { + "epoch": 23.62668559261888, + "grad_norm": 0.0061663053929805756, + "learning_rate": 7.638310858765082e-05, + "loss": 0.0035256098955869673, + "step": 166450 + }, + { + "epoch": 23.628105039034775, + "grad_norm": 0.06858544796705246, + "learning_rate": 7.638168914123492e-05, + "loss": 0.058404940366745, + "step": 166460 + }, + { + "epoch": 23.629524485450673, + "grad_norm": 0.07751541584730148, + "learning_rate": 7.638026969481903e-05, + "loss": 0.0133286714553833, + "step": 166470 + }, + { + "epoch": 23.63094393186657, + "grad_norm": 15.264079093933105, + "learning_rate": 7.637885024840312e-05, + "loss": 0.01910979151725769, + "step": 166480 + }, + { + "epoch": 23.63236337828247, + "grad_norm": 0.16509348154067993, + "learning_rate": 7.637743080198723e-05, + "loss": 0.013694091141223908, + "step": 166490 + }, + { + "epoch": 23.63378282469837, + "grad_norm": 0.033642951399087906, + "learning_rate": 7.637601135557132e-05, + "loss": 0.037055274844169615, + "step": 166500 + }, + { + "epoch": 
23.63378282469837, + "eval_accuracy": 0.9895720735041648, + "eval_loss": 0.037867508828639984, + "eval_runtime": 34.0498, + "eval_samples_per_second": 461.882, + "eval_steps_per_second": 14.449, + "step": 166500 + }, + { + "epoch": 23.635202271114267, + "grad_norm": 7.1324968338012695, + "learning_rate": 7.637459190915544e-05, + "loss": 0.049308350682258605, + "step": 166510 + }, + { + "epoch": 23.636621717530165, + "grad_norm": 0.03354381397366524, + "learning_rate": 7.637317246273953e-05, + "loss": 0.025635886192321777, + "step": 166520 + }, + { + "epoch": 23.63804116394606, + "grad_norm": 2.280115842819214, + "learning_rate": 7.637175301632363e-05, + "loss": 0.007556168735027314, + "step": 166530 + }, + { + "epoch": 23.639460610361958, + "grad_norm": 0.06778790801763535, + "learning_rate": 7.637033356990774e-05, + "loss": 0.0053503777831792835, + "step": 166540 + }, + { + "epoch": 23.640880056777856, + "grad_norm": 0.6515158414840698, + "learning_rate": 7.636891412349184e-05, + "loss": 0.017369255423545837, + "step": 166550 + }, + { + "epoch": 23.642299503193755, + "grad_norm": 2.2935478687286377, + "learning_rate": 7.636749467707595e-05, + "loss": 0.00392686128616333, + "step": 166560 + }, + { + "epoch": 23.643718949609653, + "grad_norm": 1.205484390258789, + "learning_rate": 7.636607523066005e-05, + "loss": 0.010910369455814362, + "step": 166570 + }, + { + "epoch": 23.64513839602555, + "grad_norm": 0.2494107037782669, + "learning_rate": 7.636465578424414e-05, + "loss": 0.015629078447818755, + "step": 166580 + }, + { + "epoch": 23.64655784244145, + "grad_norm": 10.63766098022461, + "learning_rate": 7.636323633782824e-05, + "loss": 0.07438828945159912, + "step": 166590 + }, + { + "epoch": 23.647977288857344, + "grad_norm": 0.880093514919281, + "learning_rate": 7.636181689141235e-05, + "loss": 0.0035771694034337997, + "step": 166600 + }, + { + "epoch": 23.649396735273243, + "grad_norm": 0.05285324156284332, + "learning_rate": 7.636039744499645e-05, + "loss": 
0.005258613452315331, + "step": 166610 + }, + { + "epoch": 23.65081618168914, + "grad_norm": 1.0355547666549683, + "learning_rate": 7.635897799858056e-05, + "loss": 0.005540279299020767, + "step": 166620 + }, + { + "epoch": 23.65223562810504, + "grad_norm": 0.799923837184906, + "learning_rate": 7.635755855216466e-05, + "loss": 0.010623343288898468, + "step": 166630 + }, + { + "epoch": 23.653655074520938, + "grad_norm": 3.677203416824341, + "learning_rate": 7.635613910574876e-05, + "loss": 0.016145065426826477, + "step": 166640 + }, + { + "epoch": 23.655074520936836, + "grad_norm": 0.09950985759496689, + "learning_rate": 7.635471965933287e-05, + "loss": 0.026810050010681152, + "step": 166650 + }, + { + "epoch": 23.656493967352734, + "grad_norm": 0.04555220529437065, + "learning_rate": 7.635330021291696e-05, + "loss": 0.02183019518852234, + "step": 166660 + }, + { + "epoch": 23.65791341376863, + "grad_norm": 0.015314202755689621, + "learning_rate": 7.635188076650107e-05, + "loss": 0.008420588076114654, + "step": 166670 + }, + { + "epoch": 23.659332860184527, + "grad_norm": 0.021852195262908936, + "learning_rate": 7.635046132008517e-05, + "loss": 0.0130954310297966, + "step": 166680 + }, + { + "epoch": 23.660752306600425, + "grad_norm": 0.2052682787179947, + "learning_rate": 7.634904187366927e-05, + "loss": 0.004480284452438354, + "step": 166690 + }, + { + "epoch": 23.662171753016324, + "grad_norm": 0.04706482216715813, + "learning_rate": 7.634762242725337e-05, + "loss": 0.028238168358802794, + "step": 166700 + }, + { + "epoch": 23.663591199432222, + "grad_norm": 0.07862462103366852, + "learning_rate": 7.634620298083748e-05, + "loss": 0.003253858909010887, + "step": 166710 + }, + { + "epoch": 23.66501064584812, + "grad_norm": 0.030095521360635757, + "learning_rate": 7.634478353442159e-05, + "loss": 0.0016552004963159561, + "step": 166720 + }, + { + "epoch": 23.66643009226402, + "grad_norm": 2.3299341201782227, + "learning_rate": 7.634336408800569e-05, + "loss": 
0.005684570223093033, + "step": 166730 + }, + { + "epoch": 23.667849538679913, + "grad_norm": 0.1388455033302307, + "learning_rate": 7.634194464158978e-05, + "loss": 0.013259288668632508, + "step": 166740 + }, + { + "epoch": 23.669268985095812, + "grad_norm": 0.0629689171910286, + "learning_rate": 7.634052519517388e-05, + "loss": 0.009448021650314331, + "step": 166750 + }, + { + "epoch": 23.67068843151171, + "grad_norm": 0.0377984382212162, + "learning_rate": 7.633910574875799e-05, + "loss": 0.009501864016056061, + "step": 166760 + }, + { + "epoch": 23.67210787792761, + "grad_norm": 4.690420150756836, + "learning_rate": 7.633768630234209e-05, + "loss": 0.025394073128700255, + "step": 166770 + }, + { + "epoch": 23.673527324343507, + "grad_norm": 0.20219728350639343, + "learning_rate": 7.63362668559262e-05, + "loss": 0.0015072576701641084, + "step": 166780 + }, + { + "epoch": 23.674946770759405, + "grad_norm": 3.1388227939605713, + "learning_rate": 7.633484740951028e-05, + "loss": 0.010977405309677123, + "step": 166790 + }, + { + "epoch": 23.676366217175303, + "grad_norm": 0.02655540592968464, + "learning_rate": 7.63334279630944e-05, + "loss": 0.004023692011833191, + "step": 166800 + }, + { + "epoch": 23.677785663591198, + "grad_norm": 0.46813061833381653, + "learning_rate": 7.63320085166785e-05, + "loss": 0.021081903576850893, + "step": 166810 + }, + { + "epoch": 23.679205110007096, + "grad_norm": 0.037993814796209335, + "learning_rate": 7.63305890702626e-05, + "loss": 0.02901386022567749, + "step": 166820 + }, + { + "epoch": 23.680624556422995, + "grad_norm": 0.12297865748405457, + "learning_rate": 7.632916962384671e-05, + "loss": 0.012157149612903595, + "step": 166830 + }, + { + "epoch": 23.682044002838893, + "grad_norm": 0.019008146598935127, + "learning_rate": 7.63277501774308e-05, + "loss": 0.038142696022987366, + "step": 166840 + }, + { + "epoch": 23.68346344925479, + "grad_norm": 4.955599308013916, + "learning_rate": 7.632633073101491e-05, + "loss": 
0.021129383146762847, + "step": 166850 + }, + { + "epoch": 23.68488289567069, + "grad_norm": 0.03496038168668747, + "learning_rate": 7.6324911284599e-05, + "loss": 0.0022817034274339675, + "step": 166860 + }, + { + "epoch": 23.686302342086588, + "grad_norm": 6.744307041168213, + "learning_rate": 7.632349183818312e-05, + "loss": 0.07207911014556885, + "step": 166870 + }, + { + "epoch": 23.687721788502483, + "grad_norm": 7.5425238609313965, + "learning_rate": 7.632207239176721e-05, + "loss": 0.01375012993812561, + "step": 166880 + }, + { + "epoch": 23.68914123491838, + "grad_norm": 1.7219442129135132, + "learning_rate": 7.632065294535131e-05, + "loss": 0.03813965320587158, + "step": 166890 + }, + { + "epoch": 23.69056068133428, + "grad_norm": 0.7880526781082153, + "learning_rate": 7.631923349893542e-05, + "loss": 0.004120131582021713, + "step": 166900 + }, + { + "epoch": 23.691980127750178, + "grad_norm": 0.026899948716163635, + "learning_rate": 7.631781405251952e-05, + "loss": 0.04748384356498718, + "step": 166910 + }, + { + "epoch": 23.693399574166076, + "grad_norm": 3.0281615257263184, + "learning_rate": 7.631639460610363e-05, + "loss": 0.03217891454696655, + "step": 166920 + }, + { + "epoch": 23.694819020581974, + "grad_norm": 12.738275527954102, + "learning_rate": 7.631497515968773e-05, + "loss": 0.016994521021842957, + "step": 166930 + }, + { + "epoch": 23.696238466997873, + "grad_norm": 0.7541211247444153, + "learning_rate": 7.631355571327183e-05, + "loss": 0.012231212854385377, + "step": 166940 + }, + { + "epoch": 23.697657913413767, + "grad_norm": 2.5239059925079346, + "learning_rate": 7.631213626685592e-05, + "loss": 0.03071235418319702, + "step": 166950 + }, + { + "epoch": 23.699077359829666, + "grad_norm": 0.8639270067214966, + "learning_rate": 7.631071682044003e-05, + "loss": 0.010529343783855439, + "step": 166960 + }, + { + "epoch": 23.700496806245564, + "grad_norm": 1.2921233177185059, + "learning_rate": 7.630929737402413e-05, + "loss": 
0.027306774258613588, + "step": 166970 + }, + { + "epoch": 23.701916252661462, + "grad_norm": 0.004106923472136259, + "learning_rate": 7.630787792760824e-05, + "loss": 0.052036821842193604, + "step": 166980 + }, + { + "epoch": 23.70333569907736, + "grad_norm": 0.024405891075730324, + "learning_rate": 7.630645848119234e-05, + "loss": 0.0068145602941513065, + "step": 166990 + }, + { + "epoch": 23.70475514549326, + "grad_norm": 5.151251792907715, + "learning_rate": 7.630503903477644e-05, + "loss": 0.010167023539543152, + "step": 167000 + }, + { + "epoch": 23.70475514549326, + "eval_accuracy": 0.9877281108920964, + "eval_loss": 0.046104803681373596, + "eval_runtime": 33.0325, + "eval_samples_per_second": 476.106, + "eval_steps_per_second": 14.894, + "step": 167000 + }, + { + "epoch": 23.706174591909157, + "grad_norm": 0.1469411998987198, + "learning_rate": 7.630361958836055e-05, + "loss": 0.013717851042747498, + "step": 167010 + }, + { + "epoch": 23.707594038325052, + "grad_norm": 0.22842828929424286, + "learning_rate": 7.630220014194465e-05, + "loss": 0.01649213582277298, + "step": 167020 + }, + { + "epoch": 23.70901348474095, + "grad_norm": 0.46063879132270813, + "learning_rate": 7.630078069552876e-05, + "loss": 0.0019221577793359756, + "step": 167030 + }, + { + "epoch": 23.71043293115685, + "grad_norm": 13.11252498626709, + "learning_rate": 7.629936124911285e-05, + "loss": 0.027537989616394042, + "step": 167040 + }, + { + "epoch": 23.711852377572747, + "grad_norm": 0.061158690601587296, + "learning_rate": 7.629794180269695e-05, + "loss": 0.03937384486198425, + "step": 167050 + }, + { + "epoch": 23.713271823988645, + "grad_norm": 0.10850509256124496, + "learning_rate": 7.629652235628105e-05, + "loss": 0.0050700224936008455, + "step": 167060 + }, + { + "epoch": 23.714691270404543, + "grad_norm": 0.31153807044029236, + "learning_rate": 7.629510290986516e-05, + "loss": 0.0573622465133667, + "step": 167070 + }, + { + "epoch": 23.71611071682044, + "grad_norm": 
6.256492614746094, + "learning_rate": 7.629368346344926e-05, + "loss": 0.006951691210269928, + "step": 167080 + }, + { + "epoch": 23.717530163236336, + "grad_norm": 0.029116230085492134, + "learning_rate": 7.629226401703337e-05, + "loss": 0.011427338421344756, + "step": 167090 + }, + { + "epoch": 23.718949609652235, + "grad_norm": 0.02269347943365574, + "learning_rate": 7.629084457061747e-05, + "loss": 0.034312930703163144, + "step": 167100 + }, + { + "epoch": 23.720369056068133, + "grad_norm": 0.014398652128875256, + "learning_rate": 7.628942512420156e-05, + "loss": 0.01937624216079712, + "step": 167110 + }, + { + "epoch": 23.72178850248403, + "grad_norm": 6.361565113067627, + "learning_rate": 7.628800567778567e-05, + "loss": 0.010191404819488525, + "step": 167120 + }, + { + "epoch": 23.72320794889993, + "grad_norm": 0.8844833970069885, + "learning_rate": 7.628658623136977e-05, + "loss": 0.011906987428665161, + "step": 167130 + }, + { + "epoch": 23.724627395315828, + "grad_norm": 0.357957661151886, + "learning_rate": 7.628516678495388e-05, + "loss": 0.019338434934616087, + "step": 167140 + }, + { + "epoch": 23.726046841731726, + "grad_norm": 1.3925724029541016, + "learning_rate": 7.628374733853797e-05, + "loss": 0.004397198557853699, + "step": 167150 + }, + { + "epoch": 23.72746628814762, + "grad_norm": 0.4993216395378113, + "learning_rate": 7.628232789212208e-05, + "loss": 0.009803399443626404, + "step": 167160 + }, + { + "epoch": 23.72888573456352, + "grad_norm": 0.03436373174190521, + "learning_rate": 7.628090844570617e-05, + "loss": 0.015501382946968078, + "step": 167170 + }, + { + "epoch": 23.730305180979418, + "grad_norm": 4.2031683921813965, + "learning_rate": 7.627948899929028e-05, + "loss": 0.008236441761255264, + "step": 167180 + }, + { + "epoch": 23.731724627395316, + "grad_norm": 0.025608619675040245, + "learning_rate": 7.627806955287438e-05, + "loss": 0.05179571509361267, + "step": 167190 + }, + { + "epoch": 23.733144073811214, + "grad_norm": 
0.7403807640075684, + "learning_rate": 7.627665010645848e-05, + "loss": 0.005863047763705253, + "step": 167200 + }, + { + "epoch": 23.734563520227113, + "grad_norm": 0.028073446825146675, + "learning_rate": 7.627523066004259e-05, + "loss": 0.012324770539999008, + "step": 167210 + }, + { + "epoch": 23.73598296664301, + "grad_norm": 16.898849487304688, + "learning_rate": 7.627381121362669e-05, + "loss": 0.02803591787815094, + "step": 167220 + }, + { + "epoch": 23.737402413058906, + "grad_norm": 2.5605363845825195, + "learning_rate": 7.62723917672108e-05, + "loss": 0.019580677151679993, + "step": 167230 + }, + { + "epoch": 23.738821859474804, + "grad_norm": 0.5643990635871887, + "learning_rate": 7.62709723207949e-05, + "loss": 0.01783321499824524, + "step": 167240 + }, + { + "epoch": 23.740241305890702, + "grad_norm": 0.10288077592849731, + "learning_rate": 7.6269552874379e-05, + "loss": 0.03179287016391754, + "step": 167250 + }, + { + "epoch": 23.7416607523066, + "grad_norm": 2.232056140899658, + "learning_rate": 7.626813342796309e-05, + "loss": 0.03927572369575501, + "step": 167260 + }, + { + "epoch": 23.7430801987225, + "grad_norm": 0.5587962865829468, + "learning_rate": 7.62667139815472e-05, + "loss": 0.032523933053016665, + "step": 167270 + }, + { + "epoch": 23.744499645138397, + "grad_norm": 10.553679466247559, + "learning_rate": 7.62652945351313e-05, + "loss": 0.026016277074813843, + "step": 167280 + }, + { + "epoch": 23.745919091554295, + "grad_norm": 0.2931702733039856, + "learning_rate": 7.626387508871541e-05, + "loss": 0.03787906467914581, + "step": 167290 + }, + { + "epoch": 23.74733853797019, + "grad_norm": 0.02630489505827427, + "learning_rate": 7.626245564229951e-05, + "loss": 0.04123930037021637, + "step": 167300 + }, + { + "epoch": 23.74875798438609, + "grad_norm": 1.4487509727478027, + "learning_rate": 7.62610361958836e-05, + "loss": 0.009749533236026764, + "step": 167310 + }, + { + "epoch": 23.750177430801987, + "grad_norm": 0.02529893070459366, + 
"learning_rate": 7.625961674946772e-05, + "loss": 0.00634746253490448, + "step": 167320 + }, + { + "epoch": 23.751596877217885, + "grad_norm": 0.24350571632385254, + "learning_rate": 7.625819730305181e-05, + "loss": 0.016427995264530183, + "step": 167330 + }, + { + "epoch": 23.753016323633783, + "grad_norm": 0.41244062781333923, + "learning_rate": 7.625677785663592e-05, + "loss": 0.002660691738128662, + "step": 167340 + }, + { + "epoch": 23.75443577004968, + "grad_norm": 5.875321865081787, + "learning_rate": 7.625535841022001e-05, + "loss": 0.026051869988441466, + "step": 167350 + }, + { + "epoch": 23.75585521646558, + "grad_norm": 0.10327660292387009, + "learning_rate": 7.62540809084457e-05, + "loss": 0.0633675992488861, + "step": 167360 + }, + { + "epoch": 23.757274662881475, + "grad_norm": 12.32266902923584, + "learning_rate": 7.625266146202982e-05, + "loss": 0.024318933486938477, + "step": 167370 + }, + { + "epoch": 23.758694109297373, + "grad_norm": 0.13631390035152435, + "learning_rate": 7.62513839602555e-05, + "loss": 0.05115037560462952, + "step": 167380 + }, + { + "epoch": 23.76011355571327, + "grad_norm": 0.5809052586555481, + "learning_rate": 7.624996451383961e-05, + "loss": 0.02293548882007599, + "step": 167390 + }, + { + "epoch": 23.76153300212917, + "grad_norm": 0.4039802551269531, + "learning_rate": 7.624854506742371e-05, + "loss": 0.004499823600053787, + "step": 167400 + }, + { + "epoch": 23.762952448545068, + "grad_norm": 8.738982200622559, + "learning_rate": 7.624712562100781e-05, + "loss": 0.010184122622013092, + "step": 167410 + }, + { + "epoch": 23.764371894960966, + "grad_norm": 0.01911412738263607, + "learning_rate": 7.624570617459192e-05, + "loss": 0.046522825956344604, + "step": 167420 + }, + { + "epoch": 23.765791341376865, + "grad_norm": 5.261871337890625, + "learning_rate": 7.624428672817602e-05, + "loss": 0.029325959086418153, + "step": 167430 + }, + { + "epoch": 23.76721078779276, + "grad_norm": 1.7784777879714966, + "learning_rate": 
7.624286728176013e-05, + "loss": 0.0034421849995851517, + "step": 167440 + }, + { + "epoch": 23.768630234208658, + "grad_norm": 1.789939284324646, + "learning_rate": 7.624144783534421e-05, + "loss": 0.005495503172278404, + "step": 167450 + }, + { + "epoch": 23.770049680624556, + "grad_norm": 0.14589178562164307, + "learning_rate": 7.624002838892832e-05, + "loss": 0.005021782964468003, + "step": 167460 + }, + { + "epoch": 23.771469127040454, + "grad_norm": 0.15213048458099365, + "learning_rate": 7.623860894251242e-05, + "loss": 0.006531879305839539, + "step": 167470 + }, + { + "epoch": 23.772888573456353, + "grad_norm": 1.3282796144485474, + "learning_rate": 7.623718949609653e-05, + "loss": 0.021813642978668214, + "step": 167480 + }, + { + "epoch": 23.77430801987225, + "grad_norm": 0.2587389051914215, + "learning_rate": 7.623577004968063e-05, + "loss": 0.007964324951171876, + "step": 167490 + }, + { + "epoch": 23.77572746628815, + "grad_norm": 0.0209824126213789, + "learning_rate": 7.623435060326473e-05, + "loss": 0.014364463090896607, + "step": 167500 + }, + { + "epoch": 23.77572746628815, + "eval_accuracy": 0.9858205633623705, + "eval_loss": 0.051902394741773605, + "eval_runtime": 34.0754, + "eval_samples_per_second": 461.535, + "eval_steps_per_second": 14.439, + "step": 167500 + }, + { + "epoch": 23.777146912704044, + "grad_norm": 0.03589203581213951, + "learning_rate": 7.623293115684884e-05, + "loss": 0.007034775614738464, + "step": 167510 + }, + { + "epoch": 23.778566359119942, + "grad_norm": 2.8829751014709473, + "learning_rate": 7.623151171043293e-05, + "loss": 0.010953574627637862, + "step": 167520 + }, + { + "epoch": 23.77998580553584, + "grad_norm": 0.056974805891513824, + "learning_rate": 7.623009226401704e-05, + "loss": 0.009771785140037537, + "step": 167530 + }, + { + "epoch": 23.78140525195174, + "grad_norm": 1.721590280532837, + "learning_rate": 7.622867281760114e-05, + "loss": 0.0034759048372507094, + "step": 167540 + }, + { + "epoch": 
23.782824698367637, + "grad_norm": 3.1928303241729736, + "learning_rate": 7.622725337118524e-05, + "loss": 0.07090861201286316, + "step": 167550 + }, + { + "epoch": 23.784244144783536, + "grad_norm": 0.040020719170570374, + "learning_rate": 7.622583392476934e-05, + "loss": 0.01729402840137482, + "step": 167560 + }, + { + "epoch": 23.785663591199434, + "grad_norm": 0.07612006366252899, + "learning_rate": 7.622441447835345e-05, + "loss": 0.0010789450258016585, + "step": 167570 + }, + { + "epoch": 23.78708303761533, + "grad_norm": 11.557964324951172, + "learning_rate": 7.622299503193754e-05, + "loss": 0.010658993571996688, + "step": 167580 + }, + { + "epoch": 23.788502484031227, + "grad_norm": 3.595459461212158, + "learning_rate": 7.622157558552166e-05, + "loss": 0.010254216194152833, + "step": 167590 + }, + { + "epoch": 23.789921930447125, + "grad_norm": 1.2548161745071411, + "learning_rate": 7.622015613910575e-05, + "loss": 0.024721571803092958, + "step": 167600 + }, + { + "epoch": 23.791341376863024, + "grad_norm": 9.190648078918457, + "learning_rate": 7.621873669268985e-05, + "loss": 0.024191364645957947, + "step": 167610 + }, + { + "epoch": 23.792760823278922, + "grad_norm": 0.5982431769371033, + "learning_rate": 7.621731724627396e-05, + "loss": 0.005518810823559761, + "step": 167620 + }, + { + "epoch": 23.79418026969482, + "grad_norm": 1.0946768522262573, + "learning_rate": 7.621589779985806e-05, + "loss": 0.008632493019104005, + "step": 167630 + }, + { + "epoch": 23.79559971611072, + "grad_norm": 8.32460880279541, + "learning_rate": 7.621447835344217e-05, + "loss": 0.017260505259037016, + "step": 167640 + }, + { + "epoch": 23.797019162526613, + "grad_norm": 0.03491184860467911, + "learning_rate": 7.621305890702625e-05, + "loss": 0.02144605964422226, + "step": 167650 + }, + { + "epoch": 23.79843860894251, + "grad_norm": 3.30985951423645, + "learning_rate": 7.621163946061036e-05, + "loss": 0.023031729459762573, + "step": 167660 + }, + { + "epoch": 
23.79985805535841, + "grad_norm": 0.6586384177207947, + "learning_rate": 7.621022001419446e-05, + "loss": 0.03625448346138001, + "step": 167670 + }, + { + "epoch": 23.801277501774308, + "grad_norm": 0.006601836532354355, + "learning_rate": 7.620880056777857e-05, + "loss": 0.0035860814154148103, + "step": 167680 + }, + { + "epoch": 23.802696948190206, + "grad_norm": 0.019064730033278465, + "learning_rate": 7.620738112136267e-05, + "loss": 0.04919723868370056, + "step": 167690 + }, + { + "epoch": 23.804116394606105, + "grad_norm": 1.0975171327590942, + "learning_rate": 7.620596167494678e-05, + "loss": 0.01045132428407669, + "step": 167700 + }, + { + "epoch": 23.805535841022003, + "grad_norm": 2.954209327697754, + "learning_rate": 7.620454222853088e-05, + "loss": 0.07648754119873047, + "step": 167710 + }, + { + "epoch": 23.806955287437898, + "grad_norm": 0.22009466588497162, + "learning_rate": 7.620312278211498e-05, + "loss": 0.005010556057095527, + "step": 167720 + }, + { + "epoch": 23.808374733853796, + "grad_norm": 0.04210036247968674, + "learning_rate": 7.620170333569909e-05, + "loss": 0.007900955528020859, + "step": 167730 + }, + { + "epoch": 23.809794180269694, + "grad_norm": 4.023663520812988, + "learning_rate": 7.620028388928318e-05, + "loss": 0.04126327335834503, + "step": 167740 + }, + { + "epoch": 23.811213626685593, + "grad_norm": 0.2888350784778595, + "learning_rate": 7.61988644428673e-05, + "loss": 0.004656342044472695, + "step": 167750 + }, + { + "epoch": 23.81263307310149, + "grad_norm": 3.7590737342834473, + "learning_rate": 7.619744499645138e-05, + "loss": 0.01039106547832489, + "step": 167760 + }, + { + "epoch": 23.81405251951739, + "grad_norm": 2.480764627456665, + "learning_rate": 7.619602555003549e-05, + "loss": 0.023802779614925385, + "step": 167770 + }, + { + "epoch": 23.815471965933288, + "grad_norm": 0.03301168978214264, + "learning_rate": 7.619460610361959e-05, + "loss": 0.04971747398376465, + "step": 167780 + }, + { + "epoch": 
23.816891412349182, + "grad_norm": 4.568196773529053, + "learning_rate": 7.61931866572037e-05, + "loss": 0.032951757311820984, + "step": 167790 + }, + { + "epoch": 23.81831085876508, + "grad_norm": 0.7247302532196045, + "learning_rate": 7.61917672107878e-05, + "loss": 0.026875874400138854, + "step": 167800 + }, + { + "epoch": 23.81973030518098, + "grad_norm": 14.190784454345703, + "learning_rate": 7.619034776437189e-05, + "loss": 0.027607321739196777, + "step": 167810 + }, + { + "epoch": 23.821149751596877, + "grad_norm": 0.6558734774589539, + "learning_rate": 7.6188928317956e-05, + "loss": 0.011190880089998245, + "step": 167820 + }, + { + "epoch": 23.822569198012776, + "grad_norm": 0.04329218342900276, + "learning_rate": 7.61875088715401e-05, + "loss": 0.03667688965797424, + "step": 167830 + }, + { + "epoch": 23.823988644428674, + "grad_norm": 0.1633724570274353, + "learning_rate": 7.618608942512421e-05, + "loss": 0.012880519032478333, + "step": 167840 + }, + { + "epoch": 23.825408090844572, + "grad_norm": 0.3695497214794159, + "learning_rate": 7.618466997870831e-05, + "loss": 0.017481671273708345, + "step": 167850 + }, + { + "epoch": 23.826827537260467, + "grad_norm": 2.6536381244659424, + "learning_rate": 7.618325053229241e-05, + "loss": 0.009333087503910065, + "step": 167860 + }, + { + "epoch": 23.828246983676365, + "grad_norm": 0.02337850071489811, + "learning_rate": 7.61818310858765e-05, + "loss": 0.0043557252734899524, + "step": 167870 + }, + { + "epoch": 23.829666430092264, + "grad_norm": 0.048771824687719345, + "learning_rate": 7.618041163946062e-05, + "loss": 0.0036912117153406142, + "step": 167880 + }, + { + "epoch": 23.831085876508162, + "grad_norm": 0.10625825077295303, + "learning_rate": 7.617899219304471e-05, + "loss": 0.025396078824996948, + "step": 167890 + }, + { + "epoch": 23.83250532292406, + "grad_norm": 0.025082001462578773, + "learning_rate": 7.617757274662882e-05, + "loss": 0.005510057136416435, + "step": 167900 + }, + { + "epoch": 
23.83392476933996, + "grad_norm": 0.11727859079837799, + "learning_rate": 7.617615330021292e-05, + "loss": 0.0017296399921178818, + "step": 167910 + }, + { + "epoch": 23.835344215755857, + "grad_norm": 0.2323751151561737, + "learning_rate": 7.617473385379702e-05, + "loss": 0.009094659984111787, + "step": 167920 + }, + { + "epoch": 23.83676366217175, + "grad_norm": 8.61363697052002, + "learning_rate": 7.617331440738113e-05, + "loss": 0.03262055516242981, + "step": 167930 + }, + { + "epoch": 23.83818310858765, + "grad_norm": 10.127595901489258, + "learning_rate": 7.617189496096523e-05, + "loss": 0.02976325750350952, + "step": 167940 + }, + { + "epoch": 23.839602555003548, + "grad_norm": 0.07352416962385178, + "learning_rate": 7.617047551454934e-05, + "loss": 0.01564558148384094, + "step": 167950 + }, + { + "epoch": 23.841022001419446, + "grad_norm": 0.12310074269771576, + "learning_rate": 7.616905606813342e-05, + "loss": 0.03053504228591919, + "step": 167960 + }, + { + "epoch": 23.842441447835345, + "grad_norm": 0.02468065544962883, + "learning_rate": 7.616763662171753e-05, + "loss": 0.002717156335711479, + "step": 167970 + }, + { + "epoch": 23.843860894251243, + "grad_norm": 0.4745878279209137, + "learning_rate": 7.616621717530163e-05, + "loss": 0.028589496016502382, + "step": 167980 + }, + { + "epoch": 23.84528034066714, + "grad_norm": 0.2601860761642456, + "learning_rate": 7.616479772888574e-05, + "loss": 0.004789938777685165, + "step": 167990 + }, + { + "epoch": 23.846699787083036, + "grad_norm": 1.4876856803894043, + "learning_rate": 7.616337828246984e-05, + "loss": 0.024759700894355773, + "step": 168000 + }, + { + "epoch": 23.846699787083036, + "eval_accuracy": 0.9858205633623705, + "eval_loss": 0.054984964430332184, + "eval_runtime": 33.8361, + "eval_samples_per_second": 464.8, + "eval_steps_per_second": 14.541, + "step": 168000 + }, + { + "epoch": 23.848119233498934, + "grad_norm": 0.7367356419563293, + "learning_rate": 7.616195883605394e-05, + "loss": 
0.010694995522499084, + "step": 168010 + }, + { + "epoch": 23.849538679914833, + "grad_norm": 0.5403726100921631, + "learning_rate": 7.616053938963805e-05, + "loss": 0.02502071261405945, + "step": 168020 + }, + { + "epoch": 23.85095812633073, + "grad_norm": 3.5068271160125732, + "learning_rate": 7.615911994322214e-05, + "loss": 0.028222286701202394, + "step": 168030 + }, + { + "epoch": 23.85237757274663, + "grad_norm": 0.30948877334594727, + "learning_rate": 7.615770049680625e-05, + "loss": 0.011601188778877258, + "step": 168040 + }, + { + "epoch": 23.853797019162528, + "grad_norm": 0.8888250589370728, + "learning_rate": 7.615628105039035e-05, + "loss": 0.0016216635704040527, + "step": 168050 + }, + { + "epoch": 23.855216465578426, + "grad_norm": 11.139790534973145, + "learning_rate": 7.615486160397446e-05, + "loss": 0.005026942119002342, + "step": 168060 + }, + { + "epoch": 23.85663591199432, + "grad_norm": 0.3636004626750946, + "learning_rate": 7.615344215755855e-05, + "loss": 0.003969952836632729, + "step": 168070 + }, + { + "epoch": 23.85805535841022, + "grad_norm": 0.1594230830669403, + "learning_rate": 7.615202271114266e-05, + "loss": 0.017302605509757995, + "step": 168080 + }, + { + "epoch": 23.859474804826117, + "grad_norm": 0.07329320162534714, + "learning_rate": 7.615060326472675e-05, + "loss": 0.01387576013803482, + "step": 168090 + }, + { + "epoch": 23.860894251242016, + "grad_norm": 0.07348180562257767, + "learning_rate": 7.614918381831087e-05, + "loss": 0.0068316437304019925, + "step": 168100 + }, + { + "epoch": 23.862313697657914, + "grad_norm": 0.0843789353966713, + "learning_rate": 7.614776437189496e-05, + "loss": 0.0008491825312376023, + "step": 168110 + }, + { + "epoch": 23.863733144073812, + "grad_norm": 9.862788200378418, + "learning_rate": 7.614634492547906e-05, + "loss": 0.03634263575077057, + "step": 168120 + }, + { + "epoch": 23.86515259048971, + "grad_norm": 0.1069280132651329, + "learning_rate": 7.614492547906317e-05, + "loss": 
0.010353577136993409, + "step": 168130 + }, + { + "epoch": 23.866572036905605, + "grad_norm": 0.8557180762290955, + "learning_rate": 7.614350603264727e-05, + "loss": 0.028104868531227113, + "step": 168140 + }, + { + "epoch": 23.867991483321504, + "grad_norm": 0.07745290547609329, + "learning_rate": 7.614208658623138e-05, + "loss": 0.014706882834434509, + "step": 168150 + }, + { + "epoch": 23.869410929737402, + "grad_norm": 0.02865561842918396, + "learning_rate": 7.614066713981548e-05, + "loss": 0.06094932556152344, + "step": 168160 + }, + { + "epoch": 23.8708303761533, + "grad_norm": 0.09374476969242096, + "learning_rate": 7.613924769339957e-05, + "loss": 0.04995681643486023, + "step": 168170 + }, + { + "epoch": 23.8722498225692, + "grad_norm": 8.03510570526123, + "learning_rate": 7.613782824698367e-05, + "loss": 0.019052889943122864, + "step": 168180 + }, + { + "epoch": 23.873669268985097, + "grad_norm": 0.011294238269329071, + "learning_rate": 7.613640880056778e-05, + "loss": 0.01397276222705841, + "step": 168190 + }, + { + "epoch": 23.875088715400995, + "grad_norm": 0.017214147374033928, + "learning_rate": 7.613498935415188e-05, + "loss": 0.0021011587232351303, + "step": 168200 + }, + { + "epoch": 23.87650816181689, + "grad_norm": 11.948472023010254, + "learning_rate": 7.613356990773599e-05, + "loss": 0.015850481390953065, + "step": 168210 + }, + { + "epoch": 23.87792760823279, + "grad_norm": 0.10622105747461319, + "learning_rate": 7.613215046132009e-05, + "loss": 0.004595089703798294, + "step": 168220 + }, + { + "epoch": 23.879347054648687, + "grad_norm": 0.0879296064376831, + "learning_rate": 7.613073101490419e-05, + "loss": 0.026042377948760985, + "step": 168230 + }, + { + "epoch": 23.880766501064585, + "grad_norm": 0.0037178813945502043, + "learning_rate": 7.61293115684883e-05, + "loss": 0.013940519094467163, + "step": 168240 + }, + { + "epoch": 23.882185947480483, + "grad_norm": 0.07132456451654434, + "learning_rate": 7.61278921220724e-05, + "loss": 
0.02108425498008728, + "step": 168250 + }, + { + "epoch": 23.88360539389638, + "grad_norm": 0.8982271552085876, + "learning_rate": 7.61264726756565e-05, + "loss": 0.020002074539661407, + "step": 168260 + }, + { + "epoch": 23.88502484031228, + "grad_norm": 0.03305570408701897, + "learning_rate": 7.612505322924059e-05, + "loss": 0.016467867791652678, + "step": 168270 + }, + { + "epoch": 23.886444286728175, + "grad_norm": 0.5016790628433228, + "learning_rate": 7.61236337828247e-05, + "loss": 0.007112418115139007, + "step": 168280 + }, + { + "epoch": 23.887863733144073, + "grad_norm": 7.937992095947266, + "learning_rate": 7.61222143364088e-05, + "loss": 0.01024741530418396, + "step": 168290 + }, + { + "epoch": 23.88928317955997, + "grad_norm": 0.13209857046604156, + "learning_rate": 7.612079488999291e-05, + "loss": 0.008599552512168884, + "step": 168300 + }, + { + "epoch": 23.89070262597587, + "grad_norm": 3.591090202331543, + "learning_rate": 7.6119375443577e-05, + "loss": 0.019913426041603087, + "step": 168310 + }, + { + "epoch": 23.892122072391768, + "grad_norm": 0.019227489829063416, + "learning_rate": 7.61179559971611e-05, + "loss": 0.006553129851818084, + "step": 168320 + }, + { + "epoch": 23.893541518807666, + "grad_norm": 1.869367003440857, + "learning_rate": 7.611653655074521e-05, + "loss": 0.008379875123500824, + "step": 168330 + }, + { + "epoch": 23.894960965223564, + "grad_norm": 0.0781581699848175, + "learning_rate": 7.611511710432931e-05, + "loss": 0.02674444019794464, + "step": 168340 + }, + { + "epoch": 23.89638041163946, + "grad_norm": 0.837508499622345, + "learning_rate": 7.611369765791342e-05, + "loss": 0.0016662921756505967, + "step": 168350 + }, + { + "epoch": 23.897799858055357, + "grad_norm": 3.4302585124969482, + "learning_rate": 7.611227821149752e-05, + "loss": 0.054758667945861816, + "step": 168360 + }, + { + "epoch": 23.899219304471256, + "grad_norm": 0.06267009675502777, + "learning_rate": 7.611085876508162e-05, + "loss": 
0.008854514360427857, + "step": 168370 + }, + { + "epoch": 23.900638750887154, + "grad_norm": 0.01618783175945282, + "learning_rate": 7.610943931866571e-05, + "loss": 0.024230434000492095, + "step": 168380 + }, + { + "epoch": 23.902058197303052, + "grad_norm": 1.2235485315322876, + "learning_rate": 7.610801987224983e-05, + "loss": 0.016775470972061158, + "step": 168390 + }, + { + "epoch": 23.90347764371895, + "grad_norm": 0.10428119450807571, + "learning_rate": 7.610660042583392e-05, + "loss": 0.005650333315134049, + "step": 168400 + }, + { + "epoch": 23.90489709013485, + "grad_norm": 1.6171902418136597, + "learning_rate": 7.610518097941803e-05, + "loss": 0.003742320090532303, + "step": 168410 + }, + { + "epoch": 23.906316536550744, + "grad_norm": 2.129974126815796, + "learning_rate": 7.610376153300214e-05, + "loss": 0.040164852142333986, + "step": 168420 + }, + { + "epoch": 23.907735982966642, + "grad_norm": 14.222805976867676, + "learning_rate": 7.610234208658623e-05, + "loss": 0.017658919095993042, + "step": 168430 + }, + { + "epoch": 23.90915542938254, + "grad_norm": 5.9143829345703125, + "learning_rate": 7.610092264017034e-05, + "loss": 0.03851213753223419, + "step": 168440 + }, + { + "epoch": 23.91057487579844, + "grad_norm": 0.42284300923347473, + "learning_rate": 7.609950319375444e-05, + "loss": 0.018796858191490174, + "step": 168450 + }, + { + "epoch": 23.911994322214337, + "grad_norm": 0.754317581653595, + "learning_rate": 7.609808374733855e-05, + "loss": 0.02279668301343918, + "step": 168460 + }, + { + "epoch": 23.913413768630235, + "grad_norm": 0.4404948055744171, + "learning_rate": 7.609666430092264e-05, + "loss": 0.01788274347782135, + "step": 168470 + }, + { + "epoch": 23.914833215046134, + "grad_norm": 0.02267448790371418, + "learning_rate": 7.609524485450674e-05, + "loss": 0.029181969165802003, + "step": 168480 + }, + { + "epoch": 23.91625266146203, + "grad_norm": 0.2826504111289978, + "learning_rate": 7.609382540809084e-05, + "loss": 
0.022127823531627656, + "step": 168490 + }, + { + "epoch": 23.917672107877927, + "grad_norm": 6.244472026824951, + "learning_rate": 7.609240596167495e-05, + "loss": 0.00946960598230362, + "step": 168500 + }, + { + "epoch": 23.917672107877927, + "eval_accuracy": 0.9888090544922744, + "eval_loss": 0.052029140293598175, + "eval_runtime": 34.7764, + "eval_samples_per_second": 452.232, + "eval_steps_per_second": 14.148, + "step": 168500 + }, + { + "epoch": 23.919091554293825, + "grad_norm": 0.1415909081697464, + "learning_rate": 7.609098651525906e-05, + "loss": 0.02274523675441742, + "step": 168510 + }, + { + "epoch": 23.920511000709723, + "grad_norm": 4.55612325668335, + "learning_rate": 7.608956706884316e-05, + "loss": 0.012448877841234208, + "step": 168520 + }, + { + "epoch": 23.92193044712562, + "grad_norm": 5.085221290588379, + "learning_rate": 7.608814762242726e-05, + "loss": 0.014644013345241546, + "step": 168530 + }, + { + "epoch": 23.92334989354152, + "grad_norm": 0.590652346611023, + "learning_rate": 7.608672817601135e-05, + "loss": 0.011618024110794068, + "step": 168540 + }, + { + "epoch": 23.924769339957418, + "grad_norm": 0.5412836074829102, + "learning_rate": 7.608530872959546e-05, + "loss": 0.018512165546417235, + "step": 168550 + }, + { + "epoch": 23.926188786373313, + "grad_norm": 0.016068147495388985, + "learning_rate": 7.608388928317956e-05, + "loss": 0.02637784481048584, + "step": 168560 + }, + { + "epoch": 23.92760823278921, + "grad_norm": 0.007463817019015551, + "learning_rate": 7.608246983676367e-05, + "loss": 0.009795069694519043, + "step": 168570 + }, + { + "epoch": 23.92902767920511, + "grad_norm": 0.04805463179945946, + "learning_rate": 7.608105039034776e-05, + "loss": 0.010183900594711304, + "step": 168580 + }, + { + "epoch": 23.930447125621008, + "grad_norm": 21.508934020996094, + "learning_rate": 7.607963094393187e-05, + "loss": 0.07529878616333008, + "step": 168590 + }, + { + "epoch": 23.931866572036906, + "grad_norm": 0.047254778444767, + 
"learning_rate": 7.607821149751598e-05, + "loss": 0.006389336287975311, + "step": 168600 + }, + { + "epoch": 23.933286018452804, + "grad_norm": 0.24035483598709106, + "learning_rate": 7.607679205110008e-05, + "loss": 0.03161117732524872, + "step": 168610 + }, + { + "epoch": 23.934705464868703, + "grad_norm": 13.560129165649414, + "learning_rate": 7.607537260468419e-05, + "loss": 0.005599255487322807, + "step": 168620 + }, + { + "epoch": 23.936124911284598, + "grad_norm": 0.09834058582782745, + "learning_rate": 7.607395315826827e-05, + "loss": 0.02100534290075302, + "step": 168630 + }, + { + "epoch": 23.937544357700496, + "grad_norm": 0.07057110965251923, + "learning_rate": 7.607253371185238e-05, + "loss": 0.06741889715194702, + "step": 168640 + }, + { + "epoch": 23.938963804116394, + "grad_norm": 0.24973490834236145, + "learning_rate": 7.607111426543648e-05, + "loss": 0.005516386404633522, + "step": 168650 + }, + { + "epoch": 23.940383250532292, + "grad_norm": 0.048833783715963364, + "learning_rate": 7.606969481902059e-05, + "loss": 0.0049417465925216675, + "step": 168660 + }, + { + "epoch": 23.94180269694819, + "grad_norm": 0.04425785318017006, + "learning_rate": 7.606827537260469e-05, + "loss": 0.0019655153155326843, + "step": 168670 + }, + { + "epoch": 23.94322214336409, + "grad_norm": 0.01385763194411993, + "learning_rate": 7.606685592618878e-05, + "loss": 0.0023289646953344343, + "step": 168680 + }, + { + "epoch": 23.944641589779987, + "grad_norm": 0.034699827432632446, + "learning_rate": 7.60654364797729e-05, + "loss": 0.004876042902469635, + "step": 168690 + }, + { + "epoch": 23.946061036195882, + "grad_norm": 0.09625930339097977, + "learning_rate": 7.606401703335699e-05, + "loss": 0.017396016418933867, + "step": 168700 + }, + { + "epoch": 23.94748048261178, + "grad_norm": 20.301925659179688, + "learning_rate": 7.60625975869411e-05, + "loss": 0.023265233635902403, + "step": 168710 + }, + { + "epoch": 23.94889992902768, + "grad_norm": 0.44493067264556885, + 
"learning_rate": 7.60611781405252e-05, + "loss": 0.0018508043140172958, + "step": 168720 + }, + { + "epoch": 23.950319375443577, + "grad_norm": 0.05027701333165169, + "learning_rate": 7.605975869410931e-05, + "loss": 0.00262409970164299, + "step": 168730 + }, + { + "epoch": 23.951738821859475, + "grad_norm": 0.2746206820011139, + "learning_rate": 7.60583392476934e-05, + "loss": 0.001998903602361679, + "step": 168740 + }, + { + "epoch": 23.953158268275374, + "grad_norm": 1.0237525701522827, + "learning_rate": 7.605691980127751e-05, + "loss": 0.006001731380820274, + "step": 168750 + }, + { + "epoch": 23.954577714691272, + "grad_norm": 0.0054278383031487465, + "learning_rate": 7.60555003548616e-05, + "loss": 0.022023700177669525, + "step": 168760 + }, + { + "epoch": 23.955997161107167, + "grad_norm": 8.833168029785156, + "learning_rate": 7.605408090844572e-05, + "loss": 0.017826683819293976, + "step": 168770 + }, + { + "epoch": 23.957416607523065, + "grad_norm": 2.877356767654419, + "learning_rate": 7.605266146202981e-05, + "loss": 0.06207270622253418, + "step": 168780 + }, + { + "epoch": 23.958836053938963, + "grad_norm": 0.06914941221475601, + "learning_rate": 7.605124201561391e-05, + "loss": 0.04116440415382385, + "step": 168790 + }, + { + "epoch": 23.96025550035486, + "grad_norm": 0.09866085648536682, + "learning_rate": 7.604982256919802e-05, + "loss": 0.002235472947359085, + "step": 168800 + }, + { + "epoch": 23.96167494677076, + "grad_norm": 0.021806256845593452, + "learning_rate": 7.604840312278212e-05, + "loss": 0.011258108913898468, + "step": 168810 + }, + { + "epoch": 23.96309439318666, + "grad_norm": 0.2655975818634033, + "learning_rate": 7.604698367636623e-05, + "loss": 0.04311327040195465, + "step": 168820 + }, + { + "epoch": 23.964513839602557, + "grad_norm": 0.055406276136636734, + "learning_rate": 7.604556422995033e-05, + "loss": 0.004423852264881134, + "step": 168830 + }, + { + "epoch": 23.96593328601845, + "grad_norm": 7.3143310546875, + 
"learning_rate": 7.604414478353442e-05, + "loss": 0.01725325882434845, + "step": 168840 + }, + { + "epoch": 23.96735273243435, + "grad_norm": 4.9558258056640625, + "learning_rate": 7.604272533711852e-05, + "loss": 0.007642513513565064, + "step": 168850 + }, + { + "epoch": 23.968772178850248, + "grad_norm": 0.25505316257476807, + "learning_rate": 7.604130589070263e-05, + "loss": 0.016956663131713866, + "step": 168860 + }, + { + "epoch": 23.970191625266146, + "grad_norm": 1.0525556802749634, + "learning_rate": 7.603988644428673e-05, + "loss": 0.023545366525650025, + "step": 168870 + }, + { + "epoch": 23.971611071682045, + "grad_norm": 0.0064539615996181965, + "learning_rate": 7.603846699787084e-05, + "loss": 0.005608995631337166, + "step": 168880 + }, + { + "epoch": 23.973030518097943, + "grad_norm": 0.7884691953659058, + "learning_rate": 7.603704755145494e-05, + "loss": 0.025215384364128113, + "step": 168890 + }, + { + "epoch": 23.97444996451384, + "grad_norm": 1.229339361190796, + "learning_rate": 7.603562810503904e-05, + "loss": 0.015713247656822204, + "step": 168900 + }, + { + "epoch": 23.975869410929736, + "grad_norm": 9.928125381469727, + "learning_rate": 7.603420865862315e-05, + "loss": 0.039359521865844724, + "step": 168910 + }, + { + "epoch": 23.977288857345634, + "grad_norm": 1.9655506610870361, + "learning_rate": 7.603278921220724e-05, + "loss": 0.005212109535932541, + "step": 168920 + }, + { + "epoch": 23.978708303761533, + "grad_norm": 8.039067268371582, + "learning_rate": 7.603136976579135e-05, + "loss": 0.0034734532237052917, + "step": 168930 + }, + { + "epoch": 23.98012775017743, + "grad_norm": 0.3236429691314697, + "learning_rate": 7.602995031937544e-05, + "loss": 0.017360779643058776, + "step": 168940 + }, + { + "epoch": 23.98154719659333, + "grad_norm": 0.3917522728443146, + "learning_rate": 7.602853087295955e-05, + "loss": 0.011505788564682007, + "step": 168950 + }, + { + "epoch": 23.982966643009227, + "grad_norm": 0.48919928073883057, + 
"learning_rate": 7.602711142654365e-05, + "loss": 0.008262070268392563, + "step": 168960 + }, + { + "epoch": 23.984386089425126, + "grad_norm": 0.06849221885204315, + "learning_rate": 7.602569198012776e-05, + "loss": 0.008187731355428695, + "step": 168970 + }, + { + "epoch": 23.98580553584102, + "grad_norm": 0.038796473294496536, + "learning_rate": 7.602427253371186e-05, + "loss": 0.04740491211414337, + "step": 168980 + }, + { + "epoch": 23.98722498225692, + "grad_norm": 0.5392748713493347, + "learning_rate": 7.602285308729595e-05, + "loss": 0.015546588599681855, + "step": 168990 + }, + { + "epoch": 23.988644428672817, + "grad_norm": 15.316886901855469, + "learning_rate": 7.602143364088006e-05, + "loss": 0.0369267463684082, + "step": 169000 + }, + { + "epoch": 23.988644428672817, + "eval_accuracy": 0.9794620715966172, + "eval_loss": 0.08323602378368378, + "eval_runtime": 33.8507, + "eval_samples_per_second": 464.6, + "eval_steps_per_second": 14.534, + "step": 169000 + }, + { + "epoch": 23.990063875088715, + "grad_norm": 0.15600624680519104, + "learning_rate": 7.602001419446416e-05, + "loss": 0.022381418943405153, + "step": 169010 + }, + { + "epoch": 23.991483321504614, + "grad_norm": 1.0785387754440308, + "learning_rate": 7.601859474804827e-05, + "loss": 0.06831463575363159, + "step": 169020 + }, + { + "epoch": 23.992902767920512, + "grad_norm": 4.070056915283203, + "learning_rate": 7.601717530163237e-05, + "loss": 0.040357553958892824, + "step": 169030 + }, + { + "epoch": 23.99432221433641, + "grad_norm": 2.2161612510681152, + "learning_rate": 7.601575585521647e-05, + "loss": 0.05488816499710083, + "step": 169040 + }, + { + "epoch": 23.995741660752305, + "grad_norm": 0.048251450061798096, + "learning_rate": 7.601433640880056e-05, + "loss": 0.03376807868480682, + "step": 169050 + }, + { + "epoch": 23.997161107168203, + "grad_norm": 3.9328057765960693, + "learning_rate": 7.601291696238467e-05, + "loss": 0.012197145074605942, + "step": 169060 + }, + { + "epoch": 
23.9985805535841, + "grad_norm": 0.39443573355674744, + "learning_rate": 7.601149751596877e-05, + "loss": 0.009280651807785034, + "step": 169070 + }, + { + "epoch": 24.0, + "grad_norm": 1.5426521301269531, + "learning_rate": 7.601007806955288e-05, + "loss": 0.03151857852935791, + "step": 169080 + }, + { + "epoch": 24.0014194464159, + "grad_norm": 0.05361247807741165, + "learning_rate": 7.600865862313698e-05, + "loss": 0.013806866109371185, + "step": 169090 + }, + { + "epoch": 24.002838892831797, + "grad_norm": 9.00046443939209, + "learning_rate": 7.600723917672108e-05, + "loss": 0.04106190800666809, + "step": 169100 + }, + { + "epoch": 24.004258339247695, + "grad_norm": 10.69758129119873, + "learning_rate": 7.600581973030519e-05, + "loss": 0.010220853239297866, + "step": 169110 + }, + { + "epoch": 24.00567778566359, + "grad_norm": 0.014948396943509579, + "learning_rate": 7.600440028388929e-05, + "loss": 0.013470388948917389, + "step": 169120 + }, + { + "epoch": 24.007097232079488, + "grad_norm": 0.10554257780313492, + "learning_rate": 7.60029808374734e-05, + "loss": 0.005943117663264275, + "step": 169130 + }, + { + "epoch": 24.008516678495386, + "grad_norm": 0.08111888915300369, + "learning_rate": 7.60015613910575e-05, + "loss": 0.011417470872402191, + "step": 169140 + }, + { + "epoch": 24.009936124911285, + "grad_norm": 0.47531262040138245, + "learning_rate": 7.600014194464159e-05, + "loss": 0.025044825673103333, + "step": 169150 + }, + { + "epoch": 24.011355571327183, + "grad_norm": 0.4677565097808838, + "learning_rate": 7.599872249822569e-05, + "loss": 0.019099196791648863, + "step": 169160 + }, + { + "epoch": 24.01277501774308, + "grad_norm": 12.954564094543457, + "learning_rate": 7.59973030518098e-05, + "loss": 0.02503977417945862, + "step": 169170 + }, + { + "epoch": 24.01419446415898, + "grad_norm": 0.0027104404289275408, + "learning_rate": 7.59958836053939e-05, + "loss": 0.017257124185562134, + "step": 169180 + }, + { + "epoch": 24.015613910574874, + 
"grad_norm": 0.11230763792991638, + "learning_rate": 7.599446415897801e-05, + "loss": 0.0179431289434433, + "step": 169190 + }, + { + "epoch": 24.017033356990773, + "grad_norm": 0.030357424169778824, + "learning_rate": 7.59930447125621e-05, + "loss": 0.0021581590175628663, + "step": 169200 + }, + { + "epoch": 24.01845280340667, + "grad_norm": 0.34751302003860474, + "learning_rate": 7.59916252661462e-05, + "loss": 0.004046228528022766, + "step": 169210 + }, + { + "epoch": 24.01987224982257, + "grad_norm": 0.07445652037858963, + "learning_rate": 7.599020581973031e-05, + "loss": 0.005999014526605606, + "step": 169220 + }, + { + "epoch": 24.021291696238467, + "grad_norm": 9.903605461120605, + "learning_rate": 7.598878637331441e-05, + "loss": 0.03353565335273743, + "step": 169230 + }, + { + "epoch": 24.022711142654366, + "grad_norm": 8.163335800170898, + "learning_rate": 7.598736692689852e-05, + "loss": 0.031780070066452025, + "step": 169240 + }, + { + "epoch": 24.024130589070264, + "grad_norm": 0.04752206802368164, + "learning_rate": 7.59859474804826e-05, + "loss": 0.01055913269519806, + "step": 169250 + }, + { + "epoch": 24.02555003548616, + "grad_norm": 6.121513366699219, + "learning_rate": 7.598452803406672e-05, + "loss": 0.023373523354530336, + "step": 169260 + }, + { + "epoch": 24.026969481902057, + "grad_norm": 0.0438343845307827, + "learning_rate": 7.598310858765081e-05, + "loss": 0.005354354158043861, + "step": 169270 + }, + { + "epoch": 24.028388928317955, + "grad_norm": 0.05333344638347626, + "learning_rate": 7.598168914123493e-05, + "loss": 0.003076145052909851, + "step": 169280 + }, + { + "epoch": 24.029808374733854, + "grad_norm": 0.044588785618543625, + "learning_rate": 7.598026969481902e-05, + "loss": 0.014156928658485413, + "step": 169290 + }, + { + "epoch": 24.031227821149752, + "grad_norm": 0.2538122236728668, + "learning_rate": 7.597885024840312e-05, + "loss": 0.00860530138015747, + "step": 169300 + }, + { + "epoch": 24.03264726756565, + "grad_norm": 
1.1952548027038574, + "learning_rate": 7.597743080198723e-05, + "loss": 0.002138742804527283, + "step": 169310 + }, + { + "epoch": 24.03406671398155, + "grad_norm": 0.8243319988250732, + "learning_rate": 7.597601135557133e-05, + "loss": 0.008482812345027924, + "step": 169320 + }, + { + "epoch": 24.035486160397443, + "grad_norm": 0.05707681179046631, + "learning_rate": 7.597459190915544e-05, + "loss": 0.0035252172499895098, + "step": 169330 + }, + { + "epoch": 24.03690560681334, + "grad_norm": 0.12729310989379883, + "learning_rate": 7.597317246273954e-05, + "loss": 0.012369635701179504, + "step": 169340 + }, + { + "epoch": 24.03832505322924, + "grad_norm": 1.023154377937317, + "learning_rate": 7.597175301632363e-05, + "loss": 0.013965144753456116, + "step": 169350 + }, + { + "epoch": 24.03974449964514, + "grad_norm": 3.5640392303466797, + "learning_rate": 7.597033356990773e-05, + "loss": 0.0051962099969387054, + "step": 169360 + }, + { + "epoch": 24.041163946061037, + "grad_norm": 0.1320590227842331, + "learning_rate": 7.596891412349184e-05, + "loss": 0.0074041813611984255, + "step": 169370 + }, + { + "epoch": 24.042583392476935, + "grad_norm": 0.012021848931908607, + "learning_rate": 7.596749467707594e-05, + "loss": 0.07598079442977905, + "step": 169380 + }, + { + "epoch": 24.044002838892833, + "grad_norm": 1.1247034072875977, + "learning_rate": 7.596607523066005e-05, + "loss": 0.0026121459901332854, + "step": 169390 + }, + { + "epoch": 24.045422285308728, + "grad_norm": 7.977129936218262, + "learning_rate": 7.596465578424415e-05, + "loss": 0.007048430293798447, + "step": 169400 + }, + { + "epoch": 24.046841731724626, + "grad_norm": 0.9555954933166504, + "learning_rate": 7.596323633782825e-05, + "loss": 0.02044675052165985, + "step": 169410 + }, + { + "epoch": 24.048261178140525, + "grad_norm": 3.0553064346313477, + "learning_rate": 7.596181689141236e-05, + "loss": 0.01212041974067688, + "step": 169420 + }, + { + "epoch": 24.049680624556423, + "grad_norm": 
0.03543643653392792, + "learning_rate": 7.596039744499645e-05, + "loss": 0.014991587400436402, + "step": 169430 + }, + { + "epoch": 24.05110007097232, + "grad_norm": 0.01885661482810974, + "learning_rate": 7.595897799858056e-05, + "loss": 0.004816603288054467, + "step": 169440 + }, + { + "epoch": 24.05251951738822, + "grad_norm": 0.11294499784708023, + "learning_rate": 7.595755855216466e-05, + "loss": 0.02813560366630554, + "step": 169450 + }, + { + "epoch": 24.053938963804118, + "grad_norm": 0.017571574077010155, + "learning_rate": 7.595613910574876e-05, + "loss": 0.005906275659799576, + "step": 169460 + }, + { + "epoch": 24.055358410220013, + "grad_norm": 0.9754683971405029, + "learning_rate": 7.595471965933286e-05, + "loss": 0.004427610337734223, + "step": 169470 + }, + { + "epoch": 24.05677785663591, + "grad_norm": 0.025497982278466225, + "learning_rate": 7.595330021291697e-05, + "loss": 0.0031700864434242248, + "step": 169480 + }, + { + "epoch": 24.05819730305181, + "grad_norm": 3.293794870376587, + "learning_rate": 7.595188076650107e-05, + "loss": 0.03416322469711304, + "step": 169490 + }, + { + "epoch": 24.059616749467708, + "grad_norm": 0.08569347858428955, + "learning_rate": 7.595046132008518e-05, + "loss": 0.009620600938796997, + "step": 169500 + }, + { + "epoch": 24.059616749467708, + "eval_accuracy": 0.9845488650092198, + "eval_loss": 0.06801944226026535, + "eval_runtime": 33.2337, + "eval_samples_per_second": 473.224, + "eval_steps_per_second": 14.804, + "step": 169500 + }, + { + "epoch": 24.061036195883606, + "grad_norm": 11.956920623779297, + "learning_rate": 7.594904187366927e-05, + "loss": 0.021617290377616883, + "step": 169510 + }, + { + "epoch": 24.062455642299504, + "grad_norm": 0.012519452720880508, + "learning_rate": 7.594762242725337e-05, + "loss": 0.006755061447620392, + "step": 169520 + }, + { + "epoch": 24.063875088715402, + "grad_norm": 0.028763817623257637, + "learning_rate": 7.594620298083748e-05, + "loss": 0.014405745267868041, + 
"step": 169530 + }, + { + "epoch": 24.065294535131297, + "grad_norm": 0.007457975763827562, + "learning_rate": 7.594478353442158e-05, + "loss": 0.03239177465438843, + "step": 169540 + }, + { + "epoch": 24.066713981547196, + "grad_norm": 2.0120654106140137, + "learning_rate": 7.594336408800569e-05, + "loss": 0.012471822649240493, + "step": 169550 + }, + { + "epoch": 24.068133427963094, + "grad_norm": 0.11050406098365784, + "learning_rate": 7.594194464158977e-05, + "loss": 0.03097372651100159, + "step": 169560 + }, + { + "epoch": 24.069552874378992, + "grad_norm": 0.768715500831604, + "learning_rate": 7.594052519517388e-05, + "loss": 0.04848639965057373, + "step": 169570 + }, + { + "epoch": 24.07097232079489, + "grad_norm": 0.1261855512857437, + "learning_rate": 7.593910574875798e-05, + "loss": 0.037220558524131774, + "step": 169580 + }, + { + "epoch": 24.07239176721079, + "grad_norm": 0.17539672553539276, + "learning_rate": 7.593768630234209e-05, + "loss": 0.004328291863203049, + "step": 169590 + }, + { + "epoch": 24.073811213626687, + "grad_norm": 0.49602094292640686, + "learning_rate": 7.593626685592619e-05, + "loss": 0.04311779737472534, + "step": 169600 + }, + { + "epoch": 24.075230660042582, + "grad_norm": 0.0037629303988069296, + "learning_rate": 7.593484740951029e-05, + "loss": 0.029575619101524352, + "step": 169610 + }, + { + "epoch": 24.07665010645848, + "grad_norm": 8.118293762207031, + "learning_rate": 7.59334279630944e-05, + "loss": 0.024983832240104677, + "step": 169620 + }, + { + "epoch": 24.07806955287438, + "grad_norm": 0.1724485456943512, + "learning_rate": 7.59320085166785e-05, + "loss": 0.00881030336022377, + "step": 169630 + }, + { + "epoch": 24.079488999290277, + "grad_norm": 0.9402016401290894, + "learning_rate": 7.593058907026261e-05, + "loss": 0.03307490050792694, + "step": 169640 + }, + { + "epoch": 24.080908445706175, + "grad_norm": 19.973377227783203, + "learning_rate": 7.59291696238467e-05, + "loss": 0.06771699190139771, + "step": 169650 
+ }, + { + "epoch": 24.082327892122073, + "grad_norm": 0.05742507800459862, + "learning_rate": 7.59277501774308e-05, + "loss": 0.018230313062667848, + "step": 169660 + }, + { + "epoch": 24.08374733853797, + "grad_norm": 0.02922886610031128, + "learning_rate": 7.59263307310149e-05, + "loss": 0.02530248463153839, + "step": 169670 + }, + { + "epoch": 24.085166784953866, + "grad_norm": 4.755521297454834, + "learning_rate": 7.592491128459901e-05, + "loss": 0.02809591293334961, + "step": 169680 + }, + { + "epoch": 24.086586231369765, + "grad_norm": 1.0391374826431274, + "learning_rate": 7.592349183818311e-05, + "loss": 0.07206405997276306, + "step": 169690 + }, + { + "epoch": 24.088005677785663, + "grad_norm": 7.303318500518799, + "learning_rate": 7.59222143364088e-05, + "loss": 0.01425800919532776, + "step": 169700 + }, + { + "epoch": 24.08942512420156, + "grad_norm": 8.41904067993164, + "learning_rate": 7.59207948899929e-05, + "loss": 0.05929157733917236, + "step": 169710 + }, + { + "epoch": 24.09084457061746, + "grad_norm": 0.0235306229442358, + "learning_rate": 7.591937544357701e-05, + "loss": 0.0033751770853996275, + "step": 169720 + }, + { + "epoch": 24.092264017033358, + "grad_norm": 0.17745310068130493, + "learning_rate": 7.591795599716111e-05, + "loss": 0.015691483020782472, + "step": 169730 + }, + { + "epoch": 24.093683463449256, + "grad_norm": 0.23903630673885345, + "learning_rate": 7.591653655074521e-05, + "loss": 0.009946343302726746, + "step": 169740 + }, + { + "epoch": 24.09510290986515, + "grad_norm": 0.050239283591508865, + "learning_rate": 7.591511710432932e-05, + "loss": 0.0019076723605394364, + "step": 169750 + }, + { + "epoch": 24.09652235628105, + "grad_norm": 0.3548555076122284, + "learning_rate": 7.591369765791342e-05, + "loss": 0.022541196644306184, + "step": 169760 + }, + { + "epoch": 24.097941802696948, + "grad_norm": 1.339597225189209, + "learning_rate": 7.591227821149753e-05, + "loss": 0.007295440137386322, + "step": 169770 + }, + { + 
"epoch": 24.099361249112846, + "grad_norm": 0.7307567000389099, + "learning_rate": 7.591085876508163e-05, + "loss": 0.017021115124225616, + "step": 169780 + }, + { + "epoch": 24.100780695528744, + "grad_norm": 10.33015251159668, + "learning_rate": 7.590943931866572e-05, + "loss": 0.03950236141681671, + "step": 169790 + }, + { + "epoch": 24.102200141944643, + "grad_norm": 10.832118034362793, + "learning_rate": 7.590801987224982e-05, + "loss": 0.016736145317554473, + "step": 169800 + }, + { + "epoch": 24.10361958836054, + "grad_norm": 0.021747421473264694, + "learning_rate": 7.590660042583393e-05, + "loss": 0.01111241728067398, + "step": 169810 + }, + { + "epoch": 24.105039034776436, + "grad_norm": 0.04057983681559563, + "learning_rate": 7.590518097941803e-05, + "loss": 0.02612481713294983, + "step": 169820 + }, + { + "epoch": 24.106458481192334, + "grad_norm": 0.5650833249092102, + "learning_rate": 7.590376153300214e-05, + "loss": 0.013202930986881255, + "step": 169830 + }, + { + "epoch": 24.107877927608232, + "grad_norm": 0.017477121204137802, + "learning_rate": 7.590234208658624e-05, + "loss": 0.013005024194717408, + "step": 169840 + }, + { + "epoch": 24.10929737402413, + "grad_norm": 7.640966415405273, + "learning_rate": 7.590092264017033e-05, + "loss": 0.03998966813087464, + "step": 169850 + }, + { + "epoch": 24.11071682044003, + "grad_norm": 0.021480172872543335, + "learning_rate": 7.589950319375445e-05, + "loss": 0.009985390305519103, + "step": 169860 + }, + { + "epoch": 24.112136266855927, + "grad_norm": 3.858417272567749, + "learning_rate": 7.589808374733854e-05, + "loss": 0.00977463200688362, + "step": 169870 + }, + { + "epoch": 24.113555713271825, + "grad_norm": 2.297473907470703, + "learning_rate": 7.589666430092265e-05, + "loss": 0.009137506783008575, + "step": 169880 + }, + { + "epoch": 24.11497515968772, + "grad_norm": 2.806868076324463, + "learning_rate": 7.589524485450674e-05, + "loss": 0.04026742577552796, + "step": 169890 + }, + { + "epoch": 
24.11639460610362, + "grad_norm": 8.567550659179688, + "learning_rate": 7.589382540809085e-05, + "loss": 0.013880674540996552, + "step": 169900 + }, + { + "epoch": 24.117814052519517, + "grad_norm": 0.10329990833997726, + "learning_rate": 7.589240596167495e-05, + "loss": 0.019956156611442566, + "step": 169910 + }, + { + "epoch": 24.119233498935415, + "grad_norm": 8.854355812072754, + "learning_rate": 7.589098651525906e-05, + "loss": 0.019835728406906127, + "step": 169920 + }, + { + "epoch": 24.120652945351313, + "grad_norm": 0.11139008402824402, + "learning_rate": 7.588956706884315e-05, + "loss": 0.0018617279827594757, + "step": 169930 + }, + { + "epoch": 24.12207239176721, + "grad_norm": 0.015306922607123852, + "learning_rate": 7.588814762242725e-05, + "loss": 0.00806845873594284, + "step": 169940 + }, + { + "epoch": 24.12349183818311, + "grad_norm": 3.183070659637451, + "learning_rate": 7.588672817601136e-05, + "loss": 0.010787233710289001, + "step": 169950 + }, + { + "epoch": 24.124911284599005, + "grad_norm": 0.5154094099998474, + "learning_rate": 7.588530872959546e-05, + "loss": 0.01863774061203003, + "step": 169960 + }, + { + "epoch": 24.126330731014903, + "grad_norm": 0.13330891728401184, + "learning_rate": 7.588388928317957e-05, + "loss": 0.016619732975959776, + "step": 169970 + }, + { + "epoch": 24.1277501774308, + "grad_norm": 0.5580123662948608, + "learning_rate": 7.588246983676367e-05, + "loss": 0.006367402523756028, + "step": 169980 + }, + { + "epoch": 24.1291696238467, + "grad_norm": 1.7294703722000122, + "learning_rate": 7.588105039034777e-05, + "loss": 0.007877826690673828, + "step": 169990 + }, + { + "epoch": 24.130589070262598, + "grad_norm": 0.02211158350110054, + "learning_rate": 7.587963094393186e-05, + "loss": 0.026298925280570984, + "step": 170000 + }, + { + "epoch": 24.130589070262598, + "eval_accuracy": 0.9846124499268774, + "eval_loss": 0.06488175690174103, + "eval_runtime": 32.8696, + "eval_samples_per_second": 478.467, + 
"eval_steps_per_second": 14.968, + "step": 170000 + }, + { + "epoch": 24.132008516678496, + "grad_norm": 0.005628833547234535, + "learning_rate": 7.587821149751597e-05, + "loss": 0.009133793413639069, + "step": 170010 + }, + { + "epoch": 24.133427963094395, + "grad_norm": 0.03364453464746475, + "learning_rate": 7.587679205110007e-05, + "loss": 0.0008713401854038239, + "step": 170020 + }, + { + "epoch": 24.13484740951029, + "grad_norm": 0.05407797545194626, + "learning_rate": 7.587537260468418e-05, + "loss": 0.030221804976463318, + "step": 170030 + }, + { + "epoch": 24.136266855926188, + "grad_norm": 0.04706760495901108, + "learning_rate": 7.587395315826828e-05, + "loss": 0.0024727236479520796, + "step": 170040 + }, + { + "epoch": 24.137686302342086, + "grad_norm": 8.674266815185547, + "learning_rate": 7.587253371185238e-05, + "loss": 0.027379289269447327, + "step": 170050 + }, + { + "epoch": 24.139105748757984, + "grad_norm": 0.08296661078929901, + "learning_rate": 7.587111426543649e-05, + "loss": 0.0157430961728096, + "step": 170060 + }, + { + "epoch": 24.140525195173883, + "grad_norm": 0.10498236864805222, + "learning_rate": 7.586969481902059e-05, + "loss": 0.03745521605014801, + "step": 170070 + }, + { + "epoch": 24.14194464158978, + "grad_norm": 0.04347184672951698, + "learning_rate": 7.58682753726047e-05, + "loss": 0.0026082448661327363, + "step": 170080 + }, + { + "epoch": 24.14336408800568, + "grad_norm": 3.1085121631622314, + "learning_rate": 7.586685592618878e-05, + "loss": 0.005405990779399872, + "step": 170090 + }, + { + "epoch": 24.144783534421574, + "grad_norm": 5.15566873550415, + "learning_rate": 7.586543647977289e-05, + "loss": 0.003469008207321167, + "step": 170100 + }, + { + "epoch": 24.146202980837472, + "grad_norm": 0.43907177448272705, + "learning_rate": 7.586401703335699e-05, + "loss": 0.021270370483398436, + "step": 170110 + }, + { + "epoch": 24.14762242725337, + "grad_norm": 0.0023006913252174854, + "learning_rate": 7.58625975869411e-05, + 
"loss": 0.021012817323207856, + "step": 170120 + }, + { + "epoch": 24.14904187366927, + "grad_norm": 0.33030903339385986, + "learning_rate": 7.58611781405252e-05, + "loss": 0.022517643868923187, + "step": 170130 + }, + { + "epoch": 24.150461320085167, + "grad_norm": 0.4161616563796997, + "learning_rate": 7.585975869410931e-05, + "loss": 0.024981804192066193, + "step": 170140 + }, + { + "epoch": 24.151880766501066, + "grad_norm": 11.65206527709961, + "learning_rate": 7.58583392476934e-05, + "loss": 0.05744643807411194, + "step": 170150 + }, + { + "epoch": 24.153300212916964, + "grad_norm": 0.010300575755536556, + "learning_rate": 7.58569198012775e-05, + "loss": 0.004296479746699333, + "step": 170160 + }, + { + "epoch": 24.15471965933286, + "grad_norm": 0.17267712950706482, + "learning_rate": 7.585550035486161e-05, + "loss": 0.01614364981651306, + "step": 170170 + }, + { + "epoch": 24.156139105748757, + "grad_norm": 0.020199207589030266, + "learning_rate": 7.585408090844571e-05, + "loss": 0.003402118384838104, + "step": 170180 + }, + { + "epoch": 24.157558552164655, + "grad_norm": 4.328697681427002, + "learning_rate": 7.585266146202982e-05, + "loss": 0.008073128759860992, + "step": 170190 + }, + { + "epoch": 24.158977998580554, + "grad_norm": 2.1502907276153564, + "learning_rate": 7.58512420156139e-05, + "loss": 0.017155613005161285, + "step": 170200 + }, + { + "epoch": 24.160397444996452, + "grad_norm": 0.012652352452278137, + "learning_rate": 7.584982256919802e-05, + "loss": 0.004721416160464287, + "step": 170210 + }, + { + "epoch": 24.16181689141235, + "grad_norm": 1.917319893836975, + "learning_rate": 7.584840312278211e-05, + "loss": 0.004468687996268273, + "step": 170220 + }, + { + "epoch": 24.16323633782825, + "grad_norm": 7.574029922485352, + "learning_rate": 7.584698367636622e-05, + "loss": 0.02497868090867996, + "step": 170230 + }, + { + "epoch": 24.164655784244143, + "grad_norm": 0.37815436720848083, + "learning_rate": 7.584556422995032e-05, + "loss": 
0.04553220272064209, + "step": 170240 + }, + { + "epoch": 24.16607523066004, + "grad_norm": 1.5925405025482178, + "learning_rate": 7.584414478353442e-05, + "loss": 0.028094983100891112, + "step": 170250 + }, + { + "epoch": 24.16749467707594, + "grad_norm": 8.128960609436035, + "learning_rate": 7.584272533711853e-05, + "loss": 0.040837767720222476, + "step": 170260 + }, + { + "epoch": 24.168914123491838, + "grad_norm": 0.10213065892457962, + "learning_rate": 7.584130589070263e-05, + "loss": 0.0661386489868164, + "step": 170270 + }, + { + "epoch": 24.170333569907736, + "grad_norm": 7.448816299438477, + "learning_rate": 7.583988644428674e-05, + "loss": 0.015805299580097198, + "step": 170280 + }, + { + "epoch": 24.171753016323635, + "grad_norm": 0.5614497065544128, + "learning_rate": 7.583846699787084e-05, + "loss": 0.007815079391002655, + "step": 170290 + }, + { + "epoch": 24.173172462739533, + "grad_norm": 0.796163022518158, + "learning_rate": 7.583704755145493e-05, + "loss": 0.010730551183223724, + "step": 170300 + }, + { + "epoch": 24.174591909155428, + "grad_norm": 0.45461323857307434, + "learning_rate": 7.583562810503903e-05, + "loss": 0.009848369657993317, + "step": 170310 + }, + { + "epoch": 24.176011355571326, + "grad_norm": 0.35377371311187744, + "learning_rate": 7.583420865862314e-05, + "loss": 0.02429445683956146, + "step": 170320 + }, + { + "epoch": 24.177430801987224, + "grad_norm": 0.45060235261917114, + "learning_rate": 7.583278921220724e-05, + "loss": 0.017373840510845184, + "step": 170330 + }, + { + "epoch": 24.178850248403123, + "grad_norm": 0.41975292563438416, + "learning_rate": 7.583136976579135e-05, + "loss": 0.046639513969421384, + "step": 170340 + }, + { + "epoch": 24.18026969481902, + "grad_norm": 1.646340250968933, + "learning_rate": 7.582995031937545e-05, + "loss": 0.014303261041641235, + "step": 170350 + }, + { + "epoch": 24.18168914123492, + "grad_norm": 0.3326791524887085, + "learning_rate": 7.582853087295954e-05, + "loss": 
0.03869196176528931, + "step": 170360 + }, + { + "epoch": 24.183108587650818, + "grad_norm": 0.6868438124656677, + "learning_rate": 7.582711142654366e-05, + "loss": 0.027368614077568056, + "step": 170370 + }, + { + "epoch": 24.184528034066712, + "grad_norm": 13.971768379211426, + "learning_rate": 7.582569198012775e-05, + "loss": 0.051496374607086184, + "step": 170380 + }, + { + "epoch": 24.18594748048261, + "grad_norm": 0.35145944356918335, + "learning_rate": 7.582427253371186e-05, + "loss": 0.011154238879680634, + "step": 170390 + }, + { + "epoch": 24.18736692689851, + "grad_norm": 0.6389665603637695, + "learning_rate": 7.582285308729595e-05, + "loss": 0.012382078170776366, + "step": 170400 + }, + { + "epoch": 24.188786373314407, + "grad_norm": 4.356932163238525, + "learning_rate": 7.582143364088006e-05, + "loss": 0.01011217087507248, + "step": 170410 + }, + { + "epoch": 24.190205819730306, + "grad_norm": 0.4150574207305908, + "learning_rate": 7.582001419446416e-05, + "loss": 0.003952211886644364, + "step": 170420 + }, + { + "epoch": 24.191625266146204, + "grad_norm": 0.04953921586275101, + "learning_rate": 7.581859474804827e-05, + "loss": 0.024195228517055512, + "step": 170430 + }, + { + "epoch": 24.193044712562102, + "grad_norm": 1.3297760486602783, + "learning_rate": 7.581717530163236e-05, + "loss": 0.01958087682723999, + "step": 170440 + }, + { + "epoch": 24.194464158977997, + "grad_norm": 0.11100302636623383, + "learning_rate": 7.581575585521646e-05, + "loss": 0.004563049226999283, + "step": 170450 + }, + { + "epoch": 24.195883605393895, + "grad_norm": 0.8193221092224121, + "learning_rate": 7.581433640880057e-05, + "loss": 0.011020318418741227, + "step": 170460 + }, + { + "epoch": 24.197303051809794, + "grad_norm": 16.3663387298584, + "learning_rate": 7.581291696238467e-05, + "loss": 0.025051560997962952, + "step": 170470 + }, + { + "epoch": 24.198722498225692, + "grad_norm": 0.1069154292345047, + "learning_rate": 7.581149751596878e-05, + "loss": 
0.004601104184985161, + "step": 170480 + }, + { + "epoch": 24.20014194464159, + "grad_norm": 0.6329733729362488, + "learning_rate": 7.581007806955288e-05, + "loss": 0.02126217484474182, + "step": 170490 + }, + { + "epoch": 24.20156139105749, + "grad_norm": 8.838003158569336, + "learning_rate": 7.580865862313699e-05, + "loss": 0.06399988532066345, + "step": 170500 + }, + { + "epoch": 24.20156139105749, + "eval_accuracy": 0.9799707509378776, + "eval_loss": 0.07987058162689209, + "eval_runtime": 32.9678, + "eval_samples_per_second": 477.042, + "eval_steps_per_second": 14.924, + "step": 170500 + }, + { + "epoch": 24.202980837473387, + "grad_norm": 0.006893422454595566, + "learning_rate": 7.580723917672107e-05, + "loss": 0.011419574916362762, + "step": 170510 + }, + { + "epoch": 24.20440028388928, + "grad_norm": 0.3292061388492584, + "learning_rate": 7.580581973030518e-05, + "loss": 0.002167768031358719, + "step": 170520 + }, + { + "epoch": 24.20581973030518, + "grad_norm": 0.01943243108689785, + "learning_rate": 7.580440028388928e-05, + "loss": 0.039792847633361814, + "step": 170530 + }, + { + "epoch": 24.207239176721078, + "grad_norm": 0.3743377923965454, + "learning_rate": 7.580298083747339e-05, + "loss": 0.005728993192315101, + "step": 170540 + }, + { + "epoch": 24.208658623136976, + "grad_norm": 1.0033869743347168, + "learning_rate": 7.580156139105749e-05, + "loss": 0.006830430030822754, + "step": 170550 + }, + { + "epoch": 24.210078069552875, + "grad_norm": 1.0010801553726196, + "learning_rate": 7.580014194464159e-05, + "loss": 0.005035979300737381, + "step": 170560 + }, + { + "epoch": 24.211497515968773, + "grad_norm": 0.016392450779676437, + "learning_rate": 7.57987224982257e-05, + "loss": 0.004353587701916695, + "step": 170570 + }, + { + "epoch": 24.21291696238467, + "grad_norm": 0.006741642020642757, + "learning_rate": 7.57973030518098e-05, + "loss": 0.006458216160535812, + "step": 170580 + }, + { + "epoch": 24.214336408800566, + "grad_norm": 
2.642361879348755, + "learning_rate": 7.57958836053939e-05, + "loss": 0.037814149260520936, + "step": 170590 + }, + { + "epoch": 24.215755855216464, + "grad_norm": 0.013270823284983635, + "learning_rate": 7.5794464158978e-05, + "loss": 0.013965161144733429, + "step": 170600 + }, + { + "epoch": 24.217175301632363, + "grad_norm": 0.0040735648944973946, + "learning_rate": 7.57930447125621e-05, + "loss": 0.007597902417182922, + "step": 170610 + }, + { + "epoch": 24.21859474804826, + "grad_norm": 5.0308966636657715, + "learning_rate": 7.57916252661462e-05, + "loss": 0.027482470870018004, + "step": 170620 + }, + { + "epoch": 24.22001419446416, + "grad_norm": 0.1058560386300087, + "learning_rate": 7.579020581973031e-05, + "loss": 0.0028643358498811724, + "step": 170630 + }, + { + "epoch": 24.221433640880058, + "grad_norm": 0.3358331322669983, + "learning_rate": 7.57887863733144e-05, + "loss": 0.015095248818397522, + "step": 170640 + }, + { + "epoch": 24.222853087295956, + "grad_norm": 0.1466071605682373, + "learning_rate": 7.578736692689852e-05, + "loss": 0.017482084035873414, + "step": 170650 + }, + { + "epoch": 24.22427253371185, + "grad_norm": 0.04823797568678856, + "learning_rate": 7.578594748048261e-05, + "loss": 0.05199518203735352, + "step": 170660 + }, + { + "epoch": 24.22569198012775, + "grad_norm": 1.2597646713256836, + "learning_rate": 7.578452803406671e-05, + "loss": 0.020428699254989625, + "step": 170670 + }, + { + "epoch": 24.227111426543647, + "grad_norm": 5.110867977142334, + "learning_rate": 7.578310858765082e-05, + "loss": 0.004891027137637139, + "step": 170680 + }, + { + "epoch": 24.228530872959546, + "grad_norm": 1.8324166536331177, + "learning_rate": 7.578168914123492e-05, + "loss": 0.011243074387311935, + "step": 170690 + }, + { + "epoch": 24.229950319375444, + "grad_norm": 7.369711399078369, + "learning_rate": 7.578026969481903e-05, + "loss": 0.024842533469200134, + "step": 170700 + }, + { + "epoch": 24.231369765791342, + "grad_norm": 
0.022505031898617744, + "learning_rate": 7.577885024840312e-05, + "loss": 0.03136737942695618, + "step": 170710 + }, + { + "epoch": 24.23278921220724, + "grad_norm": 0.980998694896698, + "learning_rate": 7.577743080198723e-05, + "loss": 0.006584975123405457, + "step": 170720 + }, + { + "epoch": 24.234208658623135, + "grad_norm": 2.044980049133301, + "learning_rate": 7.577601135557132e-05, + "loss": 0.13177597522735596, + "step": 170730 + }, + { + "epoch": 24.235628105039034, + "grad_norm": 0.07145638018846512, + "learning_rate": 7.577459190915543e-05, + "loss": 0.023698781430721284, + "step": 170740 + }, + { + "epoch": 24.237047551454932, + "grad_norm": 0.02371988631784916, + "learning_rate": 7.577317246273955e-05, + "loss": 0.0026623275130987166, + "step": 170750 + }, + { + "epoch": 24.23846699787083, + "grad_norm": 0.32833969593048096, + "learning_rate": 7.577175301632363e-05, + "loss": 0.010100835561752319, + "step": 170760 + }, + { + "epoch": 24.23988644428673, + "grad_norm": 16.563562393188477, + "learning_rate": 7.577033356990774e-05, + "loss": 0.034396234154701236, + "step": 170770 + }, + { + "epoch": 24.241305890702627, + "grad_norm": 3.395127534866333, + "learning_rate": 7.576891412349184e-05, + "loss": 0.00807100534439087, + "step": 170780 + }, + { + "epoch": 24.242725337118525, + "grad_norm": 0.003005518112331629, + "learning_rate": 7.576749467707595e-05, + "loss": 0.007678534090518952, + "step": 170790 + }, + { + "epoch": 24.24414478353442, + "grad_norm": 0.4488643407821655, + "learning_rate": 7.576607523066005e-05, + "loss": 0.007151262462139129, + "step": 170800 + }, + { + "epoch": 24.24556422995032, + "grad_norm": 5.994481563568115, + "learning_rate": 7.576465578424414e-05, + "loss": 0.005180511996150017, + "step": 170810 + }, + { + "epoch": 24.246983676366217, + "grad_norm": 0.04760206118226051, + "learning_rate": 7.576323633782824e-05, + "loss": 0.007648360729217529, + "step": 170820 + }, + { + "epoch": 24.248403122782115, + "grad_norm": 
0.038604285567998886, + "learning_rate": 7.576181689141235e-05, + "loss": 0.004905746132135391, + "step": 170830 + }, + { + "epoch": 24.249822569198013, + "grad_norm": 2.2688546180725098, + "learning_rate": 7.576039744499646e-05, + "loss": 0.006448967754840851, + "step": 170840 + }, + { + "epoch": 24.25124201561391, + "grad_norm": 0.5143846273422241, + "learning_rate": 7.575897799858056e-05, + "loss": 0.004924257844686508, + "step": 170850 + }, + { + "epoch": 24.25266146202981, + "grad_norm": 0.01915423572063446, + "learning_rate": 7.575755855216467e-05, + "loss": 0.0024891678243875504, + "step": 170860 + }, + { + "epoch": 24.254080908445705, + "grad_norm": 0.88966304063797, + "learning_rate": 7.575613910574875e-05, + "loss": 0.007110300660133362, + "step": 170870 + }, + { + "epoch": 24.255500354861603, + "grad_norm": 15.078179359436035, + "learning_rate": 7.575471965933287e-05, + "loss": 0.036874312162399295, + "step": 170880 + }, + { + "epoch": 24.2569198012775, + "grad_norm": 0.8519644737243652, + "learning_rate": 7.575330021291696e-05, + "loss": 0.013236203789710998, + "step": 170890 + }, + { + "epoch": 24.2583392476934, + "grad_norm": 0.0043980940245091915, + "learning_rate": 7.575188076650107e-05, + "loss": 0.03487876057624817, + "step": 170900 + }, + { + "epoch": 24.259758694109298, + "grad_norm": 0.631964385509491, + "learning_rate": 7.575046132008517e-05, + "loss": 0.01473209261894226, + "step": 170910 + }, + { + "epoch": 24.261178140525196, + "grad_norm": 0.7692946195602417, + "learning_rate": 7.574904187366927e-05, + "loss": 0.033982694149017334, + "step": 170920 + }, + { + "epoch": 24.262597586941094, + "grad_norm": 0.3826451897621155, + "learning_rate": 7.574762242725338e-05, + "loss": 0.006551919877529145, + "step": 170930 + }, + { + "epoch": 24.26401703335699, + "grad_norm": 0.036588337272405624, + "learning_rate": 7.574620298083748e-05, + "loss": 0.027697181701660155, + "step": 170940 + }, + { + "epoch": 24.265436479772887, + "grad_norm": 
0.46344104409217834, + "learning_rate": 7.574478353442159e-05, + "loss": 0.0070559106767177585, + "step": 170950 + }, + { + "epoch": 24.266855926188786, + "grad_norm": 0.010940377600491047, + "learning_rate": 7.574336408800569e-05, + "loss": 0.027180743217468262, + "step": 170960 + }, + { + "epoch": 24.268275372604684, + "grad_norm": 0.08343552052974701, + "learning_rate": 7.574194464158978e-05, + "loss": 0.005247928574681282, + "step": 170970 + }, + { + "epoch": 24.269694819020582, + "grad_norm": 0.07037290185689926, + "learning_rate": 7.574052519517388e-05, + "loss": 0.007797211408615112, + "step": 170980 + }, + { + "epoch": 24.27111426543648, + "grad_norm": 5.2546772956848145, + "learning_rate": 7.573910574875799e-05, + "loss": 0.02071610540151596, + "step": 170990 + }, + { + "epoch": 24.27253371185238, + "grad_norm": 0.015103437006473541, + "learning_rate": 7.573768630234209e-05, + "loss": 0.007820375263690948, + "step": 171000 + }, + { + "epoch": 24.27253371185238, + "eval_accuracy": 0.9862656577859732, + "eval_loss": 0.049098722636699677, + "eval_runtime": 32.8509, + "eval_samples_per_second": 478.739, + "eval_steps_per_second": 14.977, + "step": 171000 + }, + { + "epoch": 24.273953158268274, + "grad_norm": 0.050331905484199524, + "learning_rate": 7.57362668559262e-05, + "loss": 0.04263402819633484, + "step": 171010 + }, + { + "epoch": 24.275372604684172, + "grad_norm": 7.950806617736816, + "learning_rate": 7.57348474095103e-05, + "loss": 0.014951135218143462, + "step": 171020 + }, + { + "epoch": 24.27679205110007, + "grad_norm": 0.1425125151872635, + "learning_rate": 7.57334279630944e-05, + "loss": 0.026498037576675414, + "step": 171030 + }, + { + "epoch": 24.27821149751597, + "grad_norm": 11.705103874206543, + "learning_rate": 7.57320085166785e-05, + "loss": 0.02368048280477524, + "step": 171040 + }, + { + "epoch": 24.279630943931867, + "grad_norm": 1.0479013919830322, + "learning_rate": 7.57305890702626e-05, + "loss": 0.008747637271881104, + "step": 171050 
+ }, + { + "epoch": 24.281050390347765, + "grad_norm": 0.16132241487503052, + "learning_rate": 7.572916962384671e-05, + "loss": 0.03507925868034363, + "step": 171060 + }, + { + "epoch": 24.282469836763664, + "grad_norm": 0.23041999340057373, + "learning_rate": 7.57277501774308e-05, + "loss": 0.007162519544363022, + "step": 171070 + }, + { + "epoch": 24.28388928317956, + "grad_norm": 0.026853308081626892, + "learning_rate": 7.572633073101491e-05, + "loss": 0.005885357037186623, + "step": 171080 + }, + { + "epoch": 24.285308729595457, + "grad_norm": 0.2846274971961975, + "learning_rate": 7.5724911284599e-05, + "loss": 0.03424282371997833, + "step": 171090 + }, + { + "epoch": 24.286728176011355, + "grad_norm": 0.05527833104133606, + "learning_rate": 7.572349183818312e-05, + "loss": 0.0028684016317129133, + "step": 171100 + }, + { + "epoch": 24.288147622427253, + "grad_norm": 0.03230841085314751, + "learning_rate": 7.572207239176721e-05, + "loss": 0.011814162135124207, + "step": 171110 + }, + { + "epoch": 24.28956706884315, + "grad_norm": 16.41860580444336, + "learning_rate": 7.572065294535131e-05, + "loss": 0.057567369937896726, + "step": 171120 + }, + { + "epoch": 24.29098651525905, + "grad_norm": 0.06742767989635468, + "learning_rate": 7.571923349893542e-05, + "loss": 0.0333555668592453, + "step": 171130 + }, + { + "epoch": 24.292405961674948, + "grad_norm": 0.017710834741592407, + "learning_rate": 7.571781405251952e-05, + "loss": 0.005928101390600205, + "step": 171140 + }, + { + "epoch": 24.293825408090843, + "grad_norm": 0.1413985937833786, + "learning_rate": 7.571639460610363e-05, + "loss": 0.05438186526298523, + "step": 171150 + }, + { + "epoch": 24.29524485450674, + "grad_norm": 0.025823326781392097, + "learning_rate": 7.571497515968773e-05, + "loss": 0.006939646601676941, + "step": 171160 + }, + { + "epoch": 24.29666430092264, + "grad_norm": 0.04532528296113014, + "learning_rate": 7.571355571327184e-05, + "loss": 0.017600148916244507, + "step": 171170 + }, + { 
+ "epoch": 24.298083747338538, + "grad_norm": 17.45870018005371, + "learning_rate": 7.571213626685592e-05, + "loss": 0.03837184309959411, + "step": 171180 + }, + { + "epoch": 24.299503193754436, + "grad_norm": 1.1822377443313599, + "learning_rate": 7.571071682044003e-05, + "loss": 0.008452177047729492, + "step": 171190 + }, + { + "epoch": 24.300922640170334, + "grad_norm": 0.12417076528072357, + "learning_rate": 7.570929737402413e-05, + "loss": 0.02179790586233139, + "step": 171200 + }, + { + "epoch": 24.302342086586233, + "grad_norm": 10.740368843078613, + "learning_rate": 7.570787792760824e-05, + "loss": 0.04583992958068848, + "step": 171210 + }, + { + "epoch": 24.303761533002127, + "grad_norm": 0.01561831682920456, + "learning_rate": 7.570645848119234e-05, + "loss": 0.017994926869869234, + "step": 171220 + }, + { + "epoch": 24.305180979418026, + "grad_norm": 8.518933296203613, + "learning_rate": 7.570503903477644e-05, + "loss": 0.02884921431541443, + "step": 171230 + }, + { + "epoch": 24.306600425833924, + "grad_norm": 2.047966480255127, + "learning_rate": 7.570361958836055e-05, + "loss": 0.01905878782272339, + "step": 171240 + }, + { + "epoch": 24.308019872249822, + "grad_norm": 4.679335594177246, + "learning_rate": 7.570220014194464e-05, + "loss": 0.025890105962753297, + "step": 171250 + }, + { + "epoch": 24.30943931866572, + "grad_norm": 0.49490824341773987, + "learning_rate": 7.570078069552876e-05, + "loss": 0.04034099578857422, + "step": 171260 + }, + { + "epoch": 24.31085876508162, + "grad_norm": 1.944568395614624, + "learning_rate": 7.569936124911285e-05, + "loss": 0.026923549175262452, + "step": 171270 + }, + { + "epoch": 24.312278211497517, + "grad_norm": 8.678698539733887, + "learning_rate": 7.569794180269695e-05, + "loss": 0.03364227414131164, + "step": 171280 + }, + { + "epoch": 24.313697657913412, + "grad_norm": 0.025599056854844093, + "learning_rate": 7.569652235628105e-05, + "loss": 0.06062678098678589, + "step": 171290 + }, + { + "epoch": 
24.31511710432931, + "grad_norm": 0.03698037564754486, + "learning_rate": 7.569510290986516e-05, + "loss": 0.04037070274353027, + "step": 171300 + }, + { + "epoch": 24.31653655074521, + "grad_norm": 0.5826331973075867, + "learning_rate": 7.569368346344926e-05, + "loss": 0.030045381188392638, + "step": 171310 + }, + { + "epoch": 24.317955997161107, + "grad_norm": 0.3373778760433197, + "learning_rate": 7.569226401703337e-05, + "loss": 0.013584882020950317, + "step": 171320 + }, + { + "epoch": 24.319375443577005, + "grad_norm": 6.711911201477051, + "learning_rate": 7.569084457061746e-05, + "loss": 0.012325920164585114, + "step": 171330 + }, + { + "epoch": 24.320794889992904, + "grad_norm": 6.94309663772583, + "learning_rate": 7.568942512420156e-05, + "loss": 0.029617860913276672, + "step": 171340 + }, + { + "epoch": 24.322214336408802, + "grad_norm": 0.20177099108695984, + "learning_rate": 7.568800567778567e-05, + "loss": 0.01592907905578613, + "step": 171350 + }, + { + "epoch": 24.323633782824697, + "grad_norm": 4.83730411529541, + "learning_rate": 7.568658623136977e-05, + "loss": 0.047815841436386106, + "step": 171360 + }, + { + "epoch": 24.325053229240595, + "grad_norm": 0.07368935644626617, + "learning_rate": 7.568516678495388e-05, + "loss": 0.0050449702888727185, + "step": 171370 + }, + { + "epoch": 24.326472675656493, + "grad_norm": 5.064146041870117, + "learning_rate": 7.568374733853796e-05, + "loss": 0.02245374321937561, + "step": 171380 + }, + { + "epoch": 24.32789212207239, + "grad_norm": 0.02634347230195999, + "learning_rate": 7.568232789212208e-05, + "loss": 0.011611734330654145, + "step": 171390 + }, + { + "epoch": 24.32931156848829, + "grad_norm": 0.11630851030349731, + "learning_rate": 7.568090844570617e-05, + "loss": 0.0057693000882864, + "step": 171400 + }, + { + "epoch": 24.330731014904188, + "grad_norm": 10.611739158630371, + "learning_rate": 7.567948899929028e-05, + "loss": 0.022560597956180574, + "step": 171410 + }, + { + "epoch": 
24.332150461320087, + "grad_norm": 0.22935491800308228, + "learning_rate": 7.567806955287438e-05, + "loss": 0.006095694750547409, + "step": 171420 + }, + { + "epoch": 24.33356990773598, + "grad_norm": 3.480710983276367, + "learning_rate": 7.567665010645848e-05, + "loss": 0.034514331817626955, + "step": 171430 + }, + { + "epoch": 24.33498935415188, + "grad_norm": 0.022428065538406372, + "learning_rate": 7.567523066004259e-05, + "loss": 0.02567698061466217, + "step": 171440 + }, + { + "epoch": 24.336408800567778, + "grad_norm": 13.715996742248535, + "learning_rate": 7.567381121362669e-05, + "loss": 0.009360069036483764, + "step": 171450 + }, + { + "epoch": 24.337828246983676, + "grad_norm": 0.05670325458049774, + "learning_rate": 7.56723917672108e-05, + "loss": 0.007637037336826325, + "step": 171460 + }, + { + "epoch": 24.339247693399575, + "grad_norm": 1.9487965106964111, + "learning_rate": 7.56709723207949e-05, + "loss": 0.005399488657712936, + "step": 171470 + }, + { + "epoch": 24.340667139815473, + "grad_norm": 0.0489615835249424, + "learning_rate": 7.566955287437899e-05, + "loss": 0.017137110233306885, + "step": 171480 + }, + { + "epoch": 24.34208658623137, + "grad_norm": 2.4007930755615234, + "learning_rate": 7.566813342796309e-05, + "loss": 0.015601171553134919, + "step": 171490 + }, + { + "epoch": 24.343506032647266, + "grad_norm": 0.12732060253620148, + "learning_rate": 7.56667139815472e-05, + "loss": 0.00559876412153244, + "step": 171500 + }, + { + "epoch": 24.343506032647266, + "eval_accuracy": 0.9876009410567813, + "eval_loss": 0.04761320352554321, + "eval_runtime": 33.3053, + "eval_samples_per_second": 472.208, + "eval_steps_per_second": 14.772, + "step": 171500 + }, + { + "epoch": 24.344925479063164, + "grad_norm": 0.7616419792175293, + "learning_rate": 7.56652945351313e-05, + "loss": 0.0023174192756414413, + "step": 171510 + }, + { + "epoch": 24.346344925479062, + "grad_norm": 0.0772976279258728, + "learning_rate": 7.566387508871541e-05, + "loss": 
0.016668049991130827, + "step": 171520 + }, + { + "epoch": 24.34776437189496, + "grad_norm": 1.6490932703018188, + "learning_rate": 7.56624556422995e-05, + "loss": 0.019576871395111085, + "step": 171530 + }, + { + "epoch": 24.34918381831086, + "grad_norm": 0.5442006587982178, + "learning_rate": 7.56610361958836e-05, + "loss": 0.047711098194122316, + "step": 171540 + }, + { + "epoch": 24.350603264726757, + "grad_norm": 0.31651896238327026, + "learning_rate": 7.565961674946771e-05, + "loss": 0.0011479377746582032, + "step": 171550 + }, + { + "epoch": 24.352022711142656, + "grad_norm": 0.7583051323890686, + "learning_rate": 7.565819730305181e-05, + "loss": 0.010561546683311463, + "step": 171560 + }, + { + "epoch": 24.35344215755855, + "grad_norm": 0.027747943997383118, + "learning_rate": 7.565677785663592e-05, + "loss": 0.0010014042258262635, + "step": 171570 + }, + { + "epoch": 24.35486160397445, + "grad_norm": 0.9117996692657471, + "learning_rate": 7.565535841022002e-05, + "loss": 0.028149786591529845, + "step": 171580 + }, + { + "epoch": 24.356281050390347, + "grad_norm": 3.782623529434204, + "learning_rate": 7.565393896380412e-05, + "loss": 0.010963347554206849, + "step": 171590 + }, + { + "epoch": 24.357700496806245, + "grad_norm": 0.013087119907140732, + "learning_rate": 7.565251951738822e-05, + "loss": 0.018326695263385772, + "step": 171600 + }, + { + "epoch": 24.359119943222144, + "grad_norm": 0.03415163978934288, + "learning_rate": 7.565110007097233e-05, + "loss": 0.010242117941379547, + "step": 171610 + }, + { + "epoch": 24.360539389638042, + "grad_norm": 0.549580991268158, + "learning_rate": 7.564968062455642e-05, + "loss": 0.01814238876104355, + "step": 171620 + }, + { + "epoch": 24.36195883605394, + "grad_norm": 0.0215440820902586, + "learning_rate": 7.564826117814053e-05, + "loss": 0.009796377271413803, + "step": 171630 + }, + { + "epoch": 24.363378282469835, + "grad_norm": 0.06809904426336288, + "learning_rate": 7.564684173172463e-05, + "loss": 
0.0035464704036712647, + "step": 171640 + }, + { + "epoch": 24.364797728885733, + "grad_norm": 13.471537590026855, + "learning_rate": 7.564542228530873e-05, + "loss": 0.036951324343681334, + "step": 171650 + }, + { + "epoch": 24.36621717530163, + "grad_norm": 0.3744471073150635, + "learning_rate": 7.564400283889284e-05, + "loss": 0.00898383855819702, + "step": 171660 + }, + { + "epoch": 24.36763662171753, + "grad_norm": 0.6272673606872559, + "learning_rate": 7.564258339247694e-05, + "loss": 0.04889333248138428, + "step": 171670 + }, + { + "epoch": 24.36905606813343, + "grad_norm": 0.03984551504254341, + "learning_rate": 7.564116394606105e-05, + "loss": 0.00712449848651886, + "step": 171680 + }, + { + "epoch": 24.370475514549327, + "grad_norm": 0.024507081136107445, + "learning_rate": 7.563974449964513e-05, + "loss": 0.011318628489971162, + "step": 171690 + }, + { + "epoch": 24.371894960965225, + "grad_norm": 0.0023899103980511427, + "learning_rate": 7.563832505322924e-05, + "loss": 0.022556543350219727, + "step": 171700 + }, + { + "epoch": 24.37331440738112, + "grad_norm": 5.964735507965088, + "learning_rate": 7.563690560681334e-05, + "loss": 0.00799437314271927, + "step": 171710 + }, + { + "epoch": 24.374733853797018, + "grad_norm": 0.054741475731134415, + "learning_rate": 7.563548616039745e-05, + "loss": 0.026009556651115418, + "step": 171720 + }, + { + "epoch": 24.376153300212916, + "grad_norm": 0.04695875942707062, + "learning_rate": 7.563406671398155e-05, + "loss": 0.008736077696084976, + "step": 171730 + }, + { + "epoch": 24.377572746628815, + "grad_norm": 0.09234274923801422, + "learning_rate": 7.563264726756565e-05, + "loss": 0.01729717254638672, + "step": 171740 + }, + { + "epoch": 24.378992193044713, + "grad_norm": 0.7623608112335205, + "learning_rate": 7.563122782114976e-05, + "loss": 0.01614385098218918, + "step": 171750 + }, + { + "epoch": 24.38041163946061, + "grad_norm": 0.026332417502999306, + "learning_rate": 7.562980837473385e-05, + "loss": 
0.03324398100376129, + "step": 171760 + }, + { + "epoch": 24.38183108587651, + "grad_norm": 0.06467438489198685, + "learning_rate": 7.562838892831797e-05, + "loss": 0.023391297459602355, + "step": 171770 + }, + { + "epoch": 24.383250532292404, + "grad_norm": 0.004523133393377066, + "learning_rate": 7.562696948190206e-05, + "loss": 0.023759207129478453, + "step": 171780 + }, + { + "epoch": 24.384669978708303, + "grad_norm": 14.442704200744629, + "learning_rate": 7.562555003548616e-05, + "loss": 0.026186758279800416, + "step": 171790 + }, + { + "epoch": 24.3860894251242, + "grad_norm": 0.13622568547725677, + "learning_rate": 7.562413058907026e-05, + "loss": 0.005373586341738701, + "step": 171800 + }, + { + "epoch": 24.3875088715401, + "grad_norm": 0.159526526927948, + "learning_rate": 7.562271114265437e-05, + "loss": 0.03176321685314178, + "step": 171810 + }, + { + "epoch": 24.388928317955997, + "grad_norm": 0.029167456552386284, + "learning_rate": 7.562129169623847e-05, + "loss": 0.0016418952494859696, + "step": 171820 + }, + { + "epoch": 24.390347764371896, + "grad_norm": 3.619213104248047, + "learning_rate": 7.561987224982258e-05, + "loss": 0.016650237143039703, + "step": 171830 + }, + { + "epoch": 24.391767210787794, + "grad_norm": 1.7061997652053833, + "learning_rate": 7.561845280340667e-05, + "loss": 0.006128300726413727, + "step": 171840 + }, + { + "epoch": 24.39318665720369, + "grad_norm": 0.17823182046413422, + "learning_rate": 7.561703335699077e-05, + "loss": 0.017860589921474455, + "step": 171850 + }, + { + "epoch": 24.394606103619587, + "grad_norm": 0.08890421688556671, + "learning_rate": 7.561561391057488e-05, + "loss": 0.03100501298904419, + "step": 171860 + }, + { + "epoch": 24.396025550035485, + "grad_norm": 0.8950623273849487, + "learning_rate": 7.561419446415898e-05, + "loss": 0.014384913444519042, + "step": 171870 + }, + { + "epoch": 24.397444996451384, + "grad_norm": 0.11846330761909485, + "learning_rate": 7.561277501774309e-05, + "loss": 
0.007321345806121826, + "step": 171880 + }, + { + "epoch": 24.398864442867282, + "grad_norm": 0.006659661885350943, + "learning_rate": 7.561135557132719e-05, + "loss": 0.017016442120075227, + "step": 171890 + }, + { + "epoch": 24.40028388928318, + "grad_norm": 0.057772520929574966, + "learning_rate": 7.560993612491129e-05, + "loss": 0.033233410120010375, + "step": 171900 + }, + { + "epoch": 24.40170333569908, + "grad_norm": 0.470302015542984, + "learning_rate": 7.560851667849538e-05, + "loss": 0.011119353771209716, + "step": 171910 + }, + { + "epoch": 24.403122782114973, + "grad_norm": 0.00832580029964447, + "learning_rate": 7.56070972320795e-05, + "loss": 0.0012454293668270112, + "step": 171920 + }, + { + "epoch": 24.40454222853087, + "grad_norm": 0.3036321699619293, + "learning_rate": 7.560567778566359e-05, + "loss": 0.005230455473065376, + "step": 171930 + }, + { + "epoch": 24.40596167494677, + "grad_norm": 0.006322559900581837, + "learning_rate": 7.56042583392477e-05, + "loss": 0.002500082179903984, + "step": 171940 + }, + { + "epoch": 24.40738112136267, + "grad_norm": 13.434318542480469, + "learning_rate": 7.56028388928318e-05, + "loss": 0.02330121248960495, + "step": 171950 + }, + { + "epoch": 24.408800567778567, + "grad_norm": 3.704366445541382, + "learning_rate": 7.56014194464159e-05, + "loss": 0.014177140593528748, + "step": 171960 + }, + { + "epoch": 24.410220014194465, + "grad_norm": 0.12989938259124756, + "learning_rate": 7.560000000000001e-05, + "loss": 0.002335922420024872, + "step": 171970 + }, + { + "epoch": 24.411639460610363, + "grad_norm": 0.9910168051719666, + "learning_rate": 7.55985805535841e-05, + "loss": 0.014373110234737396, + "step": 171980 + }, + { + "epoch": 24.413058907026258, + "grad_norm": 1.589354157447815, + "learning_rate": 7.559716110716822e-05, + "loss": 0.0042845699936151505, + "step": 171990 + }, + { + "epoch": 24.414478353442156, + "grad_norm": 0.020753614604473114, + "learning_rate": 7.55957416607523e-05, + "loss": 
0.0032450903207063677, + "step": 172000 + }, + { + "epoch": 24.414478353442156, + "eval_accuracy": 0.9872194315508361, + "eval_loss": 0.05515184625983238, + "eval_runtime": 34.8672, + "eval_samples_per_second": 451.054, + "eval_steps_per_second": 14.111, + "step": 172000 + }, + { + "epoch": 24.415897799858055, + "grad_norm": 4.205295562744141, + "learning_rate": 7.559432221433641e-05, + "loss": 0.11013655662536621, + "step": 172010 + }, + { + "epoch": 24.417317246273953, + "grad_norm": 0.14868435263633728, + "learning_rate": 7.559290276792051e-05, + "loss": 0.015578103065490723, + "step": 172020 + }, + { + "epoch": 24.41873669268985, + "grad_norm": 11.67628002166748, + "learning_rate": 7.559148332150462e-05, + "loss": 0.008479131758213044, + "step": 172030 + }, + { + "epoch": 24.42015613910575, + "grad_norm": 0.04485812038183212, + "learning_rate": 7.559006387508873e-05, + "loss": 0.012803144752979279, + "step": 172040 + }, + { + "epoch": 24.421575585521648, + "grad_norm": 0.014638926833868027, + "learning_rate": 7.558864442867281e-05, + "loss": 0.04284865856170654, + "step": 172050 + }, + { + "epoch": 24.422995031937543, + "grad_norm": 1.0862561464309692, + "learning_rate": 7.558722498225693e-05, + "loss": 0.007360052317380905, + "step": 172060 + }, + { + "epoch": 24.42441447835344, + "grad_norm": 1.1442761421203613, + "learning_rate": 7.558580553584102e-05, + "loss": 0.042764914035797116, + "step": 172070 + }, + { + "epoch": 24.42583392476934, + "grad_norm": 0.2889437675476074, + "learning_rate": 7.558438608942513e-05, + "loss": 0.037979108095169065, + "step": 172080 + }, + { + "epoch": 24.427253371185238, + "grad_norm": 0.0801275297999382, + "learning_rate": 7.558296664300923e-05, + "loss": 0.002913491800427437, + "step": 172090 + }, + { + "epoch": 24.428672817601136, + "grad_norm": 0.6781374216079712, + "learning_rate": 7.558154719659333e-05, + "loss": 0.03361157476902008, + "step": 172100 + }, + { + "epoch": 24.430092264017034, + "grad_norm": 
5.486020088195801, + "learning_rate": 7.558012775017743e-05, + "loss": 0.012221167981624603, + "step": 172110 + }, + { + "epoch": 24.431511710432932, + "grad_norm": 0.058155838400125504, + "learning_rate": 7.557870830376154e-05, + "loss": 0.02586734890937805, + "step": 172120 + }, + { + "epoch": 24.432931156848827, + "grad_norm": 0.028147872537374496, + "learning_rate": 7.557728885734565e-05, + "loss": 0.022798049449920654, + "step": 172130 + }, + { + "epoch": 24.434350603264726, + "grad_norm": 10.431013107299805, + "learning_rate": 7.557586941092974e-05, + "loss": 0.015030691027641296, + "step": 172140 + }, + { + "epoch": 24.435770049680624, + "grad_norm": 0.022211086004972458, + "learning_rate": 7.557444996451384e-05, + "loss": 0.011979791522026061, + "step": 172150 + }, + { + "epoch": 24.437189496096522, + "grad_norm": 0.20406194031238556, + "learning_rate": 7.557303051809794e-05, + "loss": 0.03270866870880127, + "step": 172160 + }, + { + "epoch": 24.43860894251242, + "grad_norm": 0.04677702486515045, + "learning_rate": 7.557161107168205e-05, + "loss": 0.004667292535305023, + "step": 172170 + }, + { + "epoch": 24.44002838892832, + "grad_norm": 6.069349765777588, + "learning_rate": 7.557019162526615e-05, + "loss": 0.005503764376044273, + "step": 172180 + }, + { + "epoch": 24.441447835344217, + "grad_norm": 0.05811656638979912, + "learning_rate": 7.556877217885026e-05, + "loss": 0.00427556149661541, + "step": 172190 + }, + { + "epoch": 24.442867281760112, + "grad_norm": 0.017314760014414787, + "learning_rate": 7.556735273243434e-05, + "loss": 0.013457880914211273, + "step": 172200 + }, + { + "epoch": 24.44428672817601, + "grad_norm": 0.013891647569835186, + "learning_rate": 7.556593328601845e-05, + "loss": 0.019684898853302, + "step": 172210 + }, + { + "epoch": 24.44570617459191, + "grad_norm": 0.2491278201341629, + "learning_rate": 7.556451383960255e-05, + "loss": 0.00856083482503891, + "step": 172220 + }, + { + "epoch": 24.447125621007807, + "grad_norm": 
1.848828673362732, + "learning_rate": 7.556309439318666e-05, + "loss": 0.00400652177631855, + "step": 172230 + }, + { + "epoch": 24.448545067423705, + "grad_norm": 0.03616641089320183, + "learning_rate": 7.556167494677077e-05, + "loss": 0.014692434668540954, + "step": 172240 + }, + { + "epoch": 24.449964513839603, + "grad_norm": 0.14106939733028412, + "learning_rate": 7.556025550035487e-05, + "loss": 0.0032199986279010774, + "step": 172250 + }, + { + "epoch": 24.4513839602555, + "grad_norm": 0.5355103611946106, + "learning_rate": 7.555883605393897e-05, + "loss": 0.030160155892372132, + "step": 172260 + }, + { + "epoch": 24.4528034066714, + "grad_norm": 12.240964889526367, + "learning_rate": 7.555741660752306e-05, + "loss": 0.031699654459953305, + "step": 172270 + }, + { + "epoch": 24.454222853087295, + "grad_norm": 0.12350873649120331, + "learning_rate": 7.555599716110718e-05, + "loss": 0.015441574156284332, + "step": 172280 + }, + { + "epoch": 24.455642299503193, + "grad_norm": 2.022411823272705, + "learning_rate": 7.555457771469127e-05, + "loss": 0.018189707398414613, + "step": 172290 + }, + { + "epoch": 24.45706174591909, + "grad_norm": 0.6644578576087952, + "learning_rate": 7.555315826827538e-05, + "loss": 0.016485252976417543, + "step": 172300 + }, + { + "epoch": 24.45848119233499, + "grad_norm": 0.6055423617362976, + "learning_rate": 7.555173882185947e-05, + "loss": 0.01698671281337738, + "step": 172310 + }, + { + "epoch": 24.459900638750888, + "grad_norm": 0.07722372561693192, + "learning_rate": 7.555031937544358e-05, + "loss": 0.015683093667030336, + "step": 172320 + }, + { + "epoch": 24.461320085166786, + "grad_norm": 0.013709519989788532, + "learning_rate": 7.554889992902769e-05, + "loss": 0.02126365154981613, + "step": 172330 + }, + { + "epoch": 24.462739531582685, + "grad_norm": 0.004744543693959713, + "learning_rate": 7.554748048261179e-05, + "loss": 0.009096023440361024, + "step": 172340 + }, + { + "epoch": 24.46415897799858, + "grad_norm": 
1.9653034210205078, + "learning_rate": 7.55460610361959e-05, + "loss": 0.0075356349349021915, + "step": 172350 + }, + { + "epoch": 24.465578424414478, + "grad_norm": 0.14765673875808716, + "learning_rate": 7.554464158977998e-05, + "loss": 0.02745935320854187, + "step": 172360 + }, + { + "epoch": 24.466997870830376, + "grad_norm": 0.09819276630878448, + "learning_rate": 7.554322214336409e-05, + "loss": 0.008370977640151978, + "step": 172370 + }, + { + "epoch": 24.468417317246274, + "grad_norm": 8.715452194213867, + "learning_rate": 7.554180269694819e-05, + "loss": 0.02947773039340973, + "step": 172380 + }, + { + "epoch": 24.469836763662173, + "grad_norm": 0.17428545653820038, + "learning_rate": 7.55403832505323e-05, + "loss": 0.0038420304656028746, + "step": 172390 + }, + { + "epoch": 24.47125621007807, + "grad_norm": 0.05542585998773575, + "learning_rate": 7.55389638041164e-05, + "loss": 0.012527038156986237, + "step": 172400 + }, + { + "epoch": 24.47267565649397, + "grad_norm": 3.3686344623565674, + "learning_rate": 7.55375443577005e-05, + "loss": 0.02081933170557022, + "step": 172410 + }, + { + "epoch": 24.474095102909864, + "grad_norm": 2.4575035572052, + "learning_rate": 7.55361249112846e-05, + "loss": 0.03591987490653992, + "step": 172420 + }, + { + "epoch": 24.475514549325762, + "grad_norm": 0.10014995187520981, + "learning_rate": 7.55347054648687e-05, + "loss": 0.02637418806552887, + "step": 172430 + }, + { + "epoch": 24.47693399574166, + "grad_norm": 0.14975695312023163, + "learning_rate": 7.553328601845282e-05, + "loss": 0.05801851749420166, + "step": 172440 + }, + { + "epoch": 24.47835344215756, + "grad_norm": 7.230225086212158, + "learning_rate": 7.553186657203691e-05, + "loss": 0.0037820491939783095, + "step": 172450 + }, + { + "epoch": 24.479772888573457, + "grad_norm": 0.03988121077418327, + "learning_rate": 7.553044712562101e-05, + "loss": 0.039179539680480956, + "step": 172460 + }, + { + "epoch": 24.481192334989355, + "grad_norm": 
0.40523064136505127, + "learning_rate": 7.552902767920511e-05, + "loss": 0.0022327039390802384, + "step": 172470 + }, + { + "epoch": 24.482611781405254, + "grad_norm": 0.02132718823850155, + "learning_rate": 7.552760823278922e-05, + "loss": 0.009044615924358368, + "step": 172480 + }, + { + "epoch": 24.48403122782115, + "grad_norm": 7.030731678009033, + "learning_rate": 7.552618878637332e-05, + "loss": 0.01862950325012207, + "step": 172490 + }, + { + "epoch": 24.485450674237047, + "grad_norm": 0.024500075727701187, + "learning_rate": 7.552476933995743e-05, + "loss": 0.009690459072589874, + "step": 172500 + }, + { + "epoch": 24.485450674237047, + "eval_accuracy": 0.9866471672919184, + "eval_loss": 0.05648936331272125, + "eval_runtime": 35.1279, + "eval_samples_per_second": 447.706, + "eval_steps_per_second": 14.006, + "step": 172500 + }, + { + "epoch": 24.486870120652945, + "grad_norm": 0.04415478929877281, + "learning_rate": 7.552334989354152e-05, + "loss": 0.01055758148431778, + "step": 172510 + }, + { + "epoch": 24.488289567068843, + "grad_norm": 0.27077895402908325, + "learning_rate": 7.552207239176722e-05, + "loss": 0.031713688373565675, + "step": 172520 + }, + { + "epoch": 24.48970901348474, + "grad_norm": 1.6702265739440918, + "learning_rate": 7.55206529453513e-05, + "loss": 0.0102662093937397, + "step": 172530 + }, + { + "epoch": 24.49112845990064, + "grad_norm": 0.3075665235519409, + "learning_rate": 7.551923349893542e-05, + "loss": 0.016057641804218294, + "step": 172540 + }, + { + "epoch": 24.49254790631654, + "grad_norm": 0.04443064704537392, + "learning_rate": 7.551781405251951e-05, + "loss": 0.009007921814918518, + "step": 172550 + }, + { + "epoch": 24.493967352732433, + "grad_norm": 0.2729581594467163, + "learning_rate": 7.551639460610363e-05, + "loss": 0.00744268000125885, + "step": 172560 + }, + { + "epoch": 24.49538679914833, + "grad_norm": 0.035349324345588684, + "learning_rate": 7.551497515968772e-05, + "loss": 0.023080652952194212, + "step": 
172570 + }, + { + "epoch": 24.49680624556423, + "grad_norm": 2.4601495265960693, + "learning_rate": 7.551355571327183e-05, + "loss": 0.011022865772247314, + "step": 172580 + }, + { + "epoch": 24.498225691980128, + "grad_norm": 15.835770606994629, + "learning_rate": 7.551213626685593e-05, + "loss": 0.04426452219486236, + "step": 172590 + }, + { + "epoch": 24.499645138396026, + "grad_norm": 2.3078885078430176, + "learning_rate": 7.551071682044003e-05, + "loss": 0.025807699561119078, + "step": 172600 + }, + { + "epoch": 24.501064584811925, + "grad_norm": 0.08158712089061737, + "learning_rate": 7.550929737402414e-05, + "loss": 0.007269416749477386, + "step": 172610 + }, + { + "epoch": 24.502484031227823, + "grad_norm": 0.032372210174798965, + "learning_rate": 7.550787792760824e-05, + "loss": 0.004850380495190621, + "step": 172620 + }, + { + "epoch": 24.503903477643718, + "grad_norm": 0.12573477625846863, + "learning_rate": 7.550645848119235e-05, + "loss": 0.008511296659708022, + "step": 172630 + }, + { + "epoch": 24.505322924059616, + "grad_norm": 1.4681878089904785, + "learning_rate": 7.550503903477643e-05, + "loss": 0.013103844225406646, + "step": 172640 + }, + { + "epoch": 24.506742370475514, + "grad_norm": 0.19516591727733612, + "learning_rate": 7.550361958836054e-05, + "loss": 0.0056366708129644396, + "step": 172650 + }, + { + "epoch": 24.508161816891413, + "grad_norm": 0.872588574886322, + "learning_rate": 7.550220014194464e-05, + "loss": 0.044895032048225404, + "step": 172660 + }, + { + "epoch": 24.50958126330731, + "grad_norm": 0.05518448352813721, + "learning_rate": 7.550078069552875e-05, + "loss": 0.049129828810691833, + "step": 172670 + }, + { + "epoch": 24.51100070972321, + "grad_norm": 18.374984741210938, + "learning_rate": 7.549936124911285e-05, + "loss": 0.026254922151565552, + "step": 172680 + }, + { + "epoch": 24.512420156139108, + "grad_norm": 2.6585330963134766, + "learning_rate": 7.549794180269695e-05, + "loss": 0.008225750178098679, + "step": 
172690 + }, + { + "epoch": 24.513839602555002, + "grad_norm": 0.13649216294288635, + "learning_rate": 7.549652235628106e-05, + "loss": 0.021029479801654816, + "step": 172700 + }, + { + "epoch": 24.5152590489709, + "grad_norm": 0.022937094792723656, + "learning_rate": 7.549510290986515e-05, + "loss": 0.003818303719162941, + "step": 172710 + }, + { + "epoch": 24.5166784953868, + "grad_norm": 1.2533864974975586, + "learning_rate": 7.549368346344926e-05, + "loss": 0.012846650183200836, + "step": 172720 + }, + { + "epoch": 24.518097941802697, + "grad_norm": 0.11382197588682175, + "learning_rate": 7.549226401703336e-05, + "loss": 0.002881494536995888, + "step": 172730 + }, + { + "epoch": 24.519517388218595, + "grad_norm": 0.17265233397483826, + "learning_rate": 7.549084457061746e-05, + "loss": 0.019856438040733337, + "step": 172740 + }, + { + "epoch": 24.520936834634494, + "grad_norm": 0.010358629748225212, + "learning_rate": 7.548942512420156e-05, + "loss": 0.010412326455116272, + "step": 172750 + }, + { + "epoch": 24.522356281050392, + "grad_norm": 0.028100673109292984, + "learning_rate": 7.548800567778567e-05, + "loss": 0.0008450828492641449, + "step": 172760 + }, + { + "epoch": 24.523775727466287, + "grad_norm": 2.0101466178894043, + "learning_rate": 7.548658623136977e-05, + "loss": 0.011216413229703903, + "step": 172770 + }, + { + "epoch": 24.525195173882185, + "grad_norm": 0.5748875141143799, + "learning_rate": 7.548516678495388e-05, + "loss": 0.009169350564479827, + "step": 172780 + }, + { + "epoch": 24.526614620298083, + "grad_norm": 0.21246570348739624, + "learning_rate": 7.548374733853797e-05, + "loss": 0.003082595020532608, + "step": 172790 + }, + { + "epoch": 24.528034066713982, + "grad_norm": 2.1780924797058105, + "learning_rate": 7.548232789212207e-05, + "loss": 0.004590088874101639, + "step": 172800 + }, + { + "epoch": 24.52945351312988, + "grad_norm": 0.16839826107025146, + "learning_rate": 7.548090844570618e-05, + "loss": 0.01387241780757904, + "step": 
172810 + }, + { + "epoch": 24.53087295954578, + "grad_norm": 0.007299561984837055, + "learning_rate": 7.547948899929028e-05, + "loss": 0.005253856256604195, + "step": 172820 + }, + { + "epoch": 24.532292405961677, + "grad_norm": 0.050835102796554565, + "learning_rate": 7.547806955287439e-05, + "loss": 0.008864811062812806, + "step": 172830 + }, + { + "epoch": 24.53371185237757, + "grad_norm": 21.204370498657227, + "learning_rate": 7.547665010645847e-05, + "loss": 0.014755184948444366, + "step": 172840 + }, + { + "epoch": 24.53513129879347, + "grad_norm": 0.8988128304481506, + "learning_rate": 7.547523066004258e-05, + "loss": 0.009728729724884033, + "step": 172850 + }, + { + "epoch": 24.536550745209368, + "grad_norm": 11.716155052185059, + "learning_rate": 7.547381121362668e-05, + "loss": 0.014095558226108551, + "step": 172860 + }, + { + "epoch": 24.537970191625266, + "grad_norm": 0.05363769084215164, + "learning_rate": 7.547239176721079e-05, + "loss": 0.02228357195854187, + "step": 172870 + }, + { + "epoch": 24.539389638041165, + "grad_norm": 4.397151470184326, + "learning_rate": 7.547097232079489e-05, + "loss": 0.03082396984100342, + "step": 172880 + }, + { + "epoch": 24.540809084457063, + "grad_norm": 0.12464103102684021, + "learning_rate": 7.546955287437899e-05, + "loss": 0.0033983219414949416, + "step": 172890 + }, + { + "epoch": 24.54222853087296, + "grad_norm": 0.03616682067513466, + "learning_rate": 7.54681334279631e-05, + "loss": 0.020311611890792846, + "step": 172900 + }, + { + "epoch": 24.543647977288856, + "grad_norm": 0.9650914669036865, + "learning_rate": 7.54667139815472e-05, + "loss": 0.02631561756134033, + "step": 172910 + }, + { + "epoch": 24.545067423704754, + "grad_norm": 0.014833922497928143, + "learning_rate": 7.546529453513131e-05, + "loss": 0.027789205312728882, + "step": 172920 + }, + { + "epoch": 24.546486870120653, + "grad_norm": 11.547720909118652, + "learning_rate": 7.54638750887154e-05, + "loss": 0.036921563744544986, + "step": 172930 + 
}, + { + "epoch": 24.54790631653655, + "grad_norm": 0.05448067560791969, + "learning_rate": 7.546245564229952e-05, + "loss": 0.04919399619102478, + "step": 172940 + }, + { + "epoch": 24.54932576295245, + "grad_norm": 0.4612639248371124, + "learning_rate": 7.54610361958836e-05, + "loss": 0.009317369759082794, + "step": 172950 + }, + { + "epoch": 24.550745209368348, + "grad_norm": 0.13400031626224518, + "learning_rate": 7.545961674946771e-05, + "loss": 0.032600441575050355, + "step": 172960 + }, + { + "epoch": 24.552164655784246, + "grad_norm": 0.9484822750091553, + "learning_rate": 7.545819730305181e-05, + "loss": 0.024781063199043274, + "step": 172970 + }, + { + "epoch": 24.55358410220014, + "grad_norm": 0.050443921238183975, + "learning_rate": 7.545677785663592e-05, + "loss": 0.029584148526191713, + "step": 172980 + }, + { + "epoch": 24.55500354861604, + "grad_norm": 0.32550525665283203, + "learning_rate": 7.545535841022003e-05, + "loss": 0.0023061655461788177, + "step": 172990 + }, + { + "epoch": 24.556422995031937, + "grad_norm": 1.5090726613998413, + "learning_rate": 7.545393896380411e-05, + "loss": 0.04822568297386169, + "step": 173000 + }, + { + "epoch": 24.556422995031937, + "eval_accuracy": 0.9855662236917403, + "eval_loss": 0.053299061954021454, + "eval_runtime": 32.7496, + "eval_samples_per_second": 480.219, + "eval_steps_per_second": 15.023, + "step": 173000 + }, + { + "epoch": 24.557842441447836, + "grad_norm": 0.10883989930152893, + "learning_rate": 7.545251951738822e-05, + "loss": 0.028909367322921754, + "step": 173010 + }, + { + "epoch": 24.559261887863734, + "grad_norm": 2.4337449073791504, + "learning_rate": 7.545110007097232e-05, + "loss": 0.024964602291584016, + "step": 173020 + }, + { + "epoch": 24.560681334279632, + "grad_norm": 0.7381154894828796, + "learning_rate": 7.544968062455643e-05, + "loss": 0.0008930198848247529, + "step": 173030 + }, + { + "epoch": 24.56210078069553, + "grad_norm": 0.037697162479162216, + "learning_rate": 
7.544826117814053e-05, + "loss": 0.005639796331524849, + "step": 173040 + }, + { + "epoch": 24.563520227111425, + "grad_norm": 0.08479748666286469, + "learning_rate": 7.544684173172463e-05, + "loss": 0.01674344539642334, + "step": 173050 + }, + { + "epoch": 24.564939673527324, + "grad_norm": 2.163802146911621, + "learning_rate": 7.544542228530872e-05, + "loss": 0.004720341786742211, + "step": 173060 + }, + { + "epoch": 24.566359119943222, + "grad_norm": 0.6964181661605835, + "learning_rate": 7.544400283889284e-05, + "loss": 0.024261587858200075, + "step": 173070 + }, + { + "epoch": 24.56777856635912, + "grad_norm": 0.011276614852249622, + "learning_rate": 7.544258339247695e-05, + "loss": 0.004311401769518853, + "step": 173080 + }, + { + "epoch": 24.56919801277502, + "grad_norm": 1.5126656293869019, + "learning_rate": 7.544116394606104e-05, + "loss": 0.008133313804864883, + "step": 173090 + }, + { + "epoch": 24.570617459190917, + "grad_norm": 2.243642807006836, + "learning_rate": 7.543974449964514e-05, + "loss": 0.008069783449172974, + "step": 173100 + }, + { + "epoch": 24.572036905606815, + "grad_norm": 0.12908108532428741, + "learning_rate": 7.543832505322924e-05, + "loss": 0.027231383323669433, + "step": 173110 + }, + { + "epoch": 24.57345635202271, + "grad_norm": 0.17065855860710144, + "learning_rate": 7.543690560681335e-05, + "loss": 0.001540723443031311, + "step": 173120 + }, + { + "epoch": 24.574875798438608, + "grad_norm": 0.03855713829398155, + "learning_rate": 7.543548616039745e-05, + "loss": 0.010275854170322419, + "step": 173130 + }, + { + "epoch": 24.576295244854506, + "grad_norm": 1.6242815256118774, + "learning_rate": 7.543406671398156e-05, + "loss": 0.011269563436508178, + "step": 173140 + }, + { + "epoch": 24.577714691270405, + "grad_norm": 0.020505007356405258, + "learning_rate": 7.543264726756564e-05, + "loss": 0.004435010999441147, + "step": 173150 + }, + { + "epoch": 24.579134137686303, + "grad_norm": 0.26409074664115906, + "learning_rate": 
7.543122782114975e-05, + "loss": 0.0259554922580719, + "step": 173160 + }, + { + "epoch": 24.5805535841022, + "grad_norm": 2.699275016784668, + "learning_rate": 7.542980837473386e-05, + "loss": 0.011736828088760375, + "step": 173170 + }, + { + "epoch": 24.5819730305181, + "grad_norm": 6.434495449066162, + "learning_rate": 7.542838892831796e-05, + "loss": 0.019031801819801332, + "step": 173180 + }, + { + "epoch": 24.583392476933994, + "grad_norm": 0.16141745448112488, + "learning_rate": 7.542696948190207e-05, + "loss": 0.02666536271572113, + "step": 173190 + }, + { + "epoch": 24.584811923349893, + "grad_norm": 0.297879695892334, + "learning_rate": 7.542555003548616e-05, + "loss": 0.03981755375862121, + "step": 173200 + }, + { + "epoch": 24.58623136976579, + "grad_norm": 2.3711814880371094, + "learning_rate": 7.542413058907027e-05, + "loss": 0.02553957998752594, + "step": 173210 + }, + { + "epoch": 24.58765081618169, + "grad_norm": 0.053933609277009964, + "learning_rate": 7.542271114265436e-05, + "loss": 0.015029634535312652, + "step": 173220 + }, + { + "epoch": 24.589070262597588, + "grad_norm": 0.016468452289700508, + "learning_rate": 7.542129169623847e-05, + "loss": 0.006254653632640839, + "step": 173230 + }, + { + "epoch": 24.590489709013486, + "grad_norm": 0.0055716149508953094, + "learning_rate": 7.541987224982257e-05, + "loss": 0.00433087982237339, + "step": 173240 + }, + { + "epoch": 24.591909155429384, + "grad_norm": 0.099225252866745, + "learning_rate": 7.541845280340667e-05, + "loss": 0.009511874616146087, + "step": 173250 + }, + { + "epoch": 24.59332860184528, + "grad_norm": 0.27751070261001587, + "learning_rate": 7.541703335699078e-05, + "loss": 0.007168865203857422, + "step": 173260 + }, + { + "epoch": 24.594748048261177, + "grad_norm": 0.08781001716852188, + "learning_rate": 7.541561391057488e-05, + "loss": 0.002042149007320404, + "step": 173270 + }, + { + "epoch": 24.596167494677076, + "grad_norm": 0.12209714949131012, + "learning_rate": 
7.541419446415899e-05, + "loss": 0.026114165782928467, + "step": 173280 + }, + { + "epoch": 24.597586941092974, + "grad_norm": 9.877880096435547, + "learning_rate": 7.541277501774309e-05, + "loss": 0.005551735684275627, + "step": 173290 + }, + { + "epoch": 24.599006387508872, + "grad_norm": 0.003487059148028493, + "learning_rate": 7.54113555713272e-05, + "loss": 0.02010783851146698, + "step": 173300 + }, + { + "epoch": 24.60042583392477, + "grad_norm": 0.39019468426704407, + "learning_rate": 7.540993612491128e-05, + "loss": 0.007010398805141449, + "step": 173310 + }, + { + "epoch": 24.60184528034067, + "grad_norm": 5.931338310241699, + "learning_rate": 7.540851667849539e-05, + "loss": 0.005229796096682548, + "step": 173320 + }, + { + "epoch": 24.603264726756564, + "grad_norm": 0.3461534082889557, + "learning_rate": 7.540709723207949e-05, + "loss": 0.015190723538398742, + "step": 173330 + }, + { + "epoch": 24.604684173172462, + "grad_norm": 6.468252658843994, + "learning_rate": 7.54056777856636e-05, + "loss": 0.029922986030578615, + "step": 173340 + }, + { + "epoch": 24.60610361958836, + "grad_norm": 0.11011413484811783, + "learning_rate": 7.54042583392477e-05, + "loss": 0.021273039281368256, + "step": 173350 + }, + { + "epoch": 24.60752306600426, + "grad_norm": 7.940593719482422, + "learning_rate": 7.54028388928318e-05, + "loss": 0.009889286756515504, + "step": 173360 + }, + { + "epoch": 24.608942512420157, + "grad_norm": 0.3290795385837555, + "learning_rate": 7.54014194464159e-05, + "loss": 0.014547610282897949, + "step": 173370 + }, + { + "epoch": 24.610361958836055, + "grad_norm": 1.2322243452072144, + "learning_rate": 7.54e-05, + "loss": 0.02730262279510498, + "step": 173380 + }, + { + "epoch": 24.611781405251953, + "grad_norm": 0.06572465598583221, + "learning_rate": 7.539858055358411e-05, + "loss": 0.004366625845432281, + "step": 173390 + }, + { + "epoch": 24.613200851667848, + "grad_norm": 0.0385855995118618, + "learning_rate": 7.539716110716821e-05, + 
"loss": 0.017726033926010132, + "step": 173400 + }, + { + "epoch": 24.614620298083747, + "grad_norm": 3.4153127670288086, + "learning_rate": 7.539574166075231e-05, + "loss": 0.06881921291351319, + "step": 173410 + }, + { + "epoch": 24.616039744499645, + "grad_norm": 0.07019100338220596, + "learning_rate": 7.53943222143364e-05, + "loss": 0.01205579936504364, + "step": 173420 + }, + { + "epoch": 24.617459190915543, + "grad_norm": 0.8339296579360962, + "learning_rate": 7.539290276792052e-05, + "loss": 0.03111276626586914, + "step": 173430 + }, + { + "epoch": 24.61887863733144, + "grad_norm": 4.628482341766357, + "learning_rate": 7.539148332150461e-05, + "loss": 0.04865820109844208, + "step": 173440 + }, + { + "epoch": 24.62029808374734, + "grad_norm": 0.006172515917569399, + "learning_rate": 7.539006387508873e-05, + "loss": 0.0114622563123703, + "step": 173450 + }, + { + "epoch": 24.621717530163238, + "grad_norm": 0.2510690689086914, + "learning_rate": 7.538864442867282e-05, + "loss": 0.006974605470895767, + "step": 173460 + }, + { + "epoch": 24.623136976579133, + "grad_norm": 1.2571231126785278, + "learning_rate": 7.538722498225692e-05, + "loss": 0.03992668688297272, + "step": 173470 + }, + { + "epoch": 24.62455642299503, + "grad_norm": 0.004121400415897369, + "learning_rate": 7.538580553584103e-05, + "loss": 0.029168635606765747, + "step": 173480 + }, + { + "epoch": 24.62597586941093, + "grad_norm": 6.808797836303711, + "learning_rate": 7.538438608942513e-05, + "loss": 0.008000247180461884, + "step": 173490 + }, + { + "epoch": 24.627395315826828, + "grad_norm": 0.21779268980026245, + "learning_rate": 7.538296664300924e-05, + "loss": 0.008310173451900483, + "step": 173500 + }, + { + "epoch": 24.627395315826828, + "eval_accuracy": 0.9817511286322884, + "eval_loss": 0.07154663652181625, + "eval_runtime": 33.2848, + "eval_samples_per_second": 472.498, + "eval_steps_per_second": 14.782, + "step": 173500 + }, + { + "epoch": 24.628814762242726, + "grad_norm": 
7.705733776092529, + "learning_rate": 7.538154719659332e-05, + "loss": 0.019583386182785035, + "step": 173510 + }, + { + "epoch": 24.630234208658624, + "grad_norm": 2.811020851135254, + "learning_rate": 7.538012775017743e-05, + "loss": 0.01926662027835846, + "step": 173520 + }, + { + "epoch": 24.631653655074523, + "grad_norm": 1.1018458604812622, + "learning_rate": 7.537870830376153e-05, + "loss": 0.014549502730369568, + "step": 173530 + }, + { + "epoch": 24.633073101490417, + "grad_norm": 1.3479515314102173, + "learning_rate": 7.537728885734564e-05, + "loss": 0.01471194177865982, + "step": 173540 + }, + { + "epoch": 24.634492547906316, + "grad_norm": 0.034686602652072906, + "learning_rate": 7.537586941092974e-05, + "loss": 0.00515458956360817, + "step": 173550 + }, + { + "epoch": 24.635911994322214, + "grad_norm": 0.09343839436769485, + "learning_rate": 7.537444996451384e-05, + "loss": 0.015272516012191772, + "step": 173560 + }, + { + "epoch": 24.637331440738112, + "grad_norm": 0.8138301968574524, + "learning_rate": 7.537303051809795e-05, + "loss": 0.009325875341892243, + "step": 173570 + }, + { + "epoch": 24.63875088715401, + "grad_norm": 0.0479440875351429, + "learning_rate": 7.537161107168205e-05, + "loss": 0.0012496449053287506, + "step": 173580 + }, + { + "epoch": 24.64017033356991, + "grad_norm": 1.3075040578842163, + "learning_rate": 7.537019162526616e-05, + "loss": 0.01431242674589157, + "step": 173590 + }, + { + "epoch": 24.641589779985807, + "grad_norm": 3.1167502403259277, + "learning_rate": 7.536877217885025e-05, + "loss": 0.006149962544441223, + "step": 173600 + }, + { + "epoch": 24.643009226401702, + "grad_norm": 1.778635025024414, + "learning_rate": 7.536735273243435e-05, + "loss": 0.009568555653095246, + "step": 173610 + }, + { + "epoch": 24.6444286728176, + "grad_norm": 0.05880541726946831, + "learning_rate": 7.536593328601845e-05, + "loss": 0.005240377783775329, + "step": 173620 + }, + { + "epoch": 24.6458481192335, + "grad_norm": 
0.08257175236940384, + "learning_rate": 7.536451383960256e-05, + "loss": 0.042884424328804016, + "step": 173630 + }, + { + "epoch": 24.647267565649397, + "grad_norm": 4.259369373321533, + "learning_rate": 7.536309439318666e-05, + "loss": 0.019749659299850463, + "step": 173640 + }, + { + "epoch": 24.648687012065295, + "grad_norm": 0.010093705728650093, + "learning_rate": 7.536167494677077e-05, + "loss": 0.026267236471176146, + "step": 173650 + }, + { + "epoch": 24.650106458481194, + "grad_norm": 0.43593189120292664, + "learning_rate": 7.536025550035487e-05, + "loss": 0.001036933809518814, + "step": 173660 + }, + { + "epoch": 24.651525904897092, + "grad_norm": 1.1529186964035034, + "learning_rate": 7.535883605393896e-05, + "loss": 0.01952963471412659, + "step": 173670 + }, + { + "epoch": 24.652945351312987, + "grad_norm": 0.5616307854652405, + "learning_rate": 7.535741660752307e-05, + "loss": 0.0015532765537500381, + "step": 173680 + }, + { + "epoch": 24.654364797728885, + "grad_norm": 0.1121378093957901, + "learning_rate": 7.535599716110717e-05, + "loss": 0.019932952523231507, + "step": 173690 + }, + { + "epoch": 24.655784244144783, + "grad_norm": 0.1303601861000061, + "learning_rate": 7.535457771469128e-05, + "loss": 0.01972169578075409, + "step": 173700 + }, + { + "epoch": 24.65720369056068, + "grad_norm": 7.903414249420166, + "learning_rate": 7.535315826827538e-05, + "loss": 0.012403970211744308, + "step": 173710 + }, + { + "epoch": 24.65862313697658, + "grad_norm": 0.010363122448325157, + "learning_rate": 7.535173882185948e-05, + "loss": 0.04594444036483765, + "step": 173720 + }, + { + "epoch": 24.660042583392478, + "grad_norm": 0.1596154123544693, + "learning_rate": 7.535031937544357e-05, + "loss": 0.009175486117601394, + "step": 173730 + }, + { + "epoch": 24.661462029808376, + "grad_norm": 0.02869066223502159, + "learning_rate": 7.534889992902768e-05, + "loss": 0.01546778976917267, + "step": 173740 + }, + { + "epoch": 24.66288147622427, + "grad_norm": 
0.034056100994348526, + "learning_rate": 7.534748048261178e-05, + "loss": 0.009404253959655762, + "step": 173750 + }, + { + "epoch": 24.66430092264017, + "grad_norm": 5.095276355743408, + "learning_rate": 7.534606103619589e-05, + "loss": 0.017928995192050934, + "step": 173760 + }, + { + "epoch": 24.665720369056068, + "grad_norm": 0.02879653126001358, + "learning_rate": 7.534464158977999e-05, + "loss": 0.020333236455917357, + "step": 173770 + }, + { + "epoch": 24.667139815471966, + "grad_norm": 0.02906031720340252, + "learning_rate": 7.534322214336409e-05, + "loss": 0.01710437089204788, + "step": 173780 + }, + { + "epoch": 24.668559261887864, + "grad_norm": 0.0936087816953659, + "learning_rate": 7.53418026969482e-05, + "loss": 0.017091310024261473, + "step": 173790 + }, + { + "epoch": 24.669978708303763, + "grad_norm": 0.10297982394695282, + "learning_rate": 7.53403832505323e-05, + "loss": 0.008067121356725692, + "step": 173800 + }, + { + "epoch": 24.67139815471966, + "grad_norm": 0.019238462671637535, + "learning_rate": 7.533896380411641e-05, + "loss": 0.02637377679347992, + "step": 173810 + }, + { + "epoch": 24.672817601135556, + "grad_norm": 0.004669790156185627, + "learning_rate": 7.533754435770049e-05, + "loss": 0.013121181726455688, + "step": 173820 + }, + { + "epoch": 24.674237047551454, + "grad_norm": 0.160783052444458, + "learning_rate": 7.53361249112846e-05, + "loss": 0.03263942003250122, + "step": 173830 + }, + { + "epoch": 24.675656493967352, + "grad_norm": 0.06396448612213135, + "learning_rate": 7.53347054648687e-05, + "loss": 0.018804968893527986, + "step": 173840 + }, + { + "epoch": 24.67707594038325, + "grad_norm": 0.20103542506694794, + "learning_rate": 7.533328601845281e-05, + "loss": 0.019319495558738707, + "step": 173850 + }, + { + "epoch": 24.67849538679915, + "grad_norm": 0.051260966807603836, + "learning_rate": 7.533186657203691e-05, + "loss": 0.015797223150730132, + "step": 173860 + }, + { + "epoch": 24.679914833215047, + "grad_norm": 
0.2959256172180176, + "learning_rate": 7.5330447125621e-05, + "loss": 0.022255034744739534, + "step": 173870 + }, + { + "epoch": 24.681334279630946, + "grad_norm": 3.666099786758423, + "learning_rate": 7.532902767920512e-05, + "loss": 0.008968500047922134, + "step": 173880 + }, + { + "epoch": 24.68275372604684, + "grad_norm": 0.026177098974585533, + "learning_rate": 7.532760823278921e-05, + "loss": 0.02103397697210312, + "step": 173890 + }, + { + "epoch": 24.68417317246274, + "grad_norm": 13.08432674407959, + "learning_rate": 7.532618878637332e-05, + "loss": 0.03584373593330383, + "step": 173900 + }, + { + "epoch": 24.685592618878637, + "grad_norm": 7.127645969390869, + "learning_rate": 7.532476933995742e-05, + "loss": 0.019541648030281068, + "step": 173910 + }, + { + "epoch": 24.687012065294535, + "grad_norm": 0.8689559698104858, + "learning_rate": 7.532334989354152e-05, + "loss": 0.034372559189796446, + "step": 173920 + }, + { + "epoch": 24.688431511710434, + "grad_norm": 15.407941818237305, + "learning_rate": 7.532193044712562e-05, + "loss": 0.042028778791427614, + "step": 173930 + }, + { + "epoch": 24.689850958126332, + "grad_norm": 6.145047187805176, + "learning_rate": 7.532051100070973e-05, + "loss": 0.06301599740982056, + "step": 173940 + }, + { + "epoch": 24.69127040454223, + "grad_norm": 0.029459280893206596, + "learning_rate": 7.531909155429382e-05, + "loss": 0.025892430543899538, + "step": 173950 + }, + { + "epoch": 24.692689850958125, + "grad_norm": 0.8161712884902954, + "learning_rate": 7.531767210787794e-05, + "loss": 0.018246373534202574, + "step": 173960 + }, + { + "epoch": 24.694109297374023, + "grad_norm": 0.2843663990497589, + "learning_rate": 7.531625266146203e-05, + "loss": 0.012884204089641572, + "step": 173970 + }, + { + "epoch": 24.69552874378992, + "grad_norm": 0.05580803006887436, + "learning_rate": 7.531483321504613e-05, + "loss": 0.01538470983505249, + "step": 173980 + }, + { + "epoch": 24.69694819020582, + "grad_norm": 
0.1336229145526886, + "learning_rate": 7.531341376863024e-05, + "loss": 0.006511572003364563, + "step": 173990 + }, + { + "epoch": 24.698367636621718, + "grad_norm": 0.0070353541523218155, + "learning_rate": 7.531199432221434e-05, + "loss": 0.037652122974395755, + "step": 174000 + }, + { + "epoch": 24.698367636621718, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.05611168593168259, + "eval_runtime": 33.3009, + "eval_samples_per_second": 472.269, + "eval_steps_per_second": 14.774, + "step": 174000 + }, + { + "epoch": 24.699787083037616, + "grad_norm": 13.81317138671875, + "learning_rate": 7.531057487579845e-05, + "loss": 0.025649493932723998, + "step": 174010 + }, + { + "epoch": 24.701206529453515, + "grad_norm": 0.023678680881857872, + "learning_rate": 7.530915542938255e-05, + "loss": 0.0018464617431163787, + "step": 174020 + }, + { + "epoch": 24.70262597586941, + "grad_norm": 0.36120280623435974, + "learning_rate": 7.530773598296664e-05, + "loss": 0.019840852916240694, + "step": 174030 + }, + { + "epoch": 24.704045422285308, + "grad_norm": 4.366157531738281, + "learning_rate": 7.530631653655074e-05, + "loss": 0.012384282052516937, + "step": 174040 + }, + { + "epoch": 24.705464868701206, + "grad_norm": 0.3357887864112854, + "learning_rate": 7.530489709013485e-05, + "loss": 0.008448261767625809, + "step": 174050 + }, + { + "epoch": 24.706884315117104, + "grad_norm": 0.0734194815158844, + "learning_rate": 7.530347764371895e-05, + "loss": 0.012040060758590699, + "step": 174060 + }, + { + "epoch": 24.708303761533003, + "grad_norm": 0.18158185482025146, + "learning_rate": 7.530205819730306e-05, + "loss": 0.02122834324836731, + "step": 174070 + }, + { + "epoch": 24.7097232079489, + "grad_norm": 0.5702908039093018, + "learning_rate": 7.530063875088716e-05, + "loss": 0.015496766567230225, + "step": 174080 + }, + { + "epoch": 24.7111426543648, + "grad_norm": 2.1380672454833984, + "learning_rate": 7.529921930447126e-05, + "loss": 0.004717253148555756, + "step": 
174090 + }, + { + "epoch": 24.712562100780694, + "grad_norm": 10.769475936889648, + "learning_rate": 7.529779985805537e-05, + "loss": 0.04724853038787842, + "step": 174100 + }, + { + "epoch": 24.713981547196592, + "grad_norm": 0.9456741213798523, + "learning_rate": 7.529638041163946e-05, + "loss": 0.008170842379331588, + "step": 174110 + }, + { + "epoch": 24.71540099361249, + "grad_norm": 0.4752744734287262, + "learning_rate": 7.529496096522357e-05, + "loss": 0.0060107138007879255, + "step": 174120 + }, + { + "epoch": 24.71682044002839, + "grad_norm": 1.646991491317749, + "learning_rate": 7.529354151880766e-05, + "loss": 0.004364002123475074, + "step": 174130 + }, + { + "epoch": 24.718239886444287, + "grad_norm": 0.06133211776614189, + "learning_rate": 7.529212207239177e-05, + "loss": 0.015852633118629455, + "step": 174140 + }, + { + "epoch": 24.719659332860186, + "grad_norm": 1.9032034873962402, + "learning_rate": 7.529070262597587e-05, + "loss": 0.038377884030342105, + "step": 174150 + }, + { + "epoch": 24.721078779276084, + "grad_norm": 0.2995649576187134, + "learning_rate": 7.528928317955998e-05, + "loss": 0.05668915510177612, + "step": 174160 + }, + { + "epoch": 24.72249822569198, + "grad_norm": 0.023115361109375954, + "learning_rate": 7.528786373314408e-05, + "loss": 0.031367212533950806, + "step": 174170 + }, + { + "epoch": 24.723917672107877, + "grad_norm": 7.3869218826293945, + "learning_rate": 7.528644428672817e-05, + "loss": 0.0047724649310112, + "step": 174180 + }, + { + "epoch": 24.725337118523775, + "grad_norm": 3.0998125076293945, + "learning_rate": 7.528502484031228e-05, + "loss": 0.01514744758605957, + "step": 174190 + }, + { + "epoch": 24.726756564939674, + "grad_norm": 0.28923550248146057, + "learning_rate": 7.528360539389638e-05, + "loss": 0.018315699696540833, + "step": 174200 + }, + { + "epoch": 24.728176011355572, + "grad_norm": 0.0283610038459301, + "learning_rate": 7.528218594748049e-05, + "loss": 0.01048092320561409, + "step": 174210 + }, 
+ { + "epoch": 24.72959545777147, + "grad_norm": 0.19790737330913544, + "learning_rate": 7.528076650106459e-05, + "loss": 0.036326315999031064, + "step": 174220 + }, + { + "epoch": 24.73101490418737, + "grad_norm": 0.1673627644777298, + "learning_rate": 7.527934705464869e-05, + "loss": 0.003476467356085777, + "step": 174230 + }, + { + "epoch": 24.732434350603263, + "grad_norm": 1.00002920627594, + "learning_rate": 7.527792760823278e-05, + "loss": 0.038858985900878905, + "step": 174240 + }, + { + "epoch": 24.73385379701916, + "grad_norm": 0.051591239869594574, + "learning_rate": 7.52765081618169e-05, + "loss": 0.04238108396530151, + "step": 174250 + }, + { + "epoch": 24.73527324343506, + "grad_norm": 0.10495876520872116, + "learning_rate": 7.527508871540099e-05, + "loss": 0.008474375307559966, + "step": 174260 + }, + { + "epoch": 24.73669268985096, + "grad_norm": 0.03314083442091942, + "learning_rate": 7.52736692689851e-05, + "loss": 0.0045479096472263334, + "step": 174270 + }, + { + "epoch": 24.738112136266857, + "grad_norm": 0.0293995700776577, + "learning_rate": 7.52722498225692e-05, + "loss": 0.017963461577892303, + "step": 174280 + }, + { + "epoch": 24.739531582682755, + "grad_norm": 0.07112109661102295, + "learning_rate": 7.52708303761533e-05, + "loss": 0.011599650979042054, + "step": 174290 + }, + { + "epoch": 24.740951029098653, + "grad_norm": 0.008821835741400719, + "learning_rate": 7.526941092973741e-05, + "loss": 0.022014503180980683, + "step": 174300 + }, + { + "epoch": 24.742370475514548, + "grad_norm": 0.0797174721956253, + "learning_rate": 7.52679914833215e-05, + "loss": 0.0019339270889759063, + "step": 174310 + }, + { + "epoch": 24.743789921930446, + "grad_norm": 0.021719349548220634, + "learning_rate": 7.526657203690562e-05, + "loss": 0.003751084581017494, + "step": 174320 + }, + { + "epoch": 24.745209368346345, + "grad_norm": 0.029782230034470558, + "learning_rate": 7.526515259048971e-05, + "loss": 0.015615200996398926, + "step": 174330 + }, + { + 
"epoch": 24.746628814762243, + "grad_norm": 0.03500068932771683, + "learning_rate": 7.526373314407381e-05, + "loss": 0.00558471716940403, + "step": 174340 + }, + { + "epoch": 24.74804826117814, + "grad_norm": 0.029169904068112373, + "learning_rate": 7.526231369765791e-05, + "loss": 0.017357069253921508, + "step": 174350 + }, + { + "epoch": 24.74946770759404, + "grad_norm": 7.586524963378906, + "learning_rate": 7.526089425124202e-05, + "loss": 0.014643791317939758, + "step": 174360 + }, + { + "epoch": 24.750887154009938, + "grad_norm": 0.07049157470464706, + "learning_rate": 7.525947480482612e-05, + "loss": 0.029498291015625, + "step": 174370 + }, + { + "epoch": 24.752306600425833, + "grad_norm": 0.6616369485855103, + "learning_rate": 7.525805535841023e-05, + "loss": 0.006075846776366234, + "step": 174380 + }, + { + "epoch": 24.75372604684173, + "grad_norm": 11.370431900024414, + "learning_rate": 7.525663591199433e-05, + "loss": 0.012667596340179443, + "step": 174390 + }, + { + "epoch": 24.75514549325763, + "grad_norm": 8.695616722106934, + "learning_rate": 7.525521646557842e-05, + "loss": 0.004614725708961487, + "step": 174400 + }, + { + "epoch": 24.756564939673527, + "grad_norm": 0.9564875364303589, + "learning_rate": 7.525379701916253e-05, + "loss": 0.016885387897491454, + "step": 174410 + }, + { + "epoch": 24.757984386089426, + "grad_norm": 0.3337288796901703, + "learning_rate": 7.525237757274663e-05, + "loss": 0.007147558778524399, + "step": 174420 + }, + { + "epoch": 24.759403832505324, + "grad_norm": 0.04963868483901024, + "learning_rate": 7.525095812633074e-05, + "loss": 0.00917949080467224, + "step": 174430 + }, + { + "epoch": 24.760823278921222, + "grad_norm": 0.10429813712835312, + "learning_rate": 7.524953867991483e-05, + "loss": 0.025586032867431642, + "step": 174440 + }, + { + "epoch": 24.762242725337117, + "grad_norm": 8.712050437927246, + "learning_rate": 7.524811923349894e-05, + "loss": 0.04460093677043915, + "step": 174450 + }, + { + "epoch": 
24.763662171753015, + "grad_norm": 15.997976303100586, + "learning_rate": 7.524669978708303e-05, + "loss": 0.07125446796417237, + "step": 174460 + }, + { + "epoch": 24.765081618168914, + "grad_norm": 0.06645198911428452, + "learning_rate": 7.524528034066715e-05, + "loss": 0.010012517869472503, + "step": 174470 + }, + { + "epoch": 24.766501064584812, + "grad_norm": 0.011653372086584568, + "learning_rate": 7.524386089425126e-05, + "loss": 0.025356784462928772, + "step": 174480 + }, + { + "epoch": 24.76792051100071, + "grad_norm": 0.5109019875526428, + "learning_rate": 7.524244144783534e-05, + "loss": 0.03153618574142456, + "step": 174490 + }, + { + "epoch": 24.76933995741661, + "grad_norm": 0.04092046990990639, + "learning_rate": 7.524102200141945e-05, + "loss": 0.047999075055122374, + "step": 174500 + }, + { + "epoch": 24.76933995741661, + "eval_accuracy": 0.9898899980924525, + "eval_loss": 0.03449544683098793, + "eval_runtime": 33.2563, + "eval_samples_per_second": 472.904, + "eval_steps_per_second": 14.794, + "step": 174500 + }, + { + "epoch": 24.770759403832507, + "grad_norm": 9.587790489196777, + "learning_rate": 7.523960255500355e-05, + "loss": 0.01698211133480072, + "step": 174510 + }, + { + "epoch": 24.7721788502484, + "grad_norm": 0.031794656068086624, + "learning_rate": 7.523818310858766e-05, + "loss": 0.008427159488201141, + "step": 174520 + }, + { + "epoch": 24.7735982966643, + "grad_norm": 13.463812828063965, + "learning_rate": 7.523676366217176e-05, + "loss": 0.0057944685220718386, + "step": 174530 + }, + { + "epoch": 24.7750177430802, + "grad_norm": 0.04302189499139786, + "learning_rate": 7.523534421575585e-05, + "loss": 0.004958771914243698, + "step": 174540 + }, + { + "epoch": 24.776437189496097, + "grad_norm": 0.8411322236061096, + "learning_rate": 7.523392476933995e-05, + "loss": 0.028694337606430052, + "step": 174550 + }, + { + "epoch": 24.777856635911995, + "grad_norm": 5.27100133895874, + "learning_rate": 7.523250532292406e-05, + "loss": 
0.0034240961074829103, + "step": 174560 + }, + { + "epoch": 24.779276082327893, + "grad_norm": 1.7790260314941406, + "learning_rate": 7.523108587650817e-05, + "loss": 0.01732173264026642, + "step": 174570 + }, + { + "epoch": 24.78069552874379, + "grad_norm": 1.320960521697998, + "learning_rate": 7.522966643009227e-05, + "loss": 0.009014591574668884, + "step": 174580 + }, + { + "epoch": 24.782114975159686, + "grad_norm": 0.05678262934088707, + "learning_rate": 7.522824698367637e-05, + "loss": 0.003004784509539604, + "step": 174590 + }, + { + "epoch": 24.783534421575585, + "grad_norm": 0.016186537221074104, + "learning_rate": 7.522682753726047e-05, + "loss": 0.03396380543708801, + "step": 174600 + }, + { + "epoch": 24.784953867991483, + "grad_norm": 14.311001777648926, + "learning_rate": 7.522540809084458e-05, + "loss": 0.04358760118484497, + "step": 174610 + }, + { + "epoch": 24.78637331440738, + "grad_norm": 0.1282660812139511, + "learning_rate": 7.522398864442867e-05, + "loss": 0.00944647341966629, + "step": 174620 + }, + { + "epoch": 24.78779276082328, + "grad_norm": 1.1548993587493896, + "learning_rate": 7.522256919801278e-05, + "loss": 0.004563160240650177, + "step": 174630 + }, + { + "epoch": 24.789212207239178, + "grad_norm": 0.33657294511795044, + "learning_rate": 7.522114975159687e-05, + "loss": 0.010202559828758239, + "step": 174640 + }, + { + "epoch": 24.790631653655076, + "grad_norm": 8.293270111083984, + "learning_rate": 7.521973030518098e-05, + "loss": 0.017596372961997987, + "step": 174650 + }, + { + "epoch": 24.79205110007097, + "grad_norm": 6.582569599151611, + "learning_rate": 7.521831085876509e-05, + "loss": 0.027582597732543946, + "step": 174660 + }, + { + "epoch": 24.79347054648687, + "grad_norm": 0.643806517124176, + "learning_rate": 7.521689141234919e-05, + "loss": 0.011861677467823028, + "step": 174670 + }, + { + "epoch": 24.794889992902768, + "grad_norm": 0.7834032773971558, + "learning_rate": 7.52154719659333e-05, + "loss": 
0.004603806510567665, + "step": 174680 + }, + { + "epoch": 24.796309439318666, + "grad_norm": 7.0120038986206055, + "learning_rate": 7.52140525195174e-05, + "loss": 0.015449795126914977, + "step": 174690 + }, + { + "epoch": 24.797728885734564, + "grad_norm": 10.408753395080566, + "learning_rate": 7.52126330731015e-05, + "loss": 0.015052142739295959, + "step": 174700 + }, + { + "epoch": 24.799148332150462, + "grad_norm": 0.20423302054405212, + "learning_rate": 7.521121362668559e-05, + "loss": 0.023136843740940095, + "step": 174710 + }, + { + "epoch": 24.80056777856636, + "grad_norm": 0.006995536852627993, + "learning_rate": 7.52097941802697e-05, + "loss": 0.011541590094566345, + "step": 174720 + }, + { + "epoch": 24.801987224982255, + "grad_norm": 0.058341626077890396, + "learning_rate": 7.52083747338538e-05, + "loss": 0.007179992645978928, + "step": 174730 + }, + { + "epoch": 24.803406671398154, + "grad_norm": 0.012762105092406273, + "learning_rate": 7.520695528743791e-05, + "loss": 0.003169621527194977, + "step": 174740 + }, + { + "epoch": 24.804826117814052, + "grad_norm": 2.673100709915161, + "learning_rate": 7.520553584102201e-05, + "loss": 0.018645356595516204, + "step": 174750 + }, + { + "epoch": 24.80624556422995, + "grad_norm": 0.09460142254829407, + "learning_rate": 7.52041163946061e-05, + "loss": 0.023665910959243773, + "step": 174760 + }, + { + "epoch": 24.80766501064585, + "grad_norm": 0.01073943916708231, + "learning_rate": 7.520269694819022e-05, + "loss": 0.007674752175807953, + "step": 174770 + }, + { + "epoch": 24.809084457061747, + "grad_norm": 0.04801369458436966, + "learning_rate": 7.520127750177431e-05, + "loss": 0.01689731776714325, + "step": 174780 + }, + { + "epoch": 24.810503903477645, + "grad_norm": 0.04774191603064537, + "learning_rate": 7.519985805535842e-05, + "loss": 0.0017985638231039047, + "step": 174790 + }, + { + "epoch": 24.81192334989354, + "grad_norm": 0.1210622638463974, + "learning_rate": 7.519843860894251e-05, + "loss": 
0.008014217019081116, + "step": 174800 + }, + { + "epoch": 24.81334279630944, + "grad_norm": 0.09643175452947617, + "learning_rate": 7.519701916252662e-05, + "loss": 0.00177404023706913, + "step": 174810 + }, + { + "epoch": 24.814762242725337, + "grad_norm": 0.27548062801361084, + "learning_rate": 7.519559971611072e-05, + "loss": 0.014251914620399476, + "step": 174820 + }, + { + "epoch": 24.816181689141235, + "grad_norm": 0.24879677593708038, + "learning_rate": 7.519418026969483e-05, + "loss": 0.008309140801429749, + "step": 174830 + }, + { + "epoch": 24.817601135557133, + "grad_norm": 0.47994929552078247, + "learning_rate": 7.519276082327892e-05, + "loss": 0.03306210339069367, + "step": 174840 + }, + { + "epoch": 24.81902058197303, + "grad_norm": 1.9139819145202637, + "learning_rate": 7.519134137686302e-05, + "loss": 0.033734101057052615, + "step": 174850 + }, + { + "epoch": 24.82044002838893, + "grad_norm": 0.045094821602106094, + "learning_rate": 7.518992193044713e-05, + "loss": 0.01556089222431183, + "step": 174860 + }, + { + "epoch": 24.821859474804825, + "grad_norm": 0.013437124900519848, + "learning_rate": 7.518850248403123e-05, + "loss": 0.059854811429977416, + "step": 174870 + }, + { + "epoch": 24.823278921220723, + "grad_norm": 0.10781147330999374, + "learning_rate": 7.518708303761534e-05, + "loss": 0.0018862254917621612, + "step": 174880 + }, + { + "epoch": 24.82469836763662, + "grad_norm": 1.0356255769729614, + "learning_rate": 7.518566359119944e-05, + "loss": 0.056297552585601804, + "step": 174890 + }, + { + "epoch": 24.82611781405252, + "grad_norm": 1.0135916471481323, + "learning_rate": 7.518424414478354e-05, + "loss": 0.005737875029444695, + "step": 174900 + }, + { + "epoch": 24.827537260468418, + "grad_norm": 0.05034732446074486, + "learning_rate": 7.518282469836763e-05, + "loss": 0.00260453037917614, + "step": 174910 + }, + { + "epoch": 24.828956706884316, + "grad_norm": 0.03612521290779114, + "learning_rate": 7.518140525195174e-05, + "loss": 
0.0020385969430208204, + "step": 174920 + }, + { + "epoch": 24.830376153300215, + "grad_norm": 0.7404531836509705, + "learning_rate": 7.517998580553584e-05, + "loss": 0.044337224960327146, + "step": 174930 + }, + { + "epoch": 24.83179559971611, + "grad_norm": 0.03960367664694786, + "learning_rate": 7.517856635911995e-05, + "loss": 0.0030037347227334977, + "step": 174940 + }, + { + "epoch": 24.833215046132008, + "grad_norm": 13.623032569885254, + "learning_rate": 7.517714691270405e-05, + "loss": 0.03327717185020447, + "step": 174950 + }, + { + "epoch": 24.834634492547906, + "grad_norm": 0.24466368556022644, + "learning_rate": 7.517572746628815e-05, + "loss": 0.022659875452518463, + "step": 174960 + }, + { + "epoch": 24.836053938963804, + "grad_norm": 5.822574615478516, + "learning_rate": 7.517430801987226e-05, + "loss": 0.014040997624397278, + "step": 174970 + }, + { + "epoch": 24.837473385379703, + "grad_norm": 0.19128546118736267, + "learning_rate": 7.517288857345636e-05, + "loss": 0.013789917528629302, + "step": 174980 + }, + { + "epoch": 24.8388928317956, + "grad_norm": 0.08661620318889618, + "learning_rate": 7.517146912704047e-05, + "loss": 0.022546961903572083, + "step": 174990 + }, + { + "epoch": 24.8403122782115, + "grad_norm": 0.06488239020109177, + "learning_rate": 7.517004968062455e-05, + "loss": 0.021421028673648833, + "step": 175000 + }, + { + "epoch": 24.8403122782115, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.059894103556871414, + "eval_runtime": 34.0237, + "eval_samples_per_second": 462.237, + "eval_steps_per_second": 14.461, + "step": 175000 + }, + { + "epoch": 24.841731724627394, + "grad_norm": 0.009624416008591652, + "learning_rate": 7.516863023420866e-05, + "loss": 0.008913154900074004, + "step": 175010 + }, + { + "epoch": 24.843151171043292, + "grad_norm": 0.024270085617899895, + "learning_rate": 7.516721078779276e-05, + "loss": 0.00412944033741951, + "step": 175020 + }, + { + "epoch": 24.84457061745919, + "grad_norm": 
5.38871431350708, + "learning_rate": 7.516579134137687e-05, + "loss": 0.010882017016410828, + "step": 175030 + }, + { + "epoch": 24.84599006387509, + "grad_norm": 1.1620713472366333, + "learning_rate": 7.516437189496097e-05, + "loss": 0.0066196158528327945, + "step": 175040 + }, + { + "epoch": 24.847409510290987, + "grad_norm": 1.4279991388320923, + "learning_rate": 7.516295244854508e-05, + "loss": 0.03985439836978912, + "step": 175050 + }, + { + "epoch": 24.848828956706885, + "grad_norm": 0.03753452003002167, + "learning_rate": 7.516153300212918e-05, + "loss": 0.011036480963230132, + "step": 175060 + }, + { + "epoch": 24.850248403122784, + "grad_norm": 0.03312923386693001, + "learning_rate": 7.516011355571327e-05, + "loss": 0.004231118783354759, + "step": 175070 + }, + { + "epoch": 24.85166784953868, + "grad_norm": 0.6216834783554077, + "learning_rate": 7.515869410929738e-05, + "loss": 0.003838919848203659, + "step": 175080 + }, + { + "epoch": 24.853087295954577, + "grad_norm": 0.0060589490458369255, + "learning_rate": 7.515727466288148e-05, + "loss": 0.012651222944259643, + "step": 175090 + }, + { + "epoch": 24.854506742370475, + "grad_norm": 16.96720314025879, + "learning_rate": 7.515585521646559e-05, + "loss": 0.026090008020401, + "step": 175100 + }, + { + "epoch": 24.855926188786373, + "grad_norm": 0.1251462697982788, + "learning_rate": 7.515443577004968e-05, + "loss": 0.003367970883846283, + "step": 175110 + }, + { + "epoch": 24.85734563520227, + "grad_norm": 0.10500666499137878, + "learning_rate": 7.515301632363379e-05, + "loss": 0.008258894085884094, + "step": 175120 + }, + { + "epoch": 24.85876508161817, + "grad_norm": 0.05207778140902519, + "learning_rate": 7.515159687721788e-05, + "loss": 0.002167348563671112, + "step": 175130 + }, + { + "epoch": 24.86018452803407, + "grad_norm": 3.116264581680298, + "learning_rate": 7.5150177430802e-05, + "loss": 0.018656530976295473, + "step": 175140 + }, + { + "epoch": 24.861603974449963, + "grad_norm": 
0.014056084677577019, + "learning_rate": 7.514875798438609e-05, + "loss": 0.002227706089615822, + "step": 175150 + }, + { + "epoch": 24.86302342086586, + "grad_norm": 0.10195112228393555, + "learning_rate": 7.514733853797019e-05, + "loss": 0.02582433819770813, + "step": 175160 + }, + { + "epoch": 24.86444286728176, + "grad_norm": 7.722496032714844, + "learning_rate": 7.51459190915543e-05, + "loss": 0.008167944848537445, + "step": 175170 + }, + { + "epoch": 24.865862313697658, + "grad_norm": 0.3382679224014282, + "learning_rate": 7.51444996451384e-05, + "loss": 0.017223405838012695, + "step": 175180 + }, + { + "epoch": 24.867281760113556, + "grad_norm": 0.04141584411263466, + "learning_rate": 7.514308019872251e-05, + "loss": 0.005016700178384781, + "step": 175190 + }, + { + "epoch": 24.868701206529455, + "grad_norm": 0.20137231051921844, + "learning_rate": 7.51416607523066e-05, + "loss": 0.004444120824337006, + "step": 175200 + }, + { + "epoch": 24.870120652945353, + "grad_norm": 9.847540855407715, + "learning_rate": 7.51402413058907e-05, + "loss": 0.049246639013290405, + "step": 175210 + }, + { + "epoch": 24.871540099361248, + "grad_norm": 0.008080052211880684, + "learning_rate": 7.51388218594748e-05, + "loss": 0.020504456758499146, + "step": 175220 + }, + { + "epoch": 24.872959545777146, + "grad_norm": 1.105514407157898, + "learning_rate": 7.513740241305891e-05, + "loss": 0.01990496665239334, + "step": 175230 + }, + { + "epoch": 24.874378992193044, + "grad_norm": 0.07193966209888458, + "learning_rate": 7.513598296664301e-05, + "loss": 0.009424051642417908, + "step": 175240 + }, + { + "epoch": 24.875798438608943, + "grad_norm": 0.03143780305981636, + "learning_rate": 7.513456352022712e-05, + "loss": 0.012867766618728637, + "step": 175250 + }, + { + "epoch": 24.87721788502484, + "grad_norm": 0.05059746652841568, + "learning_rate": 7.513314407381122e-05, + "loss": 0.00686897560954094, + "step": 175260 + }, + { + "epoch": 24.87863733144074, + "grad_norm": 
0.418129563331604, + "learning_rate": 7.513172462739532e-05, + "loss": 0.007020705938339233, + "step": 175270 + }, + { + "epoch": 24.880056777856637, + "grad_norm": 0.27555546164512634, + "learning_rate": 7.513030518097943e-05, + "loss": 0.01744609773159027, + "step": 175280 + }, + { + "epoch": 24.881476224272532, + "grad_norm": 4.550117492675781, + "learning_rate": 7.512888573456352e-05, + "loss": 0.0501729428768158, + "step": 175290 + }, + { + "epoch": 24.88289567068843, + "grad_norm": 1.2427401542663574, + "learning_rate": 7.512746628814763e-05, + "loss": 0.011917130649089813, + "step": 175300 + }, + { + "epoch": 24.88431511710433, + "grad_norm": 0.10436800122261047, + "learning_rate": 7.512604684173172e-05, + "loss": 0.01766515076160431, + "step": 175310 + }, + { + "epoch": 24.885734563520227, + "grad_norm": 0.019094472751021385, + "learning_rate": 7.512462739531583e-05, + "loss": 0.005873279646039009, + "step": 175320 + }, + { + "epoch": 24.887154009936125, + "grad_norm": 0.05248106271028519, + "learning_rate": 7.512320794889993e-05, + "loss": 0.005902415886521339, + "step": 175330 + }, + { + "epoch": 24.888573456352024, + "grad_norm": 0.1522170454263687, + "learning_rate": 7.512178850248404e-05, + "loss": 0.0057086929678916935, + "step": 175340 + }, + { + "epoch": 24.889992902767922, + "grad_norm": 0.03702174872159958, + "learning_rate": 7.512036905606813e-05, + "loss": 0.008611040562391281, + "step": 175350 + }, + { + "epoch": 24.891412349183817, + "grad_norm": 1.8310575485229492, + "learning_rate": 7.511894960965223e-05, + "loss": 0.009664189815521241, + "step": 175360 + }, + { + "epoch": 24.892831795599715, + "grad_norm": 0.15983669459819794, + "learning_rate": 7.511753016323634e-05, + "loss": 0.030514401197433472, + "step": 175370 + }, + { + "epoch": 24.894251242015613, + "grad_norm": 3.2224626541137695, + "learning_rate": 7.511611071682044e-05, + "loss": 0.012683349847793578, + "step": 175380 + }, + { + "epoch": 24.89567068843151, + "grad_norm": 
0.2894139289855957, + "learning_rate": 7.511469127040455e-05, + "loss": 0.0012529436498880386, + "step": 175390 + }, + { + "epoch": 24.89709013484741, + "grad_norm": 4.431601047515869, + "learning_rate": 7.511327182398865e-05, + "loss": 0.003205907344818115, + "step": 175400 + }, + { + "epoch": 24.89850958126331, + "grad_norm": 0.009868744760751724, + "learning_rate": 7.511185237757276e-05, + "loss": 0.018482215702533722, + "step": 175410 + }, + { + "epoch": 24.899929027679207, + "grad_norm": 0.013247550465166569, + "learning_rate": 7.511043293115684e-05, + "loss": 0.02899858057498932, + "step": 175420 + }, + { + "epoch": 24.9013484740951, + "grad_norm": 0.07279037684202194, + "learning_rate": 7.510901348474095e-05, + "loss": 0.013199052214622498, + "step": 175430 + }, + { + "epoch": 24.902767920511, + "grad_norm": 0.012355794198811054, + "learning_rate": 7.510759403832505e-05, + "loss": 0.01848658174276352, + "step": 175440 + }, + { + "epoch": 24.904187366926898, + "grad_norm": 0.08375567197799683, + "learning_rate": 7.510617459190916e-05, + "loss": 0.030598175525665284, + "step": 175450 + }, + { + "epoch": 24.905606813342796, + "grad_norm": 1.4662492275238037, + "learning_rate": 7.510475514549326e-05, + "loss": 0.0074163593351840975, + "step": 175460 + }, + { + "epoch": 24.907026259758695, + "grad_norm": 0.4756302833557129, + "learning_rate": 7.510333569907736e-05, + "loss": 0.03932933807373047, + "step": 175470 + }, + { + "epoch": 24.908445706174593, + "grad_norm": 0.007350297644734383, + "learning_rate": 7.510191625266147e-05, + "loss": 0.022697003185749055, + "step": 175480 + }, + { + "epoch": 24.90986515259049, + "grad_norm": 0.18280135095119476, + "learning_rate": 7.510049680624557e-05, + "loss": 0.022344477474689484, + "step": 175490 + }, + { + "epoch": 24.911284599006386, + "grad_norm": 0.6633405089378357, + "learning_rate": 7.509907735982968e-05, + "loss": 0.006186798959970474, + "step": 175500 + }, + { + "epoch": 24.911284599006386, + "eval_accuracy": 
0.986837922044891, + "eval_loss": 0.049445293843746185, + "eval_runtime": 33.8851, + "eval_samples_per_second": 464.127, + "eval_steps_per_second": 14.52, + "step": 175500 + }, + { + "epoch": 24.912704045422284, + "grad_norm": 0.033913515508174896, + "learning_rate": 7.509765791341377e-05, + "loss": 0.005810964852571487, + "step": 175510 + }, + { + "epoch": 24.914123491838183, + "grad_norm": 0.5527358055114746, + "learning_rate": 7.509623846699787e-05, + "loss": 0.011636155098676682, + "step": 175520 + }, + { + "epoch": 24.91554293825408, + "grad_norm": 0.024469785392284393, + "learning_rate": 7.509481902058197e-05, + "loss": 0.002954862639307976, + "step": 175530 + }, + { + "epoch": 24.91696238466998, + "grad_norm": 0.9214481711387634, + "learning_rate": 7.509339957416608e-05, + "loss": 0.0024703428149223327, + "step": 175540 + }, + { + "epoch": 24.918381831085878, + "grad_norm": 0.002124282531440258, + "learning_rate": 7.509198012775018e-05, + "loss": 0.010342609882354737, + "step": 175550 + }, + { + "epoch": 24.919801277501776, + "grad_norm": 0.00863539706915617, + "learning_rate": 7.509056068133429e-05, + "loss": 0.006307965517044068, + "step": 175560 + }, + { + "epoch": 24.92122072391767, + "grad_norm": 13.086250305175781, + "learning_rate": 7.508914123491839e-05, + "loss": 0.02772601842880249, + "step": 175570 + }, + { + "epoch": 24.92264017033357, + "grad_norm": 2.0935614109039307, + "learning_rate": 7.508772178850248e-05, + "loss": 0.01197703331708908, + "step": 175580 + }, + { + "epoch": 24.924059616749467, + "grad_norm": 0.4687116742134094, + "learning_rate": 7.50863023420866e-05, + "loss": 0.012996704876422882, + "step": 175590 + }, + { + "epoch": 24.925479063165366, + "grad_norm": 6.849146366119385, + "learning_rate": 7.508488289567069e-05, + "loss": 0.011623166501522064, + "step": 175600 + }, + { + "epoch": 24.926898509581264, + "grad_norm": 18.899351119995117, + "learning_rate": 7.50834634492548e-05, + "loss": 0.029349669814109802, + "step": 175610 + 
}, + { + "epoch": 24.928317955997162, + "grad_norm": 0.034784719347953796, + "learning_rate": 7.508204400283889e-05, + "loss": 0.008214130997657776, + "step": 175620 + }, + { + "epoch": 24.92973740241306, + "grad_norm": 0.012977590784430504, + "learning_rate": 7.5080624556423e-05, + "loss": 0.02578195631504059, + "step": 175630 + }, + { + "epoch": 24.931156848828955, + "grad_norm": 8.395556449890137, + "learning_rate": 7.50792051100071e-05, + "loss": 0.00772925466299057, + "step": 175640 + }, + { + "epoch": 24.932576295244854, + "grad_norm": 8.696738243103027, + "learning_rate": 7.50777856635912e-05, + "loss": 0.02265123128890991, + "step": 175650 + }, + { + "epoch": 24.933995741660752, + "grad_norm": 0.3534843325614929, + "learning_rate": 7.50763662171753e-05, + "loss": 0.028701850771903993, + "step": 175660 + }, + { + "epoch": 24.93541518807665, + "grad_norm": 0.46534907817840576, + "learning_rate": 7.50749467707594e-05, + "loss": 0.01640675812959671, + "step": 175670 + }, + { + "epoch": 24.93683463449255, + "grad_norm": 0.08312589675188065, + "learning_rate": 7.507352732434351e-05, + "loss": 0.01537715494632721, + "step": 175680 + }, + { + "epoch": 24.938254080908447, + "grad_norm": 1.4698450565338135, + "learning_rate": 7.507210787792761e-05, + "loss": 0.03692317008972168, + "step": 175690 + }, + { + "epoch": 24.939673527324345, + "grad_norm": 0.2958504557609558, + "learning_rate": 7.507068843151172e-05, + "loss": 0.0011108819395303725, + "step": 175700 + }, + { + "epoch": 24.94109297374024, + "grad_norm": 1.8403807878494263, + "learning_rate": 7.506926898509582e-05, + "loss": 0.012101513147354127, + "step": 175710 + }, + { + "epoch": 24.942512420156138, + "grad_norm": 0.21057625114917755, + "learning_rate": 7.506784953867991e-05, + "loss": 0.012412656843662263, + "step": 175720 + }, + { + "epoch": 24.943931866572036, + "grad_norm": 0.08128996938467026, + "learning_rate": 7.506643009226401e-05, + "loss": 0.0032723117619752884, + "step": 175730 + }, + { + 
"epoch": 24.945351312987935, + "grad_norm": 2.3081390857696533, + "learning_rate": 7.506501064584812e-05, + "loss": 0.03393067121505737, + "step": 175740 + }, + { + "epoch": 24.946770759403833, + "grad_norm": 0.008313100785017014, + "learning_rate": 7.506359119943222e-05, + "loss": 0.08946827054023743, + "step": 175750 + }, + { + "epoch": 24.94819020581973, + "grad_norm": 0.3292960524559021, + "learning_rate": 7.506217175301633e-05, + "loss": 0.024500785768032073, + "step": 175760 + }, + { + "epoch": 24.94960965223563, + "grad_norm": 22.93217658996582, + "learning_rate": 7.506075230660044e-05, + "loss": 0.0669428527355194, + "step": 175770 + }, + { + "epoch": 24.951029098651524, + "grad_norm": 3.4237565994262695, + "learning_rate": 7.505933286018453e-05, + "loss": 0.03038751184940338, + "step": 175780 + }, + { + "epoch": 24.952448545067423, + "grad_norm": 1.499855875968933, + "learning_rate": 7.505791341376864e-05, + "loss": 0.018992039561271667, + "step": 175790 + }, + { + "epoch": 24.95386799148332, + "grad_norm": 16.999082565307617, + "learning_rate": 7.505649396735273e-05, + "loss": 0.017377470433712006, + "step": 175800 + }, + { + "epoch": 24.95528743789922, + "grad_norm": 0.1883808970451355, + "learning_rate": 7.505507452093684e-05, + "loss": 0.0027715291827917097, + "step": 175810 + }, + { + "epoch": 24.956706884315118, + "grad_norm": 12.042606353759766, + "learning_rate": 7.505365507452094e-05, + "loss": 0.03289647400379181, + "step": 175820 + }, + { + "epoch": 24.958126330731016, + "grad_norm": 0.014491724781692028, + "learning_rate": 7.505223562810504e-05, + "loss": 0.019682183861732483, + "step": 175830 + }, + { + "epoch": 24.959545777146914, + "grad_norm": 0.32384780049324036, + "learning_rate": 7.505081618168914e-05, + "loss": 0.007131822407245636, + "step": 175840 + }, + { + "epoch": 24.96096522356281, + "grad_norm": 1.8035625219345093, + "learning_rate": 7.504939673527325e-05, + "loss": 0.011278056353330613, + "step": 175850 + }, + { + "epoch": 
24.962384669978707, + "grad_norm": 0.15551474690437317, + "learning_rate": 7.504797728885736e-05, + "loss": 0.010449586808681488, + "step": 175860 + }, + { + "epoch": 24.963804116394606, + "grad_norm": 12.248896598815918, + "learning_rate": 7.504655784244146e-05, + "loss": 0.03312714695930481, + "step": 175870 + }, + { + "epoch": 24.965223562810504, + "grad_norm": 0.03220541402697563, + "learning_rate": 7.504513839602555e-05, + "loss": 0.026300197839736937, + "step": 175880 + }, + { + "epoch": 24.966643009226402, + "grad_norm": 0.07988949865102768, + "learning_rate": 7.504371894960965e-05, + "loss": 0.045073002576828, + "step": 175890 + }, + { + "epoch": 24.9680624556423, + "grad_norm": 2.022703170776367, + "learning_rate": 7.504229950319376e-05, + "loss": 0.03606774508953094, + "step": 175900 + }, + { + "epoch": 24.9694819020582, + "grad_norm": 4.755268096923828, + "learning_rate": 7.504088005677786e-05, + "loss": 0.016792690753936766, + "step": 175910 + }, + { + "epoch": 24.970901348474094, + "grad_norm": 0.3038274049758911, + "learning_rate": 7.503946061036197e-05, + "loss": 0.02018530070781708, + "step": 175920 + }, + { + "epoch": 24.972320794889992, + "grad_norm": 1.830064058303833, + "learning_rate": 7.503804116394605e-05, + "loss": 0.03984014987945557, + "step": 175930 + }, + { + "epoch": 24.97374024130589, + "grad_norm": 0.015554901212453842, + "learning_rate": 7.503662171753016e-05, + "loss": 0.005465466901659966, + "step": 175940 + }, + { + "epoch": 24.97515968772179, + "grad_norm": 0.035277217626571655, + "learning_rate": 7.503520227111428e-05, + "loss": 0.010185975581407547, + "step": 175950 + }, + { + "epoch": 24.976579134137687, + "grad_norm": 3.6613926887512207, + "learning_rate": 7.503378282469837e-05, + "loss": 0.029780980944633485, + "step": 175960 + }, + { + "epoch": 24.977998580553585, + "grad_norm": 0.23636162281036377, + "learning_rate": 7.503236337828248e-05, + "loss": 0.000822971761226654, + "step": 175970 + }, + { + "epoch": 
24.979418026969483, + "grad_norm": 0.010194499976933002, + "learning_rate": 7.503094393186657e-05, + "loss": 0.0032549675554037092, + "step": 175980 + }, + { + "epoch": 24.980837473385378, + "grad_norm": 16.37147331237793, + "learning_rate": 7.502952448545068e-05, + "loss": 0.03289965987205505, + "step": 175990 + }, + { + "epoch": 24.982256919801276, + "grad_norm": 0.003607755294069648, + "learning_rate": 7.502810503903478e-05, + "loss": 0.002564488723874092, + "step": 176000 + }, + { + "epoch": 24.982256919801276, + "eval_accuracy": 0.9905258472690278, + "eval_loss": 0.03739862143993378, + "eval_runtime": 34.2559, + "eval_samples_per_second": 459.104, + "eval_steps_per_second": 14.362, + "step": 176000 + }, + { + "epoch": 24.983676366217175, + "grad_norm": 0.018405435606837273, + "learning_rate": 7.502668559261889e-05, + "loss": 0.00741477981209755, + "step": 176010 + }, + { + "epoch": 24.985095812633073, + "grad_norm": 0.13060133159160614, + "learning_rate": 7.502540809084457e-05, + "loss": 0.03214366137981415, + "step": 176020 + }, + { + "epoch": 24.98651525904897, + "grad_norm": 0.009267968125641346, + "learning_rate": 7.502398864442868e-05, + "loss": 0.02713037431240082, + "step": 176030 + }, + { + "epoch": 24.98793470546487, + "grad_norm": 0.16197466850280762, + "learning_rate": 7.502256919801278e-05, + "loss": 0.024450284242630006, + "step": 176040 + }, + { + "epoch": 24.989354151880768, + "grad_norm": 0.04397560656070709, + "learning_rate": 7.502114975159688e-05, + "loss": 0.03620302677154541, + "step": 176050 + }, + { + "epoch": 24.990773598296663, + "grad_norm": 11.898681640625, + "learning_rate": 7.501973030518097e-05, + "loss": 0.01917205601930618, + "step": 176060 + }, + { + "epoch": 24.99219304471256, + "grad_norm": 1.2187647819519043, + "learning_rate": 7.501831085876509e-05, + "loss": 0.013482607901096344, + "step": 176070 + }, + { + "epoch": 24.99361249112846, + "grad_norm": 0.06481413543224335, + "learning_rate": 7.501689141234918e-05, + "loss": 
0.0022290892899036407, + "step": 176080 + }, + { + "epoch": 24.995031937544358, + "grad_norm": 0.04644186422228813, + "learning_rate": 7.50154719659333e-05, + "loss": 0.002214464545249939, + "step": 176090 + }, + { + "epoch": 24.996451383960256, + "grad_norm": 0.05620459094643593, + "learning_rate": 7.501405251951739e-05, + "loss": 0.030244290828704834, + "step": 176100 + }, + { + "epoch": 24.997870830376154, + "grad_norm": 0.18697574734687805, + "learning_rate": 7.501263307310149e-05, + "loss": 0.01267269104719162, + "step": 176110 + }, + { + "epoch": 24.999290276792053, + "grad_norm": 1.2455826997756958, + "learning_rate": 7.50112136266856e-05, + "loss": 0.010584303736686706, + "step": 176120 + }, + { + "epoch": 25.000709723207947, + "grad_norm": 0.7709699869155884, + "learning_rate": 7.50097941802697e-05, + "loss": 0.04960097670555115, + "step": 176130 + }, + { + "epoch": 25.002129169623846, + "grad_norm": 0.03904024511575699, + "learning_rate": 7.500837473385381e-05, + "loss": 0.03774903416633606, + "step": 176140 + }, + { + "epoch": 25.003548616039744, + "grad_norm": 0.1304798424243927, + "learning_rate": 7.50069552874379e-05, + "loss": 0.005126225203275681, + "step": 176150 + }, + { + "epoch": 25.004968062455642, + "grad_norm": 0.017753375694155693, + "learning_rate": 7.5005535841022e-05, + "loss": 0.029411721229553222, + "step": 176160 + }, + { + "epoch": 25.00638750887154, + "grad_norm": 7.24683952331543, + "learning_rate": 7.50041163946061e-05, + "loss": 0.0076638728380203245, + "step": 176170 + }, + { + "epoch": 25.00780695528744, + "grad_norm": 0.023115063086152077, + "learning_rate": 7.500269694819021e-05, + "loss": 0.008908241242170333, + "step": 176180 + }, + { + "epoch": 25.009226401703337, + "grad_norm": 0.3951238989830017, + "learning_rate": 7.500127750177431e-05, + "loss": 0.049042055010795595, + "step": 176190 + }, + { + "epoch": 25.010645848119232, + "grad_norm": 0.021467646583914757, + "learning_rate": 7.499985805535842e-05, + "loss": 
0.08232132792472839, + "step": 176200 + }, + { + "epoch": 25.01206529453513, + "grad_norm": 13.238282203674316, + "learning_rate": 7.499843860894252e-05, + "loss": 0.005195924639701843, + "step": 176210 + }, + { + "epoch": 25.01348474095103, + "grad_norm": 8.570143699645996, + "learning_rate": 7.499701916252661e-05, + "loss": 0.011431652307510375, + "step": 176220 + }, + { + "epoch": 25.014904187366927, + "grad_norm": 7.184260845184326, + "learning_rate": 7.499559971611073e-05, + "loss": 0.0234529510140419, + "step": 176230 + }, + { + "epoch": 25.016323633782825, + "grad_norm": 0.07747618854045868, + "learning_rate": 7.499418026969482e-05, + "loss": 0.007757975161075592, + "step": 176240 + }, + { + "epoch": 25.017743080198724, + "grad_norm": 0.004590606316924095, + "learning_rate": 7.499276082327893e-05, + "loss": 0.026771089434623717, + "step": 176250 + }, + { + "epoch": 25.019162526614622, + "grad_norm": 0.16705340147018433, + "learning_rate": 7.499134137686302e-05, + "loss": 0.0268364816904068, + "step": 176260 + }, + { + "epoch": 25.020581973030517, + "grad_norm": 0.04464271664619446, + "learning_rate": 7.498992193044713e-05, + "loss": 0.014764367043972016, + "step": 176270 + }, + { + "epoch": 25.022001419446415, + "grad_norm": 0.20153948664665222, + "learning_rate": 7.498850248403123e-05, + "loss": 0.02189038395881653, + "step": 176280 + }, + { + "epoch": 25.023420865862313, + "grad_norm": 1.1461154222488403, + "learning_rate": 7.498708303761534e-05, + "loss": 0.004062484577298164, + "step": 176290 + }, + { + "epoch": 25.02484031227821, + "grad_norm": 4.946191310882568, + "learning_rate": 7.498566359119943e-05, + "loss": 0.011904344707727433, + "step": 176300 + }, + { + "epoch": 25.02625975869411, + "grad_norm": 0.17491453886032104, + "learning_rate": 7.498424414478353e-05, + "loss": 0.0061743341386318205, + "step": 176310 + }, + { + "epoch": 25.027679205110008, + "grad_norm": 0.00420490512624383, + "learning_rate": 7.498282469836764e-05, + "loss": 
0.028180000185966492, + "step": 176320 + }, + { + "epoch": 25.029098651525906, + "grad_norm": 16.56654167175293, + "learning_rate": 7.498140525195174e-05, + "loss": 0.0345615029335022, + "step": 176330 + }, + { + "epoch": 25.0305180979418, + "grad_norm": 0.09155841916799545, + "learning_rate": 7.497998580553585e-05, + "loss": 0.024400685727596284, + "step": 176340 + }, + { + "epoch": 25.0319375443577, + "grad_norm": 6.47618293762207, + "learning_rate": 7.497856635911995e-05, + "loss": 0.01806054711341858, + "step": 176350 + }, + { + "epoch": 25.033356990773598, + "grad_norm": 0.2358197122812271, + "learning_rate": 7.497714691270405e-05, + "loss": 0.006319011747837067, + "step": 176360 + }, + { + "epoch": 25.034776437189496, + "grad_norm": 0.09109296649694443, + "learning_rate": 7.497572746628814e-05, + "loss": 0.0050426885485649105, + "step": 176370 + }, + { + "epoch": 25.036195883605394, + "grad_norm": 1.0644699335098267, + "learning_rate": 7.497430801987225e-05, + "loss": 0.009116597473621368, + "step": 176380 + }, + { + "epoch": 25.037615330021293, + "grad_norm": 0.02851303108036518, + "learning_rate": 7.497288857345635e-05, + "loss": 0.017719896137714387, + "step": 176390 + }, + { + "epoch": 25.03903477643719, + "grad_norm": 11.078912734985352, + "learning_rate": 7.497146912704046e-05, + "loss": 0.0142837256193161, + "step": 176400 + }, + { + "epoch": 25.040454222853086, + "grad_norm": 2.8149285316467285, + "learning_rate": 7.497004968062456e-05, + "loss": 0.014115889370441437, + "step": 176410 + }, + { + "epoch": 25.041873669268984, + "grad_norm": 6.465295314788818, + "learning_rate": 7.496863023420866e-05, + "loss": 0.008584058284759522, + "step": 176420 + }, + { + "epoch": 25.043293115684882, + "grad_norm": 1.1520136594772339, + "learning_rate": 7.496721078779277e-05, + "loss": 0.002545027807354927, + "step": 176430 + }, + { + "epoch": 25.04471256210078, + "grad_norm": 0.10030464082956314, + "learning_rate": 7.496579134137686e-05, + "loss": 
0.00711313933134079, + "step": 176440 + }, + { + "epoch": 25.04613200851668, + "grad_norm": 0.4313453733921051, + "learning_rate": 7.496437189496098e-05, + "loss": 0.01120852530002594, + "step": 176450 + }, + { + "epoch": 25.047551454932577, + "grad_norm": 0.08552046120166779, + "learning_rate": 7.496295244854507e-05, + "loss": 0.02512127161026001, + "step": 176460 + }, + { + "epoch": 25.048970901348476, + "grad_norm": 0.8522791266441345, + "learning_rate": 7.496153300212917e-05, + "loss": 0.004693029448390007, + "step": 176470 + }, + { + "epoch": 25.05039034776437, + "grad_norm": 0.016311323270201683, + "learning_rate": 7.496011355571327e-05, + "loss": 0.0016009807586669922, + "step": 176480 + }, + { + "epoch": 25.05180979418027, + "grad_norm": 2.0797278881073, + "learning_rate": 7.495869410929738e-05, + "loss": 0.01012134701013565, + "step": 176490 + }, + { + "epoch": 25.053229240596167, + "grad_norm": 7.008743762969971, + "learning_rate": 7.495727466288148e-05, + "loss": 0.0631231963634491, + "step": 176500 + }, + { + "epoch": 25.053229240596167, + "eval_accuracy": 0.9863292427036306, + "eval_loss": 0.052684854716062546, + "eval_runtime": 33.4078, + "eval_samples_per_second": 470.758, + "eval_steps_per_second": 14.727, + "step": 176500 + }, + { + "epoch": 25.054648687012065, + "grad_norm": 0.28661060333251953, + "learning_rate": 7.495585521646559e-05, + "loss": 0.013906162977218629, + "step": 176510 + }, + { + "epoch": 25.056068133427964, + "grad_norm": 13.152715682983398, + "learning_rate": 7.495443577004968e-05, + "loss": 0.016797906160354613, + "step": 176520 + }, + { + "epoch": 25.057487579843862, + "grad_norm": 0.7410918474197388, + "learning_rate": 7.495301632363378e-05, + "loss": 0.022309647500514986, + "step": 176530 + }, + { + "epoch": 25.05890702625976, + "grad_norm": 0.372710257768631, + "learning_rate": 7.495159687721789e-05, + "loss": 0.004038023576140404, + "step": 176540 + }, + { + "epoch": 25.060326472675655, + "grad_norm": 0.09186269342899323, + 
"learning_rate": 7.495017743080199e-05, + "loss": 0.0063840910792350766, + "step": 176550 + }, + { + "epoch": 25.061745919091553, + "grad_norm": 0.020056987181305885, + "learning_rate": 7.49487579843861e-05, + "loss": 0.013389450311660767, + "step": 176560 + }, + { + "epoch": 25.06316536550745, + "grad_norm": 0.2885107100009918, + "learning_rate": 7.494733853797018e-05, + "loss": 0.006709974259138107, + "step": 176570 + }, + { + "epoch": 25.06458481192335, + "grad_norm": 0.06776135414838791, + "learning_rate": 7.49459190915543e-05, + "loss": 0.006024333462119102, + "step": 176580 + }, + { + "epoch": 25.066004258339248, + "grad_norm": 0.02462557516992092, + "learning_rate": 7.49444996451384e-05, + "loss": 0.011663196980953217, + "step": 176590 + }, + { + "epoch": 25.067423704755146, + "grad_norm": 6.3693132400512695, + "learning_rate": 7.49430801987225e-05, + "loss": 0.02744825482368469, + "step": 176600 + }, + { + "epoch": 25.068843151171045, + "grad_norm": 0.6953768134117126, + "learning_rate": 7.49416607523066e-05, + "loss": 0.037572643160820006, + "step": 176610 + }, + { + "epoch": 25.07026259758694, + "grad_norm": 0.8800525069236755, + "learning_rate": 7.49402413058907e-05, + "loss": 0.0031657319515943525, + "step": 176620 + }, + { + "epoch": 25.071682044002838, + "grad_norm": 8.216121673583984, + "learning_rate": 7.493882185947481e-05, + "loss": 0.006142593547701835, + "step": 176630 + }, + { + "epoch": 25.073101490418736, + "grad_norm": 0.020718766376376152, + "learning_rate": 7.493740241305891e-05, + "loss": 0.008105747401714325, + "step": 176640 + }, + { + "epoch": 25.074520936834634, + "grad_norm": 16.330923080444336, + "learning_rate": 7.493598296664302e-05, + "loss": 0.016757726669311523, + "step": 176650 + }, + { + "epoch": 25.075940383250533, + "grad_norm": 15.399680137634277, + "learning_rate": 7.493456352022712e-05, + "loss": 0.020750723779201508, + "step": 176660 + }, + { + "epoch": 25.07735982966643, + "grad_norm": 4.649724006652832, + 
"learning_rate": 7.493314407381121e-05, + "loss": 0.011888822913169861, + "step": 176670 + }, + { + "epoch": 25.07877927608233, + "grad_norm": 1.4708702564239502, + "learning_rate": 7.493172462739531e-05, + "loss": 0.050727838277816774, + "step": 176680 + }, + { + "epoch": 25.080198722498224, + "grad_norm": 0.7243286371231079, + "learning_rate": 7.493030518097942e-05, + "loss": 0.0016802966594696045, + "step": 176690 + }, + { + "epoch": 25.081618168914122, + "grad_norm": 0.05877183377742767, + "learning_rate": 7.492888573456352e-05, + "loss": 0.0010644357651472091, + "step": 176700 + }, + { + "epoch": 25.08303761533002, + "grad_norm": 0.04714786633849144, + "learning_rate": 7.492746628814763e-05, + "loss": 0.001830197125673294, + "step": 176710 + }, + { + "epoch": 25.08445706174592, + "grad_norm": 10.835545539855957, + "learning_rate": 7.492604684173173e-05, + "loss": 0.014449545741081237, + "step": 176720 + }, + { + "epoch": 25.085876508161817, + "grad_norm": 4.864100456237793, + "learning_rate": 7.492462739531582e-05, + "loss": 0.019573044776916505, + "step": 176730 + }, + { + "epoch": 25.087295954577716, + "grad_norm": 10.999276161193848, + "learning_rate": 7.492320794889994e-05, + "loss": 0.02057640254497528, + "step": 176740 + }, + { + "epoch": 25.088715400993614, + "grad_norm": 10.096863746643066, + "learning_rate": 7.492178850248403e-05, + "loss": 0.03906630873680115, + "step": 176750 + }, + { + "epoch": 25.09013484740951, + "grad_norm": 0.016585664823651314, + "learning_rate": 7.492036905606814e-05, + "loss": 0.06388051509857177, + "step": 176760 + }, + { + "epoch": 25.091554293825407, + "grad_norm": 1.1463334560394287, + "learning_rate": 7.491894960965224e-05, + "loss": 0.01610073894262314, + "step": 176770 + }, + { + "epoch": 25.092973740241305, + "grad_norm": 15.765900611877441, + "learning_rate": 7.491753016323634e-05, + "loss": 0.01837473511695862, + "step": 176780 + }, + { + "epoch": 25.094393186657204, + "grad_norm": 2.7591538429260254, + 
"learning_rate": 7.491611071682044e-05, + "loss": 0.004452492669224739, + "step": 176790 + }, + { + "epoch": 25.095812633073102, + "grad_norm": 5.0121169090271, + "learning_rate": 7.491469127040455e-05, + "loss": 0.029766082763671875, + "step": 176800 + }, + { + "epoch": 25.097232079489, + "grad_norm": 7.286386489868164, + "learning_rate": 7.491327182398866e-05, + "loss": 0.014282031357288361, + "step": 176810 + }, + { + "epoch": 25.0986515259049, + "grad_norm": 0.15636223554611206, + "learning_rate": 7.491185237757275e-05, + "loss": 0.009339762479066848, + "step": 176820 + }, + { + "epoch": 25.100070972320793, + "grad_norm": 0.013237441889941692, + "learning_rate": 7.491043293115685e-05, + "loss": 0.005744466185569763, + "step": 176830 + }, + { + "epoch": 25.10149041873669, + "grad_norm": 1.442914605140686, + "learning_rate": 7.490901348474095e-05, + "loss": 0.021216309070587157, + "step": 176840 + }, + { + "epoch": 25.10290986515259, + "grad_norm": 0.8551831245422363, + "learning_rate": 7.490759403832506e-05, + "loss": 0.041458770632743835, + "step": 176850 + }, + { + "epoch": 25.10432931156849, + "grad_norm": 0.015192602761089802, + "learning_rate": 7.490617459190916e-05, + "loss": 0.0006980057805776596, + "step": 176860 + }, + { + "epoch": 25.105748757984387, + "grad_norm": 0.3266620934009552, + "learning_rate": 7.490475514549327e-05, + "loss": 0.005885736271739006, + "step": 176870 + }, + { + "epoch": 25.107168204400285, + "grad_norm": 0.16445809602737427, + "learning_rate": 7.490333569907735e-05, + "loss": 0.014743351936340332, + "step": 176880 + }, + { + "epoch": 25.108587650816183, + "grad_norm": 5.196931838989258, + "learning_rate": 7.490191625266146e-05, + "loss": 0.008371663093566895, + "step": 176890 + }, + { + "epoch": 25.110007097232078, + "grad_norm": 0.013903990387916565, + "learning_rate": 7.490049680624557e-05, + "loss": 0.009004485607147217, + "step": 176900 + }, + { + "epoch": 25.111426543647976, + "grad_norm": 0.5382477641105652, + 
"learning_rate": 7.489907735982967e-05, + "loss": 0.016852802038192748, + "step": 176910 + }, + { + "epoch": 25.112845990063875, + "grad_norm": 0.16067661345005035, + "learning_rate": 7.489765791341378e-05, + "loss": 0.0014088556170463562, + "step": 176920 + }, + { + "epoch": 25.114265436479773, + "grad_norm": 5.141151428222656, + "learning_rate": 7.489623846699787e-05, + "loss": 0.0058997992426157, + "step": 176930 + }, + { + "epoch": 25.11568488289567, + "grad_norm": 0.07201357930898666, + "learning_rate": 7.489481902058198e-05, + "loss": 0.002650027722120285, + "step": 176940 + }, + { + "epoch": 25.11710432931157, + "grad_norm": 0.06861986219882965, + "learning_rate": 7.489339957416607e-05, + "loss": 0.00939708724617958, + "step": 176950 + }, + { + "epoch": 25.118523775727468, + "grad_norm": 0.08185673505067825, + "learning_rate": 7.489198012775019e-05, + "loss": 0.0004951357841491699, + "step": 176960 + }, + { + "epoch": 25.119943222143363, + "grad_norm": 0.17136581242084503, + "learning_rate": 7.489056068133428e-05, + "loss": 0.0052155755460262295, + "step": 176970 + }, + { + "epoch": 25.12136266855926, + "grad_norm": 0.05884414166212082, + "learning_rate": 7.488914123491838e-05, + "loss": 0.017165617644786836, + "step": 176980 + }, + { + "epoch": 25.12278211497516, + "grad_norm": 0.09806634485721588, + "learning_rate": 7.488772178850249e-05, + "loss": 0.014797671139240265, + "step": 176990 + }, + { + "epoch": 25.124201561391057, + "grad_norm": 0.6071063876152039, + "learning_rate": 7.488630234208659e-05, + "loss": 0.0016063731163740158, + "step": 177000 + }, + { + "epoch": 25.124201561391057, + "eval_accuracy": 0.9876009410567813, + "eval_loss": 0.0478605180978775, + "eval_runtime": 33.7119, + "eval_samples_per_second": 466.512, + "eval_steps_per_second": 14.594, + "step": 177000 + }, + { + "epoch": 25.125621007806956, + "grad_norm": 11.317508697509766, + "learning_rate": 7.48848828956707e-05, + "loss": 0.017256909608840944, + "step": 177010 + }, + { + 
"epoch": 25.127040454222854, + "grad_norm": 0.4567406475543976, + "learning_rate": 7.48834634492548e-05, + "loss": 0.034023070335388185, + "step": 177020 + }, + { + "epoch": 25.128459900638752, + "grad_norm": 0.14275133609771729, + "learning_rate": 7.48820440028389e-05, + "loss": 0.018980677425861358, + "step": 177030 + }, + { + "epoch": 25.129879347054647, + "grad_norm": 0.053550709038972855, + "learning_rate": 7.488062455642299e-05, + "loss": 0.007971149682998658, + "step": 177040 + }, + { + "epoch": 25.131298793470545, + "grad_norm": 0.030252715572714806, + "learning_rate": 7.48792051100071e-05, + "loss": 0.03175648748874664, + "step": 177050 + }, + { + "epoch": 25.132718239886444, + "grad_norm": 0.04399664327502251, + "learning_rate": 7.48777856635912e-05, + "loss": 0.021335771679878233, + "step": 177060 + }, + { + "epoch": 25.134137686302342, + "grad_norm": 0.7450165748596191, + "learning_rate": 7.487636621717531e-05, + "loss": 0.0533946692943573, + "step": 177070 + }, + { + "epoch": 25.13555713271824, + "grad_norm": 0.35782137513160706, + "learning_rate": 7.487494677075941e-05, + "loss": 0.01605946719646454, + "step": 177080 + }, + { + "epoch": 25.13697657913414, + "grad_norm": 0.021078404039144516, + "learning_rate": 7.48735273243435e-05, + "loss": 0.015987367928028108, + "step": 177090 + }, + { + "epoch": 25.138396025550037, + "grad_norm": 0.41523632407188416, + "learning_rate": 7.487210787792762e-05, + "loss": 0.006299933791160584, + "step": 177100 + }, + { + "epoch": 25.13981547196593, + "grad_norm": 1.736223816871643, + "learning_rate": 7.487068843151171e-05, + "loss": 0.004199061915278435, + "step": 177110 + }, + { + "epoch": 25.14123491838183, + "grad_norm": 5.941538333892822, + "learning_rate": 7.486926898509583e-05, + "loss": 0.015485307574272156, + "step": 177120 + }, + { + "epoch": 25.14265436479773, + "grad_norm": 0.0933258906006813, + "learning_rate": 7.486784953867992e-05, + "loss": 0.005381283164024353, + "step": 177130 + }, + { + "epoch": 
25.144073811213627, + "grad_norm": 0.006083237007260323, + "learning_rate": 7.486643009226402e-05, + "loss": 0.06206090450286865, + "step": 177140 + }, + { + "epoch": 25.145493257629525, + "grad_norm": 0.022192124277353287, + "learning_rate": 7.486501064584812e-05, + "loss": 0.006977579742670059, + "step": 177150 + }, + { + "epoch": 25.146912704045423, + "grad_norm": 3.280998468399048, + "learning_rate": 7.486359119943223e-05, + "loss": 0.022607001662254333, + "step": 177160 + }, + { + "epoch": 25.14833215046132, + "grad_norm": 5.697748184204102, + "learning_rate": 7.486217175301633e-05, + "loss": 0.008167669177055359, + "step": 177170 + }, + { + "epoch": 25.149751596877216, + "grad_norm": 0.4718027412891388, + "learning_rate": 7.486075230660044e-05, + "loss": 0.02843135893344879, + "step": 177180 + }, + { + "epoch": 25.151171043293115, + "grad_norm": 0.1417374312877655, + "learning_rate": 7.485933286018453e-05, + "loss": 0.004925765469670296, + "step": 177190 + }, + { + "epoch": 25.152590489709013, + "grad_norm": 1.1414718627929688, + "learning_rate": 7.485791341376863e-05, + "loss": 0.01350751668214798, + "step": 177200 + }, + { + "epoch": 25.15400993612491, + "grad_norm": 0.1868734359741211, + "learning_rate": 7.485649396735274e-05, + "loss": 0.011719868332147599, + "step": 177210 + }, + { + "epoch": 25.15542938254081, + "grad_norm": 0.2365775853395462, + "learning_rate": 7.485507452093684e-05, + "loss": 0.015200671553611756, + "step": 177220 + }, + { + "epoch": 25.156848828956708, + "grad_norm": 5.340273380279541, + "learning_rate": 7.485365507452095e-05, + "loss": 0.024863722920417785, + "step": 177230 + }, + { + "epoch": 25.158268275372606, + "grad_norm": 0.24241764843463898, + "learning_rate": 7.485223562810503e-05, + "loss": 0.023737782239913942, + "step": 177240 + }, + { + "epoch": 25.1596877217885, + "grad_norm": 0.6737861037254333, + "learning_rate": 7.485081618168915e-05, + "loss": 0.002550158277153969, + "step": 177250 + }, + { + "epoch": 
25.1611071682044, + "grad_norm": 0.00904833059757948, + "learning_rate": 7.484939673527324e-05, + "loss": 0.007226961106061936, + "step": 177260 + }, + { + "epoch": 25.162526614620297, + "grad_norm": 0.8146507143974304, + "learning_rate": 7.484797728885735e-05, + "loss": 0.008874116837978363, + "step": 177270 + }, + { + "epoch": 25.163946061036196, + "grad_norm": 0.09449493139982224, + "learning_rate": 7.484655784244145e-05, + "loss": 0.008186303824186326, + "step": 177280 + }, + { + "epoch": 25.165365507452094, + "grad_norm": 0.12033767253160477, + "learning_rate": 7.484513839602555e-05, + "loss": 0.020451271533966066, + "step": 177290 + }, + { + "epoch": 25.166784953867992, + "grad_norm": 1.9243777990341187, + "learning_rate": 7.484371894960966e-05, + "loss": 0.004837767779827118, + "step": 177300 + }, + { + "epoch": 25.16820440028389, + "grad_norm": 0.27286776900291443, + "learning_rate": 7.484229950319376e-05, + "loss": 0.02365976572036743, + "step": 177310 + }, + { + "epoch": 25.169623846699785, + "grad_norm": 13.3877592086792, + "learning_rate": 7.484088005677787e-05, + "loss": 0.03661222457885742, + "step": 177320 + }, + { + "epoch": 25.171043293115684, + "grad_norm": 6.232120513916016, + "learning_rate": 7.483946061036196e-05, + "loss": 0.0076761551201343535, + "step": 177330 + }, + { + "epoch": 25.172462739531582, + "grad_norm": 0.9671525955200195, + "learning_rate": 7.483804116394606e-05, + "loss": 0.0025267720222473143, + "step": 177340 + }, + { + "epoch": 25.17388218594748, + "grad_norm": 0.04282539337873459, + "learning_rate": 7.483662171753016e-05, + "loss": 0.032548004388809205, + "step": 177350 + }, + { + "epoch": 25.17530163236338, + "grad_norm": 4.027100086212158, + "learning_rate": 7.483520227111427e-05, + "loss": 0.03468705117702484, + "step": 177360 + }, + { + "epoch": 25.176721078779277, + "grad_norm": 0.21050456166267395, + "learning_rate": 7.483378282469837e-05, + "loss": 0.03242431282997131, + "step": 177370 + }, + { + "epoch": 
25.178140525195175, + "grad_norm": 0.09454849362373352, + "learning_rate": 7.483236337828248e-05, + "loss": 0.001025502011179924, + "step": 177380 + }, + { + "epoch": 25.17955997161107, + "grad_norm": 13.48999309539795, + "learning_rate": 7.483094393186658e-05, + "loss": 0.027829620242118835, + "step": 177390 + }, + { + "epoch": 25.18097941802697, + "grad_norm": 0.008085222914814949, + "learning_rate": 7.482952448545067e-05, + "loss": 0.007607623934745789, + "step": 177400 + }, + { + "epoch": 25.182398864442867, + "grad_norm": 0.014183585532009602, + "learning_rate": 7.482810503903478e-05, + "loss": 0.005951400846242905, + "step": 177410 + }, + { + "epoch": 25.183818310858765, + "grad_norm": 4.822554588317871, + "learning_rate": 7.482668559261888e-05, + "loss": 0.05174697637557983, + "step": 177420 + }, + { + "epoch": 25.185237757274663, + "grad_norm": 11.530142784118652, + "learning_rate": 7.482526614620299e-05, + "loss": 0.010979852080345154, + "step": 177430 + }, + { + "epoch": 25.18665720369056, + "grad_norm": 0.263751745223999, + "learning_rate": 7.482384669978708e-05, + "loss": 0.0015735868364572525, + "step": 177440 + }, + { + "epoch": 25.18807665010646, + "grad_norm": 0.0038164283614605665, + "learning_rate": 7.482242725337119e-05, + "loss": 0.006784328818321228, + "step": 177450 + }, + { + "epoch": 25.189496096522355, + "grad_norm": 0.8439112901687622, + "learning_rate": 7.482100780695529e-05, + "loss": 0.014946967363357544, + "step": 177460 + }, + { + "epoch": 25.190915542938253, + "grad_norm": 5.974944114685059, + "learning_rate": 7.48195883605394e-05, + "loss": 0.037424790859222415, + "step": 177470 + }, + { + "epoch": 25.19233498935415, + "grad_norm": 0.645409882068634, + "learning_rate": 7.48181689141235e-05, + "loss": 0.003190535679459572, + "step": 177480 + }, + { + "epoch": 25.19375443577005, + "grad_norm": 0.0510481595993042, + "learning_rate": 7.48167494677076e-05, + "loss": 0.01070658341050148, + "step": 177490 + }, + { + "epoch": 
25.195173882185948, + "grad_norm": 2.534604072570801, + "learning_rate": 7.48153300212917e-05, + "loss": 0.030412399768829347, + "step": 177500 + }, + { + "epoch": 25.195173882185948, + "eval_accuracy": 0.9874737712214663, + "eval_loss": 0.04976564645767212, + "eval_runtime": 33.4107, + "eval_samples_per_second": 470.717, + "eval_steps_per_second": 14.726, + "step": 177500 + }, + { + "epoch": 25.196593328601846, + "grad_norm": 0.015043523162603378, + "learning_rate": 7.48139105748758e-05, + "loss": 0.008754602074623108, + "step": 177510 + }, + { + "epoch": 25.198012775017745, + "grad_norm": 0.1021757647395134, + "learning_rate": 7.481249112845991e-05, + "loss": 0.011797083914279938, + "step": 177520 + }, + { + "epoch": 25.19943222143364, + "grad_norm": 0.022420065477490425, + "learning_rate": 7.481107168204401e-05, + "loss": 0.01326509416103363, + "step": 177530 + }, + { + "epoch": 25.200851667849538, + "grad_norm": 0.010952077805995941, + "learning_rate": 7.480965223562812e-05, + "loss": 0.05155777931213379, + "step": 177540 + }, + { + "epoch": 25.202271114265436, + "grad_norm": 5.917091369628906, + "learning_rate": 7.48082327892122e-05, + "loss": 0.004917236045002937, + "step": 177550 + }, + { + "epoch": 25.203690560681334, + "grad_norm": 0.052807893604040146, + "learning_rate": 7.480681334279631e-05, + "loss": 0.006468456983566284, + "step": 177560 + }, + { + "epoch": 25.205110007097232, + "grad_norm": 0.11365615576505661, + "learning_rate": 7.480539389638041e-05, + "loss": 0.022048671543598176, + "step": 177570 + }, + { + "epoch": 25.20652945351313, + "grad_norm": 0.01614413596689701, + "learning_rate": 7.480397444996452e-05, + "loss": 0.04329241514205932, + "step": 177580 + }, + { + "epoch": 25.20794889992903, + "grad_norm": 0.5580011010169983, + "learning_rate": 7.480255500354862e-05, + "loss": 0.014224207401275635, + "step": 177590 + }, + { + "epoch": 25.209368346344924, + "grad_norm": 5.600711345672607, + "learning_rate": 7.480113555713272e-05, + "loss": 
0.02170404642820358, + "step": 177600 + }, + { + "epoch": 25.210787792760822, + "grad_norm": 0.11746274679899216, + "learning_rate": 7.479971611071683e-05, + "loss": 0.04861941635608673, + "step": 177610 + }, + { + "epoch": 25.21220723917672, + "grad_norm": 13.55069637298584, + "learning_rate": 7.479829666430092e-05, + "loss": 0.02090483605861664, + "step": 177620 + }, + { + "epoch": 25.21362668559262, + "grad_norm": 1.3059744834899902, + "learning_rate": 7.479687721788504e-05, + "loss": 0.007630449533462524, + "step": 177630 + }, + { + "epoch": 25.215046132008517, + "grad_norm": 16.88045883178711, + "learning_rate": 7.479545777146913e-05, + "loss": 0.017962148785591124, + "step": 177640 + }, + { + "epoch": 25.216465578424415, + "grad_norm": 3.987849473953247, + "learning_rate": 7.479403832505323e-05, + "loss": 0.00394018292427063, + "step": 177650 + }, + { + "epoch": 25.217885024840314, + "grad_norm": 1.354544997215271, + "learning_rate": 7.479261887863733e-05, + "loss": 0.021294161677360535, + "step": 177660 + }, + { + "epoch": 25.21930447125621, + "grad_norm": 0.22708702087402344, + "learning_rate": 7.479119943222144e-05, + "loss": 0.023211371898651124, + "step": 177670 + }, + { + "epoch": 25.220723917672107, + "grad_norm": 0.03344748914241791, + "learning_rate": 7.478977998580554e-05, + "loss": 0.001497640833258629, + "step": 177680 + }, + { + "epoch": 25.222143364088005, + "grad_norm": 3.6119697093963623, + "learning_rate": 7.478836053938965e-05, + "loss": 0.01813344657421112, + "step": 177690 + }, + { + "epoch": 25.223562810503903, + "grad_norm": 0.03671973571181297, + "learning_rate": 7.478694109297374e-05, + "loss": 0.010255929827690125, + "step": 177700 + }, + { + "epoch": 25.2249822569198, + "grad_norm": 0.3248015344142914, + "learning_rate": 7.478552164655784e-05, + "loss": 0.01787571907043457, + "step": 177710 + }, + { + "epoch": 25.2264017033357, + "grad_norm": 0.029002796858549118, + "learning_rate": 7.478410220014195e-05, + "loss": 
0.028751945495605467, + "step": 177720 + }, + { + "epoch": 25.2278211497516, + "grad_norm": 0.1481567770242691, + "learning_rate": 7.478268275372605e-05, + "loss": 0.04389281272888183, + "step": 177730 + }, + { + "epoch": 25.229240596167493, + "grad_norm": 3.7602739334106445, + "learning_rate": 7.478126330731016e-05, + "loss": 0.004061662405729294, + "step": 177740 + }, + { + "epoch": 25.23066004258339, + "grad_norm": 0.021305641159415245, + "learning_rate": 7.477984386089424e-05, + "loss": 0.007902707159519195, + "step": 177750 + }, + { + "epoch": 25.23207948899929, + "grad_norm": 0.008513431064784527, + "learning_rate": 7.477842441447836e-05, + "loss": 0.04395363926887512, + "step": 177760 + }, + { + "epoch": 25.233498935415188, + "grad_norm": 1.5825765132904053, + "learning_rate": 7.477700496806245e-05, + "loss": 0.023178498446941375, + "step": 177770 + }, + { + "epoch": 25.234918381831086, + "grad_norm": 0.04857514426112175, + "learning_rate": 7.477558552164656e-05, + "loss": 0.050082278251647946, + "step": 177780 + }, + { + "epoch": 25.236337828246985, + "grad_norm": 0.23522338271141052, + "learning_rate": 7.477416607523066e-05, + "loss": 0.045811936259269714, + "step": 177790 + }, + { + "epoch": 25.237757274662883, + "grad_norm": 2.9428513050079346, + "learning_rate": 7.477274662881476e-05, + "loss": 0.03173220455646515, + "step": 177800 + }, + { + "epoch": 25.239176721078778, + "grad_norm": 0.7059081792831421, + "learning_rate": 7.477132718239887e-05, + "loss": 0.005904814973473549, + "step": 177810 + }, + { + "epoch": 25.240596167494676, + "grad_norm": 0.5686576962471008, + "learning_rate": 7.476990773598297e-05, + "loss": 0.027811533212661742, + "step": 177820 + }, + { + "epoch": 25.242015613910574, + "grad_norm": 0.3142721652984619, + "learning_rate": 7.476848828956708e-05, + "loss": 0.025027748942375184, + "step": 177830 + }, + { + "epoch": 25.243435060326473, + "grad_norm": 3.721357583999634, + "learning_rate": 7.476706884315118e-05, + "loss": 
0.01937507390975952, + "step": 177840 + }, + { + "epoch": 25.24485450674237, + "grad_norm": 0.013615927658975124, + "learning_rate": 7.476564939673529e-05, + "loss": 0.016618776321411132, + "step": 177850 + }, + { + "epoch": 25.24627395315827, + "grad_norm": 0.0028565656393766403, + "learning_rate": 7.476422995031937e-05, + "loss": 0.013839320838451385, + "step": 177860 + }, + { + "epoch": 25.247693399574167, + "grad_norm": 15.168730735778809, + "learning_rate": 7.476281050390348e-05, + "loss": 0.01770912706851959, + "step": 177870 + }, + { + "epoch": 25.249112845990062, + "grad_norm": 0.04687674343585968, + "learning_rate": 7.476139105748758e-05, + "loss": 0.008464504778385163, + "step": 177880 + }, + { + "epoch": 25.25053229240596, + "grad_norm": 0.07272609323263168, + "learning_rate": 7.475997161107169e-05, + "loss": 0.010662201046943664, + "step": 177890 + }, + { + "epoch": 25.25195173882186, + "grad_norm": 0.03803888335824013, + "learning_rate": 7.475855216465579e-05, + "loss": 0.021412912011146545, + "step": 177900 + }, + { + "epoch": 25.253371185237757, + "grad_norm": 4.348742961883545, + "learning_rate": 7.475713271823988e-05, + "loss": 0.01795404255390167, + "step": 177910 + }, + { + "epoch": 25.254790631653655, + "grad_norm": 6.2798638343811035, + "learning_rate": 7.4755713271824e-05, + "loss": 0.009177735447883606, + "step": 177920 + }, + { + "epoch": 25.256210078069554, + "grad_norm": 8.514674186706543, + "learning_rate": 7.475429382540809e-05, + "loss": 0.017040459811687468, + "step": 177930 + }, + { + "epoch": 25.257629524485452, + "grad_norm": 2.7489349842071533, + "learning_rate": 7.47528743789922e-05, + "loss": 0.0093565434217453, + "step": 177940 + }, + { + "epoch": 25.259048970901347, + "grad_norm": 0.013862254098057747, + "learning_rate": 7.47514549325763e-05, + "loss": 0.005216912552714348, + "step": 177950 + }, + { + "epoch": 25.260468417317245, + "grad_norm": 0.03947428613901138, + "learning_rate": 7.47500354861604e-05, + "loss": 
0.004667390137910843, + "step": 177960 + }, + { + "epoch": 25.261887863733143, + "grad_norm": 16.694896697998047, + "learning_rate": 7.47486160397445e-05, + "loss": 0.018578889966011047, + "step": 177970 + }, + { + "epoch": 25.26330731014904, + "grad_norm": 0.08032336831092834, + "learning_rate": 7.47471965933286e-05, + "loss": 0.0032907668501138687, + "step": 177980 + }, + { + "epoch": 25.26472675656494, + "grad_norm": 0.3080384433269501, + "learning_rate": 7.47457771469127e-05, + "loss": 0.001094098761677742, + "step": 177990 + }, + { + "epoch": 25.26614620298084, + "grad_norm": 0.018115438520908356, + "learning_rate": 7.474435770049681e-05, + "loss": 0.03822220861911774, + "step": 178000 + }, + { + "epoch": 25.26614620298084, + "eval_accuracy": 0.9898264131747949, + "eval_loss": 0.04019980877637863, + "eval_runtime": 34.4308, + "eval_samples_per_second": 456.771, + "eval_steps_per_second": 14.29, + "step": 178000 + }, + { + "epoch": 25.267565649396737, + "grad_norm": 6.488519668579102, + "learning_rate": 7.474293825408091e-05, + "loss": 0.008816338330507278, + "step": 178010 + }, + { + "epoch": 25.26898509581263, + "grad_norm": 0.8103450536727905, + "learning_rate": 7.474151880766501e-05, + "loss": 0.006737771630287171, + "step": 178020 + }, + { + "epoch": 25.27040454222853, + "grad_norm": 0.26171043515205383, + "learning_rate": 7.474009936124912e-05, + "loss": 0.04256677031517029, + "step": 178030 + }, + { + "epoch": 25.271823988644428, + "grad_norm": 1.605383276939392, + "learning_rate": 7.473867991483322e-05, + "loss": 0.011130887269973754, + "step": 178040 + }, + { + "epoch": 25.273243435060326, + "grad_norm": 0.7999954223632812, + "learning_rate": 7.473726046841733e-05, + "loss": 0.010748135298490525, + "step": 178050 + }, + { + "epoch": 25.274662881476225, + "grad_norm": 0.006804846227169037, + "learning_rate": 7.473584102200141e-05, + "loss": 0.034029772877693175, + "step": 178060 + }, + { + "epoch": 25.276082327892123, + "grad_norm": 1.8623192310333252, 
+ "learning_rate": 7.473442157558552e-05, + "loss": 0.004102405905723572, + "step": 178070 + }, + { + "epoch": 25.27750177430802, + "grad_norm": 0.0293990820646286, + "learning_rate": 7.473300212916962e-05, + "loss": 0.013626775145530701, + "step": 178080 + }, + { + "epoch": 25.278921220723916, + "grad_norm": 3.447418212890625, + "learning_rate": 7.473158268275373e-05, + "loss": 0.012241517752408981, + "step": 178090 + }, + { + "epoch": 25.280340667139814, + "grad_norm": 0.03427115082740784, + "learning_rate": 7.473016323633784e-05, + "loss": 0.001579718291759491, + "step": 178100 + }, + { + "epoch": 25.281760113555713, + "grad_norm": 0.17126867175102234, + "learning_rate": 7.472874378992193e-05, + "loss": 0.026781368255615234, + "step": 178110 + }, + { + "epoch": 25.28317955997161, + "grad_norm": 0.701843798160553, + "learning_rate": 7.472732434350604e-05, + "loss": 0.0033408574759960174, + "step": 178120 + }, + { + "epoch": 25.28459900638751, + "grad_norm": 0.18805037438869476, + "learning_rate": 7.472590489709013e-05, + "loss": 0.009420453011989594, + "step": 178130 + }, + { + "epoch": 25.286018452803408, + "grad_norm": 0.011128490790724754, + "learning_rate": 7.472448545067425e-05, + "loss": 0.004626255109906196, + "step": 178140 + }, + { + "epoch": 25.287437899219306, + "grad_norm": 3.042506456375122, + "learning_rate": 7.472306600425834e-05, + "loss": 0.007917433977127075, + "step": 178150 + }, + { + "epoch": 25.2888573456352, + "grad_norm": 0.006364540662616491, + "learning_rate": 7.472164655784244e-05, + "loss": 0.014313438534736633, + "step": 178160 + }, + { + "epoch": 25.2902767920511, + "grad_norm": 2.647148609161377, + "learning_rate": 7.472022711142654e-05, + "loss": 0.019677142798900604, + "step": 178170 + }, + { + "epoch": 25.291696238466997, + "grad_norm": 0.013891726732254028, + "learning_rate": 7.471880766501065e-05, + "loss": 0.005915298312902451, + "step": 178180 + }, + { + "epoch": 25.293115684882896, + "grad_norm": 0.5962719917297363, + 
"learning_rate": 7.471738821859476e-05, + "loss": 0.07857401371002197, + "step": 178190 + }, + { + "epoch": 25.294535131298794, + "grad_norm": 13.079880714416504, + "learning_rate": 7.471596877217886e-05, + "loss": 0.017688582837581634, + "step": 178200 + }, + { + "epoch": 25.295954577714692, + "grad_norm": 2.362428665161133, + "learning_rate": 7.471454932576297e-05, + "loss": 0.004099174588918686, + "step": 178210 + }, + { + "epoch": 25.29737402413059, + "grad_norm": 1.9382761716842651, + "learning_rate": 7.471312987934705e-05, + "loss": 0.006340087205171585, + "step": 178220 + }, + { + "epoch": 25.298793470546485, + "grad_norm": 0.20858049392700195, + "learning_rate": 7.471171043293116e-05, + "loss": 0.006140478327870369, + "step": 178230 + }, + { + "epoch": 25.300212916962384, + "grad_norm": 8.342604637145996, + "learning_rate": 7.471029098651526e-05, + "loss": 0.023527058959007262, + "step": 178240 + }, + { + "epoch": 25.301632363378282, + "grad_norm": 18.527238845825195, + "learning_rate": 7.470887154009937e-05, + "loss": 0.053363317251205446, + "step": 178250 + }, + { + "epoch": 25.30305180979418, + "grad_norm": 19.245800018310547, + "learning_rate": 7.470745209368347e-05, + "loss": 0.04276447892189026, + "step": 178260 + }, + { + "epoch": 25.30447125621008, + "grad_norm": 0.029465187340974808, + "learning_rate": 7.470603264726757e-05, + "loss": 0.03063215911388397, + "step": 178270 + }, + { + "epoch": 25.305890702625977, + "grad_norm": 0.35916101932525635, + "learning_rate": 7.470461320085168e-05, + "loss": 0.0027552783489227296, + "step": 178280 + }, + { + "epoch": 25.307310149041875, + "grad_norm": 0.043592825531959534, + "learning_rate": 7.470319375443577e-05, + "loss": 0.022251568734645844, + "step": 178290 + }, + { + "epoch": 25.30872959545777, + "grad_norm": 7.234454154968262, + "learning_rate": 7.470177430801988e-05, + "loss": 0.031618863344192505, + "step": 178300 + }, + { + "epoch": 25.310149041873668, + "grad_norm": 5.121912002563477, + 
"learning_rate": 7.470035486160398e-05, + "loss": 0.037425887584686277, + "step": 178310 + }, + { + "epoch": 25.311568488289566, + "grad_norm": 12.64792537689209, + "learning_rate": 7.469893541518808e-05, + "loss": 0.017772296071052553, + "step": 178320 + }, + { + "epoch": 25.312987934705465, + "grad_norm": 2.0808417797088623, + "learning_rate": 7.469751596877218e-05, + "loss": 0.012282520532608032, + "step": 178330 + }, + { + "epoch": 25.314407381121363, + "grad_norm": 0.05998979136347771, + "learning_rate": 7.469609652235629e-05, + "loss": 0.047841215133666994, + "step": 178340 + }, + { + "epoch": 25.31582682753726, + "grad_norm": 0.7916275262832642, + "learning_rate": 7.469467707594039e-05, + "loss": 0.016629940271377562, + "step": 178350 + }, + { + "epoch": 25.31724627395316, + "grad_norm": 0.03510245308279991, + "learning_rate": 7.46932576295245e-05, + "loss": 0.006230897083878517, + "step": 178360 + }, + { + "epoch": 25.318665720369054, + "grad_norm": 0.01202579215168953, + "learning_rate": 7.46918381831086e-05, + "loss": 0.002487137168645859, + "step": 178370 + }, + { + "epoch": 25.320085166784953, + "grad_norm": 0.004697986878454685, + "learning_rate": 7.469041873669269e-05, + "loss": 0.0012678995728492737, + "step": 178380 + }, + { + "epoch": 25.32150461320085, + "grad_norm": 0.03375842794775963, + "learning_rate": 7.46889992902768e-05, + "loss": 0.004447629302740097, + "step": 178390 + }, + { + "epoch": 25.32292405961675, + "grad_norm": 0.02407890185713768, + "learning_rate": 7.46875798438609e-05, + "loss": 0.0688482642173767, + "step": 178400 + }, + { + "epoch": 25.324343506032648, + "grad_norm": 0.008337125182151794, + "learning_rate": 7.468616039744501e-05, + "loss": 0.014852511882781982, + "step": 178410 + }, + { + "epoch": 25.325762952448546, + "grad_norm": 0.42953604459762573, + "learning_rate": 7.46847409510291e-05, + "loss": 0.0029565125703811647, + "step": 178420 + }, + { + "epoch": 25.327182398864444, + "grad_norm": 4.332587718963623, + 
"learning_rate": 7.46833215046132e-05, + "loss": 0.00488160103559494, + "step": 178430 + }, + { + "epoch": 25.32860184528034, + "grad_norm": 0.10849611461162567, + "learning_rate": 7.46819020581973e-05, + "loss": 0.02495238482952118, + "step": 178440 + }, + { + "epoch": 25.330021291696237, + "grad_norm": 0.05418453365564346, + "learning_rate": 7.468048261178141e-05, + "loss": 0.01544126719236374, + "step": 178450 + }, + { + "epoch": 25.331440738112136, + "grad_norm": 0.0031717834062874317, + "learning_rate": 7.467906316536551e-05, + "loss": 0.030635115504264832, + "step": 178460 + }, + { + "epoch": 25.332860184528034, + "grad_norm": 0.04475431889295578, + "learning_rate": 7.467764371894961e-05, + "loss": 0.018678781390190125, + "step": 178470 + }, + { + "epoch": 25.334279630943932, + "grad_norm": 13.741936683654785, + "learning_rate": 7.467622427253372e-05, + "loss": 0.026824140548706056, + "step": 178480 + }, + { + "epoch": 25.33569907735983, + "grad_norm": 8.08503532409668, + "learning_rate": 7.467480482611782e-05, + "loss": 0.02451682984828949, + "step": 178490 + }, + { + "epoch": 25.33711852377573, + "grad_norm": 13.073064804077148, + "learning_rate": 7.467338537970193e-05, + "loss": 0.021388554573059083, + "step": 178500 + }, + { + "epoch": 25.33711852377573, + "eval_accuracy": 0.9869015069625485, + "eval_loss": 0.04963162541389465, + "eval_runtime": 33.6369, + "eval_samples_per_second": 467.552, + "eval_steps_per_second": 14.627, + "step": 178500 + }, + { + "epoch": 25.338537970191624, + "grad_norm": 0.023830002173781395, + "learning_rate": 7.467196593328602e-05, + "loss": 0.006773306429386139, + "step": 178510 + }, + { + "epoch": 25.339957416607522, + "grad_norm": 0.11956886202096939, + "learning_rate": 7.467054648687012e-05, + "loss": 0.04358331561088562, + "step": 178520 + }, + { + "epoch": 25.34137686302342, + "grad_norm": 0.4194454550743103, + "learning_rate": 7.466912704045422e-05, + "loss": 0.010776673257350922, + "step": 178530 + }, + { + "epoch": 
25.34279630943932, + "grad_norm": 0.4637395143508911, + "learning_rate": 7.466770759403833e-05, + "loss": 0.010551182925701142, + "step": 178540 + }, + { + "epoch": 25.344215755855217, + "grad_norm": 0.7976447343826294, + "learning_rate": 7.466628814762243e-05, + "loss": 0.0015708636492490768, + "step": 178550 + }, + { + "epoch": 25.345635202271115, + "grad_norm": 0.6929312944412231, + "learning_rate": 7.466486870120654e-05, + "loss": 0.01298723965883255, + "step": 178560 + }, + { + "epoch": 25.347054648687013, + "grad_norm": 0.17178545892238617, + "learning_rate": 7.466344925479064e-05, + "loss": 0.004111459851264954, + "step": 178570 + }, + { + "epoch": 25.348474095102908, + "grad_norm": 0.1511676013469696, + "learning_rate": 7.466202980837473e-05, + "loss": 0.018685771524906157, + "step": 178580 + }, + { + "epoch": 25.349893541518806, + "grad_norm": 0.8036438226699829, + "learning_rate": 7.466061036195884e-05, + "loss": 0.00201040580868721, + "step": 178590 + }, + { + "epoch": 25.351312987934705, + "grad_norm": 5.287117958068848, + "learning_rate": 7.465919091554294e-05, + "loss": 0.024046140909194946, + "step": 178600 + }, + { + "epoch": 25.352732434350603, + "grad_norm": 0.37541306018829346, + "learning_rate": 7.465777146912705e-05, + "loss": 0.011264415085315704, + "step": 178610 + }, + { + "epoch": 25.3541518807665, + "grad_norm": 0.056379083544015884, + "learning_rate": 7.465635202271115e-05, + "loss": 0.04053153991699219, + "step": 178620 + }, + { + "epoch": 25.3555713271824, + "grad_norm": 7.385488033294678, + "learning_rate": 7.465493257629525e-05, + "loss": 0.011267174035310745, + "step": 178630 + }, + { + "epoch": 25.356990773598298, + "grad_norm": 0.00989037100225687, + "learning_rate": 7.465351312987934e-05, + "loss": 0.007854482531547547, + "step": 178640 + }, + { + "epoch": 25.358410220014193, + "grad_norm": 1.4001903533935547, + "learning_rate": 7.465209368346346e-05, + "loss": 0.0224483460187912, + "step": 178650 + }, + { + "epoch": 
25.35982966643009, + "grad_norm": 0.5095293521881104, + "learning_rate": 7.465067423704755e-05, + "loss": 0.0370639443397522, + "step": 178660 + }, + { + "epoch": 25.36124911284599, + "grad_norm": 0.03634141758084297, + "learning_rate": 7.464925479063166e-05, + "loss": 0.0031753618270158766, + "step": 178670 + }, + { + "epoch": 25.362668559261888, + "grad_norm": 4.425975799560547, + "learning_rate": 7.464783534421576e-05, + "loss": 0.022585457563400267, + "step": 178680 + }, + { + "epoch": 25.364088005677786, + "grad_norm": 8.515546798706055, + "learning_rate": 7.464641589779986e-05, + "loss": 0.01267334669828415, + "step": 178690 + }, + { + "epoch": 25.365507452093684, + "grad_norm": 0.45367899537086487, + "learning_rate": 7.464499645138397e-05, + "loss": 0.00992943048477173, + "step": 178700 + }, + { + "epoch": 25.366926898509583, + "grad_norm": 5.9111762046813965, + "learning_rate": 7.464357700496807e-05, + "loss": 0.03179035186767578, + "step": 178710 + }, + { + "epoch": 25.368346344925477, + "grad_norm": 0.06397988647222519, + "learning_rate": 7.464215755855218e-05, + "loss": 0.0699587345123291, + "step": 178720 + }, + { + "epoch": 25.369765791341376, + "grad_norm": 5.676983833312988, + "learning_rate": 7.464073811213626e-05, + "loss": 0.004925765842199325, + "step": 178730 + }, + { + "epoch": 25.371185237757274, + "grad_norm": 0.01177477091550827, + "learning_rate": 7.463931866572037e-05, + "loss": 0.01276625394821167, + "step": 178740 + }, + { + "epoch": 25.372604684173172, + "grad_norm": 0.012948267161846161, + "learning_rate": 7.463789921930447e-05, + "loss": 0.027036419510841368, + "step": 178750 + }, + { + "epoch": 25.37402413058907, + "grad_norm": 0.20030640065670013, + "learning_rate": 7.463647977288858e-05, + "loss": 0.055357450246810914, + "step": 178760 + }, + { + "epoch": 25.37544357700497, + "grad_norm": 0.46566349267959595, + "learning_rate": 7.463506032647268e-05, + "loss": 0.0048358868807554245, + "step": 178770 + }, + { + "epoch": 
25.376863023420867, + "grad_norm": 0.02517218515276909, + "learning_rate": 7.463364088005678e-05, + "loss": 0.020268794894218446, + "step": 178780 + }, + { + "epoch": 25.378282469836762, + "grad_norm": 0.01399201713502407, + "learning_rate": 7.463222143364089e-05, + "loss": 0.006105394661426544, + "step": 178790 + }, + { + "epoch": 25.37970191625266, + "grad_norm": 0.027102502062916756, + "learning_rate": 7.463080198722498e-05, + "loss": 0.017883066833019257, + "step": 178800 + }, + { + "epoch": 25.38112136266856, + "grad_norm": 0.7600093483924866, + "learning_rate": 7.46293825408091e-05, + "loss": 0.06457518935203552, + "step": 178810 + }, + { + "epoch": 25.382540809084457, + "grad_norm": 11.45730209350586, + "learning_rate": 7.462796309439319e-05, + "loss": 0.023484429717063902, + "step": 178820 + }, + { + "epoch": 25.383960255500355, + "grad_norm": 4.682811737060547, + "learning_rate": 7.462654364797729e-05, + "loss": 0.009061050415039063, + "step": 178830 + }, + { + "epoch": 25.385379701916253, + "grad_norm": 1.2764064073562622, + "learning_rate": 7.462512420156139e-05, + "loss": 0.0035672053694725037, + "step": 178840 + }, + { + "epoch": 25.386799148332152, + "grad_norm": 10.941265106201172, + "learning_rate": 7.46237047551455e-05, + "loss": 0.058092916011810304, + "step": 178850 + }, + { + "epoch": 25.388218594748047, + "grad_norm": 10.942096710205078, + "learning_rate": 7.46222853087296e-05, + "loss": 0.03376610279083252, + "step": 178860 + }, + { + "epoch": 25.389638041163945, + "grad_norm": 0.03621029108762741, + "learning_rate": 7.46208658623137e-05, + "loss": 0.020803767442703246, + "step": 178870 + }, + { + "epoch": 25.391057487579843, + "grad_norm": 0.14712239801883698, + "learning_rate": 7.46194464158978e-05, + "loss": 0.01574898511171341, + "step": 178880 + }, + { + "epoch": 25.39247693399574, + "grad_norm": 8.039456367492676, + "learning_rate": 7.46180269694819e-05, + "loss": 0.03067460358142853, + "step": 178890 + }, + { + "epoch": 
25.39389638041164, + "grad_norm": 0.03159085288643837, + "learning_rate": 7.461660752306601e-05, + "loss": 0.020564551651477813, + "step": 178900 + }, + { + "epoch": 25.395315826827538, + "grad_norm": 0.020784901455044746, + "learning_rate": 7.461518807665011e-05, + "loss": 0.0013353623449802398, + "step": 178910 + }, + { + "epoch": 25.396735273243436, + "grad_norm": 0.04087957739830017, + "learning_rate": 7.461376863023422e-05, + "loss": 0.019830669462680816, + "step": 178920 + }, + { + "epoch": 25.39815471965933, + "grad_norm": 7.553539752960205, + "learning_rate": 7.461234918381832e-05, + "loss": 0.026098597049713134, + "step": 178930 + }, + { + "epoch": 25.39957416607523, + "grad_norm": 0.46314722299575806, + "learning_rate": 7.461092973740241e-05, + "loss": 0.01915777623653412, + "step": 178940 + }, + { + "epoch": 25.400993612491128, + "grad_norm": 1.7408502101898193, + "learning_rate": 7.460951029098651e-05, + "loss": 0.010671199858188629, + "step": 178950 + }, + { + "epoch": 25.402413058907026, + "grad_norm": 0.09332727640867233, + "learning_rate": 7.460809084457062e-05, + "loss": 0.008747979998588562, + "step": 178960 + }, + { + "epoch": 25.403832505322924, + "grad_norm": 1.0955272912979126, + "learning_rate": 7.460667139815472e-05, + "loss": 0.01663207858800888, + "step": 178970 + }, + { + "epoch": 25.405251951738823, + "grad_norm": 1.037175178527832, + "learning_rate": 7.460525195173883e-05, + "loss": 0.010167185962200165, + "step": 178980 + }, + { + "epoch": 25.40667139815472, + "grad_norm": 0.06694335490465164, + "learning_rate": 7.460383250532293e-05, + "loss": 0.002763127163052559, + "step": 178990 + }, + { + "epoch": 25.408090844570616, + "grad_norm": 0.1025523766875267, + "learning_rate": 7.460241305890703e-05, + "loss": 0.028108128905296327, + "step": 179000 + }, + { + "epoch": 25.408090844570616, + "eval_accuracy": 0.9870922617155211, + "eval_loss": 0.05028115585446358, + "eval_runtime": 33.1153, + "eval_samples_per_second": 474.917, + 
"eval_steps_per_second": 14.857, + "step": 179000 + }, + { + "epoch": 25.409510290986514, + "grad_norm": 0.39928001165390015, + "learning_rate": 7.460099361249114e-05, + "loss": 0.02408731132745743, + "step": 179010 + }, + { + "epoch": 25.410929737402412, + "grad_norm": 7.591883182525635, + "learning_rate": 7.459957416607523e-05, + "loss": 0.02525644898414612, + "step": 179020 + }, + { + "epoch": 25.41234918381831, + "grad_norm": 0.20059002935886383, + "learning_rate": 7.459815471965935e-05, + "loss": 0.014850631356239319, + "step": 179030 + }, + { + "epoch": 25.41376863023421, + "grad_norm": 0.5195045471191406, + "learning_rate": 7.459673527324343e-05, + "loss": 0.0044113948941230776, + "step": 179040 + }, + { + "epoch": 25.415188076650107, + "grad_norm": 0.034902915358543396, + "learning_rate": 7.459531582682754e-05, + "loss": 0.05072693824768067, + "step": 179050 + }, + { + "epoch": 25.416607523066006, + "grad_norm": 0.28236207365989685, + "learning_rate": 7.459389638041164e-05, + "loss": 0.002547471970319748, + "step": 179060 + }, + { + "epoch": 25.4180269694819, + "grad_norm": 0.7457674145698547, + "learning_rate": 7.459247693399575e-05, + "loss": 0.013233822584152222, + "step": 179070 + }, + { + "epoch": 25.4194464158978, + "grad_norm": 0.11938507109880447, + "learning_rate": 7.459105748757985e-05, + "loss": 0.019518765807151794, + "step": 179080 + }, + { + "epoch": 25.420865862313697, + "grad_norm": 3.9229867458343506, + "learning_rate": 7.458963804116394e-05, + "loss": 0.030642420053482056, + "step": 179090 + }, + { + "epoch": 25.422285308729595, + "grad_norm": 6.860047817230225, + "learning_rate": 7.458821859474805e-05, + "loss": 0.012601245939731599, + "step": 179100 + }, + { + "epoch": 25.423704755145494, + "grad_norm": 0.4756755530834198, + "learning_rate": 7.458679914833215e-05, + "loss": 0.015806837379932402, + "step": 179110 + }, + { + "epoch": 25.425124201561392, + "grad_norm": 0.1906869113445282, + "learning_rate": 7.458537970191626e-05, + "loss": 
0.02159782499074936, + "step": 179120 + }, + { + "epoch": 25.42654364797729, + "grad_norm": 0.031273841857910156, + "learning_rate": 7.458396025550036e-05, + "loss": 0.0011338144540786743, + "step": 179130 + }, + { + "epoch": 25.427963094393185, + "grad_norm": 0.08604549616575241, + "learning_rate": 7.458254080908446e-05, + "loss": 0.04943074882030487, + "step": 179140 + }, + { + "epoch": 25.429382540809083, + "grad_norm": 11.49404525756836, + "learning_rate": 7.458112136266855e-05, + "loss": 0.007327257096767426, + "step": 179150 + }, + { + "epoch": 25.43080198722498, + "grad_norm": 15.202176094055176, + "learning_rate": 7.457970191625267e-05, + "loss": 0.03841550350189209, + "step": 179160 + }, + { + "epoch": 25.43222143364088, + "grad_norm": 3.8403332233428955, + "learning_rate": 7.457828246983676e-05, + "loss": 0.024710839986801146, + "step": 179170 + }, + { + "epoch": 25.433640880056778, + "grad_norm": 0.5462737679481506, + "learning_rate": 7.457686302342087e-05, + "loss": 0.04543546438217163, + "step": 179180 + }, + { + "epoch": 25.435060326472676, + "grad_norm": 0.533586859703064, + "learning_rate": 7.457544357700497e-05, + "loss": 0.002545740455389023, + "step": 179190 + }, + { + "epoch": 25.436479772888575, + "grad_norm": 1.6557437181472778, + "learning_rate": 7.457402413058907e-05, + "loss": 0.001211695373058319, + "step": 179200 + }, + { + "epoch": 25.43789921930447, + "grad_norm": 0.06100745126605034, + "learning_rate": 7.457260468417318e-05, + "loss": 0.007412203401327133, + "step": 179210 + }, + { + "epoch": 25.439318665720368, + "grad_norm": 13.958776473999023, + "learning_rate": 7.457132718239886e-05, + "loss": 0.04113844037055969, + "step": 179220 + }, + { + "epoch": 25.440738112136266, + "grad_norm": 0.12970346212387085, + "learning_rate": 7.456990773598298e-05, + "loss": 0.021130159497261047, + "step": 179230 + }, + { + "epoch": 25.442157558552164, + "grad_norm": 0.04387388005852699, + "learning_rate": 7.456848828956707e-05, + "loss": 
0.007466824352741241, + "step": 179240 + }, + { + "epoch": 25.443577004968063, + "grad_norm": 5.08711576461792, + "learning_rate": 7.456706884315118e-05, + "loss": 0.01601400226354599, + "step": 179250 + }, + { + "epoch": 25.44499645138396, + "grad_norm": 0.12018890678882599, + "learning_rate": 7.456564939673528e-05, + "loss": 0.006733101606369018, + "step": 179260 + }, + { + "epoch": 25.44641589779986, + "grad_norm": 1.7128828763961792, + "learning_rate": 7.456422995031938e-05, + "loss": 0.00431697815656662, + "step": 179270 + }, + { + "epoch": 25.447835344215754, + "grad_norm": 0.053284674882888794, + "learning_rate": 7.456281050390348e-05, + "loss": 0.0012097828090190887, + "step": 179280 + }, + { + "epoch": 25.449254790631652, + "grad_norm": 0.2677818238735199, + "learning_rate": 7.456139105748759e-05, + "loss": 0.01239481270313263, + "step": 179290 + }, + { + "epoch": 25.45067423704755, + "grad_norm": 0.06686796993017197, + "learning_rate": 7.455997161107168e-05, + "loss": 0.014288446307182312, + "step": 179300 + }, + { + "epoch": 25.45209368346345, + "grad_norm": 10.13921070098877, + "learning_rate": 7.45585521646558e-05, + "loss": 0.019560638070106506, + "step": 179310 + }, + { + "epoch": 25.453513129879347, + "grad_norm": 0.03497035428881645, + "learning_rate": 7.455713271823989e-05, + "loss": 0.026509669423103333, + "step": 179320 + }, + { + "epoch": 25.454932576295246, + "grad_norm": 0.019185800105333328, + "learning_rate": 7.455571327182399e-05, + "loss": 0.006622491031885147, + "step": 179330 + }, + { + "epoch": 25.456352022711144, + "grad_norm": 0.9168856143951416, + "learning_rate": 7.45542938254081e-05, + "loss": 0.047225314378738406, + "step": 179340 + }, + { + "epoch": 25.45777146912704, + "grad_norm": 0.013608884997665882, + "learning_rate": 7.45528743789922e-05, + "loss": 0.03336382508277893, + "step": 179350 + }, + { + "epoch": 25.459190915542937, + "grad_norm": 0.04201589152216911, + "learning_rate": 7.455145493257631e-05, + "loss": 
0.014812839031219483, + "step": 179360 + }, + { + "epoch": 25.460610361958835, + "grad_norm": 0.027195554226636887, + "learning_rate": 7.455003548616039e-05, + "loss": 0.006506334990262985, + "step": 179370 + }, + { + "epoch": 25.462029808374734, + "grad_norm": 0.07601962238550186, + "learning_rate": 7.45486160397445e-05, + "loss": 0.0007668506354093551, + "step": 179380 + }, + { + "epoch": 25.463449254790632, + "grad_norm": 3.8264167308807373, + "learning_rate": 7.45471965933286e-05, + "loss": 0.021552176773548128, + "step": 179390 + }, + { + "epoch": 25.46486870120653, + "grad_norm": 0.04703402519226074, + "learning_rate": 7.454577714691271e-05, + "loss": 0.002871483936905861, + "step": 179400 + }, + { + "epoch": 25.46628814762243, + "grad_norm": 0.24255910515785217, + "learning_rate": 7.454435770049681e-05, + "loss": 0.010494459420442581, + "step": 179410 + }, + { + "epoch": 25.467707594038323, + "grad_norm": 0.11832259595394135, + "learning_rate": 7.454293825408091e-05, + "loss": 0.02506493330001831, + "step": 179420 + }, + { + "epoch": 25.46912704045422, + "grad_norm": 2.1287312507629395, + "learning_rate": 7.454151880766502e-05, + "loss": 0.02138691544532776, + "step": 179430 + }, + { + "epoch": 25.47054648687012, + "grad_norm": 0.343986451625824, + "learning_rate": 7.454009936124912e-05, + "loss": 0.012413951754570007, + "step": 179440 + }, + { + "epoch": 25.471965933286018, + "grad_norm": 4.449080944061279, + "learning_rate": 7.453867991483323e-05, + "loss": 0.0026068843901157377, + "step": 179450 + }, + { + "epoch": 25.473385379701917, + "grad_norm": 0.15330153703689575, + "learning_rate": 7.453726046841732e-05, + "loss": 0.025967541337013244, + "step": 179460 + }, + { + "epoch": 25.474804826117815, + "grad_norm": 0.6993677020072937, + "learning_rate": 7.453584102200142e-05, + "loss": 0.005336512997746468, + "step": 179470 + }, + { + "epoch": 25.476224272533713, + "grad_norm": 3.798711061477661, + "learning_rate": 7.453442157558552e-05, + "loss": 
0.0028274387121200563, + "step": 179480 + }, + { + "epoch": 25.477643718949608, + "grad_norm": 23.057825088500977, + "learning_rate": 7.453300212916963e-05, + "loss": 0.05611908435821533, + "step": 179490 + }, + { + "epoch": 25.479063165365506, + "grad_norm": 0.1809503436088562, + "learning_rate": 7.453158268275373e-05, + "loss": 0.0015491347759962082, + "step": 179500 + }, + { + "epoch": 25.479063165365506, + "eval_accuracy": 0.9822598079735487, + "eval_loss": 0.0715729370713234, + "eval_runtime": 33.2215, + "eval_samples_per_second": 473.398, + "eval_steps_per_second": 14.81, + "step": 179500 + }, + { + "epoch": 25.480482611781405, + "grad_norm": 0.8529840111732483, + "learning_rate": 7.453016323633784e-05, + "loss": 0.0038203656673431396, + "step": 179510 + }, + { + "epoch": 25.481902058197303, + "grad_norm": 0.6859683394432068, + "learning_rate": 7.452874378992193e-05, + "loss": 0.03406044840812683, + "step": 179520 + }, + { + "epoch": 25.4833215046132, + "grad_norm": 1.2647279500961304, + "learning_rate": 7.452732434350603e-05, + "loss": 0.002039032056927681, + "step": 179530 + }, + { + "epoch": 25.4847409510291, + "grad_norm": 0.0566161572933197, + "learning_rate": 7.452590489709014e-05, + "loss": 0.005181047692894935, + "step": 179540 + }, + { + "epoch": 25.486160397444998, + "grad_norm": 0.1946851760149002, + "learning_rate": 7.452448545067424e-05, + "loss": 0.021260997653007506, + "step": 179550 + }, + { + "epoch": 25.487579843860892, + "grad_norm": 0.017168091610074043, + "learning_rate": 7.452306600425835e-05, + "loss": 0.004287741333246231, + "step": 179560 + }, + { + "epoch": 25.48899929027679, + "grad_norm": 0.021301057189702988, + "learning_rate": 7.452164655784245e-05, + "loss": 0.0013755429536104202, + "step": 179570 + }, + { + "epoch": 25.49041873669269, + "grad_norm": 0.1608135849237442, + "learning_rate": 7.452022711142655e-05, + "loss": 0.024828551709651946, + "step": 179580 + }, + { + "epoch": 25.491838183108587, + "grad_norm": 
0.4931938648223877, + "learning_rate": 7.451880766501064e-05, + "loss": 0.005660456418991089, + "step": 179590 + }, + { + "epoch": 25.493257629524486, + "grad_norm": 11.662925720214844, + "learning_rate": 7.451738821859475e-05, + "loss": 0.021727830171585083, + "step": 179600 + }, + { + "epoch": 25.494677075940384, + "grad_norm": 0.011983329430222511, + "learning_rate": 7.451596877217885e-05, + "loss": 0.012643672525882721, + "step": 179610 + }, + { + "epoch": 25.496096522356282, + "grad_norm": 0.2905292809009552, + "learning_rate": 7.451454932576296e-05, + "loss": 0.007094824314117431, + "step": 179620 + }, + { + "epoch": 25.497515968772177, + "grad_norm": 0.837986171245575, + "learning_rate": 7.451312987934706e-05, + "loss": 0.02105049043893814, + "step": 179630 + }, + { + "epoch": 25.498935415188075, + "grad_norm": 0.01625245250761509, + "learning_rate": 7.451171043293116e-05, + "loss": 0.03656308054924011, + "step": 179640 + }, + { + "epoch": 25.500354861603974, + "grad_norm": 0.5474497675895691, + "learning_rate": 7.451029098651527e-05, + "loss": 0.04430088102817535, + "step": 179650 + }, + { + "epoch": 25.501774308019872, + "grad_norm": 3.1805567741394043, + "learning_rate": 7.450887154009937e-05, + "loss": 0.008376038074493409, + "step": 179660 + }, + { + "epoch": 25.50319375443577, + "grad_norm": 8.11063289642334, + "learning_rate": 7.450745209368348e-05, + "loss": 0.03653077483177185, + "step": 179670 + }, + { + "epoch": 25.50461320085167, + "grad_norm": 0.14486612379550934, + "learning_rate": 7.450603264726756e-05, + "loss": 0.012043663114309312, + "step": 179680 + }, + { + "epoch": 25.506032647267567, + "grad_norm": 0.062996506690979, + "learning_rate": 7.450461320085167e-05, + "loss": 0.009694510698318481, + "step": 179690 + }, + { + "epoch": 25.50745209368346, + "grad_norm": 0.04224622622132301, + "learning_rate": 7.450319375443577e-05, + "loss": 0.0012867387384176254, + "step": 179700 + }, + { + "epoch": 25.50887154009936, + "grad_norm": 
0.02752966620028019, + "learning_rate": 7.450177430801988e-05, + "loss": 0.012078963965177537, + "step": 179710 + }, + { + "epoch": 25.51029098651526, + "grad_norm": 0.08715686947107315, + "learning_rate": 7.450035486160398e-05, + "loss": 0.0025779686868190764, + "step": 179720 + }, + { + "epoch": 25.511710432931157, + "grad_norm": 0.029118185862898827, + "learning_rate": 7.449893541518807e-05, + "loss": 0.030601164698600768, + "step": 179730 + }, + { + "epoch": 25.513129879347055, + "grad_norm": 2.260190010070801, + "learning_rate": 7.449751596877219e-05, + "loss": 0.007078541070222854, + "step": 179740 + }, + { + "epoch": 25.514549325762953, + "grad_norm": 24.11655616760254, + "learning_rate": 7.449609652235628e-05, + "loss": 0.04562524557113647, + "step": 179750 + }, + { + "epoch": 25.51596877217885, + "grad_norm": 0.0746900737285614, + "learning_rate": 7.44946770759404e-05, + "loss": 0.04080681204795837, + "step": 179760 + }, + { + "epoch": 25.517388218594746, + "grad_norm": 0.05191466957330704, + "learning_rate": 7.449325762952449e-05, + "loss": 0.021039208769798277, + "step": 179770 + }, + { + "epoch": 25.518807665010645, + "grad_norm": 0.03088299371302128, + "learning_rate": 7.449183818310859e-05, + "loss": 0.037669619917869566, + "step": 179780 + }, + { + "epoch": 25.520227111426543, + "grad_norm": 14.2377290725708, + "learning_rate": 7.449041873669269e-05, + "loss": 0.032105231285095216, + "step": 179790 + }, + { + "epoch": 25.52164655784244, + "grad_norm": 0.5570749044418335, + "learning_rate": 7.44889992902768e-05, + "loss": 0.00255543552339077, + "step": 179800 + }, + { + "epoch": 25.52306600425834, + "grad_norm": 0.5801060199737549, + "learning_rate": 7.44875798438609e-05, + "loss": 0.0005924213677644729, + "step": 179810 + }, + { + "epoch": 25.524485450674238, + "grad_norm": 10.7149658203125, + "learning_rate": 7.4486160397445e-05, + "loss": 0.005726019665598869, + "step": 179820 + }, + { + "epoch": 25.525904897090136, + "grad_norm": 
0.4844067692756653, + "learning_rate": 7.44847409510291e-05, + "loss": 0.01844250559806824, + "step": 179830 + }, + { + "epoch": 25.52732434350603, + "grad_norm": 0.005730295553803444, + "learning_rate": 7.44833215046132e-05, + "loss": 0.03920533061027527, + "step": 179840 + }, + { + "epoch": 25.52874378992193, + "grad_norm": 0.005780892912298441, + "learning_rate": 7.448190205819731e-05, + "loss": 0.03683828711509705, + "step": 179850 + }, + { + "epoch": 25.530163236337827, + "grad_norm": 0.12365595996379852, + "learning_rate": 7.448048261178141e-05, + "loss": 0.023146805167198182, + "step": 179860 + }, + { + "epoch": 25.531582682753726, + "grad_norm": 0.07882615923881531, + "learning_rate": 7.447906316536552e-05, + "loss": 0.024275703728199004, + "step": 179870 + }, + { + "epoch": 25.533002129169624, + "grad_norm": 0.03749169409275055, + "learning_rate": 7.44776437189496e-05, + "loss": 0.07599815130233764, + "step": 179880 + }, + { + "epoch": 25.534421575585522, + "grad_norm": 11.083174705505371, + "learning_rate": 7.447622427253371e-05, + "loss": 0.020051510632038118, + "step": 179890 + }, + { + "epoch": 25.53584102200142, + "grad_norm": 12.049695014953613, + "learning_rate": 7.447480482611781e-05, + "loss": 0.02493106424808502, + "step": 179900 + }, + { + "epoch": 25.537260468417315, + "grad_norm": 0.44503772258758545, + "learning_rate": 7.447338537970192e-05, + "loss": 0.006494826078414917, + "step": 179910 + }, + { + "epoch": 25.538679914833214, + "grad_norm": 0.026505298912525177, + "learning_rate": 7.447196593328602e-05, + "loss": 0.016276293992996217, + "step": 179920 + }, + { + "epoch": 25.540099361249112, + "grad_norm": 0.018481822684407234, + "learning_rate": 7.447054648687013e-05, + "loss": 0.006986555457115173, + "step": 179930 + }, + { + "epoch": 25.54151880766501, + "grad_norm": 0.1404401659965515, + "learning_rate": 7.446912704045423e-05, + "loss": 0.032223135232925415, + "step": 179940 + }, + { + "epoch": 25.54293825408091, + "grad_norm": 
0.024211108684539795, + "learning_rate": 7.446770759403833e-05, + "loss": 0.00860668122768402, + "step": 179950 + }, + { + "epoch": 25.544357700496807, + "grad_norm": 11.628357887268066, + "learning_rate": 7.446628814762244e-05, + "loss": 0.011537303030490876, + "step": 179960 + }, + { + "epoch": 25.545777146912705, + "grad_norm": 0.35813575983047485, + "learning_rate": 7.446486870120653e-05, + "loss": 0.004934598132967949, + "step": 179970 + }, + { + "epoch": 25.5471965933286, + "grad_norm": 5.8434624671936035, + "learning_rate": 7.446344925479064e-05, + "loss": 0.034967026114463805, + "step": 179980 + }, + { + "epoch": 25.5486160397445, + "grad_norm": 1.3131963014602661, + "learning_rate": 7.446202980837473e-05, + "loss": 0.012502413988113404, + "step": 179990 + }, + { + "epoch": 25.550035486160397, + "grad_norm": 0.0213746540248394, + "learning_rate": 7.446061036195884e-05, + "loss": 0.05780455470085144, + "step": 180000 + }, + { + "epoch": 25.550035486160397, + "eval_accuracy": 0.9882367902333566, + "eval_loss": 0.04046269506216049, + "eval_runtime": 33.2615, + "eval_samples_per_second": 472.828, + "eval_steps_per_second": 14.792, + "step": 180000 + }, + { + "epoch": 25.551454932576295, + "grad_norm": 0.15020737051963806, + "learning_rate": 7.445919091554294e-05, + "loss": 0.005047255754470825, + "step": 180010 + }, + { + "epoch": 25.552874378992193, + "grad_norm": 0.716580867767334, + "learning_rate": 7.445777146912705e-05, + "loss": 0.022347766160964965, + "step": 180020 + }, + { + "epoch": 25.55429382540809, + "grad_norm": 0.10273517668247223, + "learning_rate": 7.445635202271114e-05, + "loss": 0.058278721570968625, + "step": 180030 + }, + { + "epoch": 25.55571327182399, + "grad_norm": 2.252950429916382, + "learning_rate": 7.445493257629524e-05, + "loss": 0.012786373496055603, + "step": 180040 + }, + { + "epoch": 25.557132718239885, + "grad_norm": 0.028423108160495758, + "learning_rate": 7.445351312987935e-05, + "loss": 0.0017830252647399903, + "step": 
180050 + }, + { + "epoch": 25.558552164655783, + "grad_norm": 1.8051271438598633, + "learning_rate": 7.445209368346345e-05, + "loss": 0.0025440797209739683, + "step": 180060 + }, + { + "epoch": 25.55997161107168, + "grad_norm": 0.01650075986981392, + "learning_rate": 7.445067423704756e-05, + "loss": 0.0468770444393158, + "step": 180070 + }, + { + "epoch": 25.56139105748758, + "grad_norm": 0.006516004912555218, + "learning_rate": 7.444925479063166e-05, + "loss": 0.012915173172950744, + "step": 180080 + }, + { + "epoch": 25.562810503903478, + "grad_norm": 0.0053267702460289, + "learning_rate": 7.444783534421576e-05, + "loss": 0.019885960221290588, + "step": 180090 + }, + { + "epoch": 25.564229950319376, + "grad_norm": 0.5405298471450806, + "learning_rate": 7.444641589779985e-05, + "loss": 0.010451976954936982, + "step": 180100 + }, + { + "epoch": 25.565649396735274, + "grad_norm": 0.46959030628204346, + "learning_rate": 7.444499645138396e-05, + "loss": 0.0017228391021490096, + "step": 180110 + }, + { + "epoch": 25.56706884315117, + "grad_norm": 0.13083001971244812, + "learning_rate": 7.444357700496806e-05, + "loss": 0.019055040180683137, + "step": 180120 + }, + { + "epoch": 25.568488289567068, + "grad_norm": 0.6801934838294983, + "learning_rate": 7.444215755855217e-05, + "loss": 0.003512929007411003, + "step": 180130 + }, + { + "epoch": 25.569907735982966, + "grad_norm": 0.009140847250819206, + "learning_rate": 7.444073811213627e-05, + "loss": 0.002552158012986183, + "step": 180140 + }, + { + "epoch": 25.571327182398864, + "grad_norm": 0.25207245349884033, + "learning_rate": 7.443931866572037e-05, + "loss": 0.00436212383210659, + "step": 180150 + }, + { + "epoch": 25.572746628814762, + "grad_norm": 0.7457872033119202, + "learning_rate": 7.443789921930448e-05, + "loss": 0.009685005247592925, + "step": 180160 + }, + { + "epoch": 25.57416607523066, + "grad_norm": 0.00849017035216093, + "learning_rate": 7.443647977288858e-05, + "loss": 0.012156320363283157, + "step": 
180170 + }, + { + "epoch": 25.57558552164656, + "grad_norm": 0.028729649260640144, + "learning_rate": 7.443506032647269e-05, + "loss": 0.02413068413734436, + "step": 180180 + }, + { + "epoch": 25.577004968062454, + "grad_norm": 4.667131423950195, + "learning_rate": 7.443364088005677e-05, + "loss": 0.0029822651296854017, + "step": 180190 + }, + { + "epoch": 25.578424414478352, + "grad_norm": 0.11157491058111191, + "learning_rate": 7.443222143364088e-05, + "loss": 0.0075209617614746095, + "step": 180200 + }, + { + "epoch": 25.57984386089425, + "grad_norm": 0.2470676153898239, + "learning_rate": 7.443080198722498e-05, + "loss": 0.007403917610645294, + "step": 180210 + }, + { + "epoch": 25.58126330731015, + "grad_norm": 0.04640617221593857, + "learning_rate": 7.442938254080909e-05, + "loss": 0.06443987488746643, + "step": 180220 + }, + { + "epoch": 25.582682753726047, + "grad_norm": 0.014466472901403904, + "learning_rate": 7.442796309439319e-05, + "loss": 0.003911777958273887, + "step": 180230 + }, + { + "epoch": 25.584102200141945, + "grad_norm": 0.08852870762348175, + "learning_rate": 7.442654364797728e-05, + "loss": 0.004382487386465073, + "step": 180240 + }, + { + "epoch": 25.585521646557844, + "grad_norm": 9.406824111938477, + "learning_rate": 7.44251242015614e-05, + "loss": 0.0265102744102478, + "step": 180250 + }, + { + "epoch": 25.58694109297374, + "grad_norm": 4.033895969390869, + "learning_rate": 7.442370475514549e-05, + "loss": 0.06041900515556335, + "step": 180260 + }, + { + "epoch": 25.588360539389637, + "grad_norm": 13.346366882324219, + "learning_rate": 7.44222853087296e-05, + "loss": 0.007657216489315033, + "step": 180270 + }, + { + "epoch": 25.589779985805535, + "grad_norm": 0.02948656678199768, + "learning_rate": 7.44208658623137e-05, + "loss": 0.0252111554145813, + "step": 180280 + }, + { + "epoch": 25.591199432221433, + "grad_norm": 0.20635554194450378, + "learning_rate": 7.441944641589781e-05, + "loss": 0.006677997857332229, + "step": 180290 + }, + 
{ + "epoch": 25.59261887863733, + "grad_norm": 0.45468103885650635, + "learning_rate": 7.44180269694819e-05, + "loss": 0.004203520715236664, + "step": 180300 + }, + { + "epoch": 25.59403832505323, + "grad_norm": 0.4620322585105896, + "learning_rate": 7.441660752306601e-05, + "loss": 0.012938132882118225, + "step": 180310 + }, + { + "epoch": 25.59545777146913, + "grad_norm": 0.11011151969432831, + "learning_rate": 7.44151880766501e-05, + "loss": 0.015026769042015076, + "step": 180320 + }, + { + "epoch": 25.596877217885023, + "grad_norm": 8.943921089172363, + "learning_rate": 7.441376863023422e-05, + "loss": 0.015251556038856506, + "step": 180330 + }, + { + "epoch": 25.59829666430092, + "grad_norm": 0.5165340900421143, + "learning_rate": 7.441234918381833e-05, + "loss": 0.02986389398574829, + "step": 180340 + }, + { + "epoch": 25.59971611071682, + "grad_norm": 0.017073189839720726, + "learning_rate": 7.441092973740241e-05, + "loss": 0.02626275420188904, + "step": 180350 + }, + { + "epoch": 25.601135557132718, + "grad_norm": 0.1626952737569809, + "learning_rate": 7.440951029098652e-05, + "loss": 0.005844282731413841, + "step": 180360 + }, + { + "epoch": 25.602555003548616, + "grad_norm": 1.4537242650985718, + "learning_rate": 7.440809084457062e-05, + "loss": 0.016101941466331482, + "step": 180370 + }, + { + "epoch": 25.603974449964515, + "grad_norm": 7.548768520355225, + "learning_rate": 7.440667139815473e-05, + "loss": 0.043826824426651, + "step": 180380 + }, + { + "epoch": 25.605393896380413, + "grad_norm": 0.0704728439450264, + "learning_rate": 7.440525195173883e-05, + "loss": 0.0019751343876123427, + "step": 180390 + }, + { + "epoch": 25.606813342796308, + "grad_norm": 3.9258878231048584, + "learning_rate": 7.440383250532292e-05, + "loss": 0.031178992986679078, + "step": 180400 + }, + { + "epoch": 25.608232789212206, + "grad_norm": 0.34889426827430725, + "learning_rate": 7.440241305890702e-05, + "loss": 0.02299784719944, + "step": 180410 + }, + { + "epoch": 
25.609652235628104, + "grad_norm": 0.015762055292725563, + "learning_rate": 7.440099361249113e-05, + "loss": 0.00834675133228302, + "step": 180420 + }, + { + "epoch": 25.611071682044003, + "grad_norm": 0.05661742016673088, + "learning_rate": 7.439957416607524e-05, + "loss": 0.00292656235396862, + "step": 180430 + }, + { + "epoch": 25.6124911284599, + "grad_norm": 0.33740442991256714, + "learning_rate": 7.439815471965934e-05, + "loss": 0.004822145029902458, + "step": 180440 + }, + { + "epoch": 25.6139105748758, + "grad_norm": 0.02520233578979969, + "learning_rate": 7.439673527324344e-05, + "loss": 0.007170078158378601, + "step": 180450 + }, + { + "epoch": 25.615330021291697, + "grad_norm": 8.412757873535156, + "learning_rate": 7.439531582682754e-05, + "loss": 0.02769443392753601, + "step": 180460 + }, + { + "epoch": 25.616749467707596, + "grad_norm": 0.35972270369529724, + "learning_rate": 7.439389638041165e-05, + "loss": 0.0017675723880529403, + "step": 180470 + }, + { + "epoch": 25.61816891412349, + "grad_norm": 0.01401180773973465, + "learning_rate": 7.439247693399574e-05, + "loss": 0.008082438260316849, + "step": 180480 + }, + { + "epoch": 25.61958836053939, + "grad_norm": 0.3828575313091278, + "learning_rate": 7.439105748757985e-05, + "loss": 0.0029577065259218218, + "step": 180490 + }, + { + "epoch": 25.621007806955287, + "grad_norm": 10.930902481079102, + "learning_rate": 7.438963804116394e-05, + "loss": 0.009360193461179733, + "step": 180500 + }, + { + "epoch": 25.621007806955287, + "eval_accuracy": 0.9869015069625485, + "eval_loss": 0.04728792607784271, + "eval_runtime": 33.3698, + "eval_samples_per_second": 471.294, + "eval_steps_per_second": 14.744, + "step": 180500 + }, + { + "epoch": 25.622427253371185, + "grad_norm": 0.9593197703361511, + "learning_rate": 7.438821859474805e-05, + "loss": 0.009705141186714172, + "step": 180510 + }, + { + "epoch": 25.623846699787084, + "grad_norm": 0.6631242632865906, + "learning_rate": 7.438679914833216e-05, + "loss": 
0.0027312323451042174, + "step": 180520 + }, + { + "epoch": 25.625266146202982, + "grad_norm": 0.13193994760513306, + "learning_rate": 7.438537970191626e-05, + "loss": 0.045428845286369327, + "step": 180530 + }, + { + "epoch": 25.62668559261888, + "grad_norm": 0.048400361090898514, + "learning_rate": 7.438396025550037e-05, + "loss": 0.02116979956626892, + "step": 180540 + }, + { + "epoch": 25.628105039034775, + "grad_norm": 0.7989998459815979, + "learning_rate": 7.438254080908445e-05, + "loss": 0.010105837136507034, + "step": 180550 + }, + { + "epoch": 25.629524485450673, + "grad_norm": 0.028090141713619232, + "learning_rate": 7.438112136266856e-05, + "loss": 0.002611011266708374, + "step": 180560 + }, + { + "epoch": 25.63094393186657, + "grad_norm": 21.0240535736084, + "learning_rate": 7.437970191625266e-05, + "loss": 0.03965606689453125, + "step": 180570 + }, + { + "epoch": 25.63236337828247, + "grad_norm": 0.04208636283874512, + "learning_rate": 7.437828246983677e-05, + "loss": 0.03935863971710205, + "step": 180580 + }, + { + "epoch": 25.63378282469837, + "grad_norm": 0.6098654270172119, + "learning_rate": 7.437686302342087e-05, + "loss": 0.012244017422199249, + "step": 180590 + }, + { + "epoch": 25.635202271114267, + "grad_norm": 7.909515857696533, + "learning_rate": 7.437544357700497e-05, + "loss": 0.05155755281448364, + "step": 180600 + }, + { + "epoch": 25.636621717530165, + "grad_norm": 0.08482035994529724, + "learning_rate": 7.437402413058906e-05, + "loss": 0.012521453201770782, + "step": 180610 + }, + { + "epoch": 25.63804116394606, + "grad_norm": 5.045107364654541, + "learning_rate": 7.437260468417317e-05, + "loss": 0.01685637831687927, + "step": 180620 + }, + { + "epoch": 25.639460610361958, + "grad_norm": 9.797825813293457, + "learning_rate": 7.437118523775729e-05, + "loss": 0.07041789889335633, + "step": 180630 + }, + { + "epoch": 25.640880056777856, + "grad_norm": 2.280484914779663, + "learning_rate": 7.436976579134138e-05, + "loss": 
0.018373827636241912, + "step": 180640 + }, + { + "epoch": 25.642299503193755, + "grad_norm": 0.0426308773458004, + "learning_rate": 7.43683463449255e-05, + "loss": 0.009015412628650665, + "step": 180650 + }, + { + "epoch": 25.643718949609653, + "grad_norm": 0.1349436342716217, + "learning_rate": 7.436692689850958e-05, + "loss": 0.013774242997169495, + "step": 180660 + }, + { + "epoch": 25.64513839602555, + "grad_norm": 8.831050872802734, + "learning_rate": 7.436550745209369e-05, + "loss": 0.012438704073429108, + "step": 180670 + }, + { + "epoch": 25.64655784244145, + "grad_norm": 0.6139320135116577, + "learning_rate": 7.436408800567779e-05, + "loss": 0.01155378445982933, + "step": 180680 + }, + { + "epoch": 25.647977288857344, + "grad_norm": 0.18683241307735443, + "learning_rate": 7.43626685592619e-05, + "loss": 0.017727547883987428, + "step": 180690 + }, + { + "epoch": 25.649396735273243, + "grad_norm": 5.657413482666016, + "learning_rate": 7.4361249112846e-05, + "loss": 0.00654677152633667, + "step": 180700 + }, + { + "epoch": 25.65081618168914, + "grad_norm": 0.024630235508084297, + "learning_rate": 7.435982966643009e-05, + "loss": 0.004772892594337464, + "step": 180710 + }, + { + "epoch": 25.65223562810504, + "grad_norm": 0.10830465704202652, + "learning_rate": 7.43584102200142e-05, + "loss": 0.04357871413230896, + "step": 180720 + }, + { + "epoch": 25.653655074520938, + "grad_norm": 1.2649155855178833, + "learning_rate": 7.43569907735983e-05, + "loss": 0.0218611478805542, + "step": 180730 + }, + { + "epoch": 25.655074520936836, + "grad_norm": 10.143715858459473, + "learning_rate": 7.435557132718241e-05, + "loss": 0.031734701991081235, + "step": 180740 + }, + { + "epoch": 25.656493967352734, + "grad_norm": 0.24694842100143433, + "learning_rate": 7.435415188076651e-05, + "loss": 0.01450224220752716, + "step": 180750 + }, + { + "epoch": 25.65791341376863, + "grad_norm": 5.240409851074219, + "learning_rate": 7.43527324343506e-05, + "loss": 0.011673653125762939, + 
"step": 180760 + }, + { + "epoch": 25.659332860184527, + "grad_norm": 0.538646936416626, + "learning_rate": 7.43513129879347e-05, + "loss": 0.03488227725028992, + "step": 180770 + }, + { + "epoch": 25.660752306600425, + "grad_norm": 0.02933654375374317, + "learning_rate": 7.434989354151881e-05, + "loss": 0.023148712515830994, + "step": 180780 + }, + { + "epoch": 25.662171753016324, + "grad_norm": 0.1445612609386444, + "learning_rate": 7.434847409510291e-05, + "loss": 0.070120769739151, + "step": 180790 + }, + { + "epoch": 25.663591199432222, + "grad_norm": 4.232518672943115, + "learning_rate": 7.434705464868702e-05, + "loss": 0.016361749172210692, + "step": 180800 + }, + { + "epoch": 25.66501064584812, + "grad_norm": 0.08076496422290802, + "learning_rate": 7.434563520227112e-05, + "loss": 0.002158990129828453, + "step": 180810 + }, + { + "epoch": 25.66643009226402, + "grad_norm": 0.0426332950592041, + "learning_rate": 7.434421575585522e-05, + "loss": 0.014573234319686889, + "step": 180820 + }, + { + "epoch": 25.667849538679913, + "grad_norm": 0.874748170375824, + "learning_rate": 7.434279630943933e-05, + "loss": 0.01890096366405487, + "step": 180830 + }, + { + "epoch": 25.669268985095812, + "grad_norm": 5.259952545166016, + "learning_rate": 7.434137686302343e-05, + "loss": 0.017324069142341615, + "step": 180840 + }, + { + "epoch": 25.67068843151171, + "grad_norm": 0.010029159486293793, + "learning_rate": 7.433995741660754e-05, + "loss": 0.005403413623571396, + "step": 180850 + }, + { + "epoch": 25.67210787792761, + "grad_norm": 0.03455464541912079, + "learning_rate": 7.433853797019162e-05, + "loss": 0.01387394666671753, + "step": 180860 + }, + { + "epoch": 25.673527324343507, + "grad_norm": 2.7934722900390625, + "learning_rate": 7.433711852377573e-05, + "loss": 0.01271732896566391, + "step": 180870 + }, + { + "epoch": 25.674946770759405, + "grad_norm": 0.24656261503696442, + "learning_rate": 7.433569907735983e-05, + "loss": 0.014781039953231812, + "step": 180880 + 
}, + { + "epoch": 25.676366217175303, + "grad_norm": 0.3367293179035187, + "learning_rate": 7.433427963094394e-05, + "loss": 0.013533301651477814, + "step": 180890 + }, + { + "epoch": 25.677785663591198, + "grad_norm": 3.874493360519409, + "learning_rate": 7.433286018452804e-05, + "loss": 0.01466352939605713, + "step": 180900 + }, + { + "epoch": 25.679205110007096, + "grad_norm": 1.8236688375473022, + "learning_rate": 7.433144073811213e-05, + "loss": 0.003141341730952263, + "step": 180910 + }, + { + "epoch": 25.680624556422995, + "grad_norm": 0.01918906904757023, + "learning_rate": 7.433002129169625e-05, + "loss": 0.017547209560871125, + "step": 180920 + }, + { + "epoch": 25.682044002838893, + "grad_norm": 0.6680472493171692, + "learning_rate": 7.432860184528034e-05, + "loss": 0.006016036868095398, + "step": 180930 + }, + { + "epoch": 25.68346344925479, + "grad_norm": 19.02199935913086, + "learning_rate": 7.432718239886445e-05, + "loss": 0.04733777642250061, + "step": 180940 + }, + { + "epoch": 25.68488289567069, + "grad_norm": 0.06071118265390396, + "learning_rate": 7.432576295244855e-05, + "loss": 0.022424611449241637, + "step": 180950 + }, + { + "epoch": 25.686302342086588, + "grad_norm": 0.01818894036114216, + "learning_rate": 7.432434350603265e-05, + "loss": 0.007296357303857803, + "step": 180960 + }, + { + "epoch": 25.687721788502483, + "grad_norm": 0.0475507490336895, + "learning_rate": 7.432292405961675e-05, + "loss": 0.002873092144727707, + "step": 180970 + }, + { + "epoch": 25.68914123491838, + "grad_norm": 0.2386280745267868, + "learning_rate": 7.432150461320086e-05, + "loss": 0.004392069950699806, + "step": 180980 + }, + { + "epoch": 25.69056068133428, + "grad_norm": 0.12866981327533722, + "learning_rate": 7.432008516678495e-05, + "loss": 0.0303143709897995, + "step": 180990 + }, + { + "epoch": 25.691980127750178, + "grad_norm": 0.019492391496896744, + "learning_rate": 7.431866572036906e-05, + "loss": 0.022767136991024017, + "step": 181000 + }, + { + 
"epoch": 25.691980127750178, + "eval_accuracy": 0.9900807528454251, + "eval_loss": 0.047881025820970535, + "eval_runtime": 33.9911, + "eval_samples_per_second": 462.679, + "eval_steps_per_second": 14.474, + "step": 181000 + }, + { + "epoch": 25.693399574166076, + "grad_norm": 12.624052047729492, + "learning_rate": 7.431724627395316e-05, + "loss": 0.03521527945995331, + "step": 181010 + }, + { + "epoch": 25.694819020581974, + "grad_norm": 0.004312537144869566, + "learning_rate": 7.431582682753726e-05, + "loss": 0.01153850108385086, + "step": 181020 + }, + { + "epoch": 25.696238466997873, + "grad_norm": 0.18245014548301697, + "learning_rate": 7.431440738112137e-05, + "loss": 0.031166958808898925, + "step": 181030 + }, + { + "epoch": 25.697657913413767, + "grad_norm": 10.196966171264648, + "learning_rate": 7.431298793470547e-05, + "loss": 0.007751426100730896, + "step": 181040 + }, + { + "epoch": 25.699077359829666, + "grad_norm": 0.019032716751098633, + "learning_rate": 7.431156848828958e-05, + "loss": 0.037809702754020694, + "step": 181050 + }, + { + "epoch": 25.700496806245564, + "grad_norm": 1.7837401628494263, + "learning_rate": 7.431014904187368e-05, + "loss": 0.0027151618152856825, + "step": 181060 + }, + { + "epoch": 25.701916252661462, + "grad_norm": 0.008473812602460384, + "learning_rate": 7.430872959545777e-05, + "loss": 0.008484746515750884, + "step": 181070 + }, + { + "epoch": 25.70333569907736, + "grad_norm": 0.009824814274907112, + "learning_rate": 7.430731014904187e-05, + "loss": 0.007057398557662964, + "step": 181080 + }, + { + "epoch": 25.70475514549326, + "grad_norm": 0.2679128646850586, + "learning_rate": 7.430589070262598e-05, + "loss": 0.007718280702829361, + "step": 181090 + }, + { + "epoch": 25.706174591909157, + "grad_norm": 6.387024402618408, + "learning_rate": 7.430447125621008e-05, + "loss": 0.018606492877006532, + "step": 181100 + }, + { + "epoch": 25.707594038325052, + "grad_norm": 0.39925387501716614, + "learning_rate": 
7.430305180979419e-05, + "loss": 0.006041895225644111, + "step": 181110 + }, + { + "epoch": 25.70901348474095, + "grad_norm": 0.8825147151947021, + "learning_rate": 7.430163236337829e-05, + "loss": 0.005907560512423515, + "step": 181120 + }, + { + "epoch": 25.71043293115685, + "grad_norm": 0.06370214372873306, + "learning_rate": 7.430021291696238e-05, + "loss": 0.019082184135913848, + "step": 181130 + }, + { + "epoch": 25.711852377572747, + "grad_norm": 12.338886260986328, + "learning_rate": 7.42987934705465e-05, + "loss": 0.025133752822875978, + "step": 181140 + }, + { + "epoch": 25.713271823988645, + "grad_norm": 2.075573682785034, + "learning_rate": 7.429737402413059e-05, + "loss": 0.06179547309875488, + "step": 181150 + }, + { + "epoch": 25.714691270404543, + "grad_norm": 0.38068199157714844, + "learning_rate": 7.42959545777147e-05, + "loss": 0.1190909504890442, + "step": 181160 + }, + { + "epoch": 25.71611071682044, + "grad_norm": 0.18170884251594543, + "learning_rate": 7.429453513129879e-05, + "loss": 0.0020603276789188385, + "step": 181170 + }, + { + "epoch": 25.717530163236336, + "grad_norm": 0.08044267445802689, + "learning_rate": 7.42931156848829e-05, + "loss": 0.008223888278007508, + "step": 181180 + }, + { + "epoch": 25.718949609652235, + "grad_norm": 0.026301411911845207, + "learning_rate": 7.4291696238467e-05, + "loss": 0.03885988295078278, + "step": 181190 + }, + { + "epoch": 25.720369056068133, + "grad_norm": 0.09079450368881226, + "learning_rate": 7.429027679205111e-05, + "loss": 0.009325295686721802, + "step": 181200 + }, + { + "epoch": 25.72178850248403, + "grad_norm": 0.03157563880085945, + "learning_rate": 7.42888573456352e-05, + "loss": 0.023444196581840514, + "step": 181210 + }, + { + "epoch": 25.72320794889993, + "grad_norm": 1.9255449771881104, + "learning_rate": 7.42874378992193e-05, + "loss": 0.0023035652935504912, + "step": 181220 + }, + { + "epoch": 25.724627395315828, + "grad_norm": 4.451763153076172, + "learning_rate": 
7.428601845280341e-05, + "loss": 0.027551031112670897, + "step": 181230 + }, + { + "epoch": 25.726046841731726, + "grad_norm": 0.018033504486083984, + "learning_rate": 7.428459900638751e-05, + "loss": 0.02980167865753174, + "step": 181240 + }, + { + "epoch": 25.72746628814762, + "grad_norm": 1.287234902381897, + "learning_rate": 7.428317955997162e-05, + "loss": 0.0036415129899978636, + "step": 181250 + }, + { + "epoch": 25.72888573456352, + "grad_norm": 0.0843605101108551, + "learning_rate": 7.428176011355572e-05, + "loss": 0.01205001473426819, + "step": 181260 + }, + { + "epoch": 25.730305180979418, + "grad_norm": 0.11676943302154541, + "learning_rate": 7.428034066713982e-05, + "loss": 0.00231565497815609, + "step": 181270 + }, + { + "epoch": 25.731724627395316, + "grad_norm": 8.089485168457031, + "learning_rate": 7.427892122072391e-05, + "loss": 0.012157589197158813, + "step": 181280 + }, + { + "epoch": 25.733144073811214, + "grad_norm": 0.030331209301948547, + "learning_rate": 7.427750177430802e-05, + "loss": 0.029729142785072327, + "step": 181290 + }, + { + "epoch": 25.734563520227113, + "grad_norm": 0.23350068926811218, + "learning_rate": 7.427608232789212e-05, + "loss": 0.0109323650598526, + "step": 181300 + }, + { + "epoch": 25.73598296664301, + "grad_norm": 0.8193116188049316, + "learning_rate": 7.427466288147623e-05, + "loss": 0.005194208025932312, + "step": 181310 + }, + { + "epoch": 25.737402413058906, + "grad_norm": 0.11574787646532059, + "learning_rate": 7.427324343506033e-05, + "loss": 0.024942012131214143, + "step": 181320 + }, + { + "epoch": 25.738821859474804, + "grad_norm": 0.19333761930465698, + "learning_rate": 7.427182398864443e-05, + "loss": 0.01087513342499733, + "step": 181330 + }, + { + "epoch": 25.740241305890702, + "grad_norm": 0.016540123149752617, + "learning_rate": 7.427040454222854e-05, + "loss": 0.005305198207497597, + "step": 181340 + }, + { + "epoch": 25.7416607523066, + "grad_norm": 1.326748013496399, + "learning_rate": 
7.426898509581264e-05, + "loss": 0.012491590529680251, + "step": 181350 + }, + { + "epoch": 25.7430801987225, + "grad_norm": 0.11618193238973618, + "learning_rate": 7.426756564939675e-05, + "loss": 0.00229722261428833, + "step": 181360 + }, + { + "epoch": 25.744499645138397, + "grad_norm": 0.22436931729316711, + "learning_rate": 7.426614620298084e-05, + "loss": 0.004848874732851982, + "step": 181370 + }, + { + "epoch": 25.745919091554295, + "grad_norm": 3.9946205615997314, + "learning_rate": 7.426472675656494e-05, + "loss": 0.03293180167675018, + "step": 181380 + }, + { + "epoch": 25.74733853797019, + "grad_norm": 0.027193017303943634, + "learning_rate": 7.426330731014904e-05, + "loss": 0.010988382995128632, + "step": 181390 + }, + { + "epoch": 25.74875798438609, + "grad_norm": 14.060412406921387, + "learning_rate": 7.426188786373315e-05, + "loss": 0.04175468385219574, + "step": 181400 + }, + { + "epoch": 25.750177430801987, + "grad_norm": 7.672267436981201, + "learning_rate": 7.426046841731725e-05, + "loss": 0.009236945956945419, + "step": 181410 + }, + { + "epoch": 25.751596877217885, + "grad_norm": 0.00839468464255333, + "learning_rate": 7.425904897090136e-05, + "loss": 0.002652636915445328, + "step": 181420 + }, + { + "epoch": 25.753016323633783, + "grad_norm": 8.264179229736328, + "learning_rate": 7.425762952448546e-05, + "loss": 0.013533377647399902, + "step": 181430 + }, + { + "epoch": 25.75443577004968, + "grad_norm": 0.1594119817018509, + "learning_rate": 7.425621007806955e-05, + "loss": 0.002218455821275711, + "step": 181440 + }, + { + "epoch": 25.75585521646558, + "grad_norm": 0.23191586136817932, + "learning_rate": 7.425479063165366e-05, + "loss": 0.00816868245601654, + "step": 181450 + }, + { + "epoch": 25.757274662881475, + "grad_norm": 0.27516797184944153, + "learning_rate": 7.425337118523776e-05, + "loss": 0.01348232924938202, + "step": 181460 + }, + { + "epoch": 25.758694109297373, + "grad_norm": 0.3133617639541626, + "learning_rate": 
7.425195173882187e-05, + "loss": 0.003851490095257759, + "step": 181470 + }, + { + "epoch": 25.76011355571327, + "grad_norm": 0.24817009270191193, + "learning_rate": 7.425053229240596e-05, + "loss": 0.0055272731930017475, + "step": 181480 + }, + { + "epoch": 25.76153300212917, + "grad_norm": 1.715070366859436, + "learning_rate": 7.424911284599007e-05, + "loss": 0.023049409687519073, + "step": 181490 + }, + { + "epoch": 25.762952448545068, + "grad_norm": 0.5812352299690247, + "learning_rate": 7.424769339957416e-05, + "loss": 0.0265667587518692, + "step": 181500 + }, + { + "epoch": 25.762952448545068, + "eval_accuracy": 0.9859477331976855, + "eval_loss": 0.06103920564055443, + "eval_runtime": 34.1488, + "eval_samples_per_second": 460.543, + "eval_steps_per_second": 14.408, + "step": 181500 + }, + { + "epoch": 25.764371894960966, + "grad_norm": 0.5625430345535278, + "learning_rate": 7.424627395315827e-05, + "loss": 0.01948379576206207, + "step": 181510 + }, + { + "epoch": 25.765791341376865, + "grad_norm": 8.295649528503418, + "learning_rate": 7.424485450674237e-05, + "loss": 0.024756273627281188, + "step": 181520 + }, + { + "epoch": 25.76721078779276, + "grad_norm": 1.0632835626602173, + "learning_rate": 7.424343506032647e-05, + "loss": 0.013222813606262207, + "step": 181530 + }, + { + "epoch": 25.768630234208658, + "grad_norm": 18.29619789123535, + "learning_rate": 7.424201561391058e-05, + "loss": 0.016410768032073975, + "step": 181540 + }, + { + "epoch": 25.770049680624556, + "grad_norm": 0.026612289249897003, + "learning_rate": 7.424059616749468e-05, + "loss": 0.010223998874425887, + "step": 181550 + }, + { + "epoch": 25.771469127040454, + "grad_norm": 0.6846365928649902, + "learning_rate": 7.423917672107879e-05, + "loss": 0.02377713918685913, + "step": 181560 + }, + { + "epoch": 25.772888573456353, + "grad_norm": 8.354463577270508, + "learning_rate": 7.423775727466289e-05, + "loss": 0.0070688709616661075, + "step": 181570 + }, + { + "epoch": 25.77430801987225, + 
"grad_norm": 0.025825118646025658, + "learning_rate": 7.423633782824698e-05, + "loss": 0.0030197981745004655, + "step": 181580 + }, + { + "epoch": 25.77572746628815, + "grad_norm": 0.17644797265529633, + "learning_rate": 7.423491838183108e-05, + "loss": 0.021408480405807496, + "step": 181590 + }, + { + "epoch": 25.777146912704044, + "grad_norm": 0.0684879943728447, + "learning_rate": 7.423349893541519e-05, + "loss": 0.005004516616463661, + "step": 181600 + }, + { + "epoch": 25.778566359119942, + "grad_norm": 5.3847808837890625, + "learning_rate": 7.423207948899929e-05, + "loss": 0.03910474479198456, + "step": 181610 + }, + { + "epoch": 25.77998580553584, + "grad_norm": 0.05086629092693329, + "learning_rate": 7.42306600425834e-05, + "loss": 0.026414209604263307, + "step": 181620 + }, + { + "epoch": 25.78140525195174, + "grad_norm": 0.527366578578949, + "learning_rate": 7.42292405961675e-05, + "loss": 0.041139551997184755, + "step": 181630 + }, + { + "epoch": 25.782824698367637, + "grad_norm": 0.014872657135128975, + "learning_rate": 7.42278211497516e-05, + "loss": 0.0036502879112958906, + "step": 181640 + }, + { + "epoch": 25.784244144783536, + "grad_norm": 0.12031950056552887, + "learning_rate": 7.42264017033357e-05, + "loss": 0.002989385277032852, + "step": 181650 + }, + { + "epoch": 25.785663591199434, + "grad_norm": 0.23793786764144897, + "learning_rate": 7.42249822569198e-05, + "loss": 0.018382728099822998, + "step": 181660 + }, + { + "epoch": 25.78708303761533, + "grad_norm": 0.002070126123726368, + "learning_rate": 7.422356281050391e-05, + "loss": 0.03031228482723236, + "step": 181670 + }, + { + "epoch": 25.788502484031227, + "grad_norm": 3.0319528579711914, + "learning_rate": 7.4222143364088e-05, + "loss": 0.0161630779504776, + "step": 181680 + }, + { + "epoch": 25.789921930447125, + "grad_norm": 5.935647964477539, + "learning_rate": 7.422072391767211e-05, + "loss": 0.006852465867996216, + "step": 181690 + }, + { + "epoch": 25.791341376863024, + "grad_norm": 
0.4345623850822449, + "learning_rate": 7.42193044712562e-05, + "loss": 0.0014202848076820373, + "step": 181700 + }, + { + "epoch": 25.792760823278922, + "grad_norm": 3.4075217247009277, + "learning_rate": 7.421788502484032e-05, + "loss": 0.015714436769485474, + "step": 181710 + }, + { + "epoch": 25.79418026969482, + "grad_norm": 0.11642465740442276, + "learning_rate": 7.421646557842441e-05, + "loss": 0.018615080416202544, + "step": 181720 + }, + { + "epoch": 25.79559971611072, + "grad_norm": 0.09087579697370529, + "learning_rate": 7.421504613200853e-05, + "loss": 0.005982545390725136, + "step": 181730 + }, + { + "epoch": 25.797019162526613, + "grad_norm": 0.6385992765426636, + "learning_rate": 7.421362668559262e-05, + "loss": 0.029961612820625306, + "step": 181740 + }, + { + "epoch": 25.79843860894251, + "grad_norm": 0.009608929045498371, + "learning_rate": 7.421220723917672e-05, + "loss": 0.004801873490214348, + "step": 181750 + }, + { + "epoch": 25.79985805535841, + "grad_norm": 0.133661687374115, + "learning_rate": 7.421078779276083e-05, + "loss": 0.023289982974529267, + "step": 181760 + }, + { + "epoch": 25.801277501774308, + "grad_norm": 0.06418689340353012, + "learning_rate": 7.420936834634493e-05, + "loss": 0.03108038604259491, + "step": 181770 + }, + { + "epoch": 25.802696948190206, + "grad_norm": 1.272254467010498, + "learning_rate": 7.420794889992904e-05, + "loss": 0.002033423259854317, + "step": 181780 + }, + { + "epoch": 25.804116394606105, + "grad_norm": 2.013504981994629, + "learning_rate": 7.420652945351312e-05, + "loss": 0.008599017560482026, + "step": 181790 + }, + { + "epoch": 25.805535841022003, + "grad_norm": 4.352603912353516, + "learning_rate": 7.420511000709723e-05, + "loss": 0.012008067965507508, + "step": 181800 + }, + { + "epoch": 25.806955287437898, + "grad_norm": 5.6481828689575195, + "learning_rate": 7.420369056068133e-05, + "loss": 0.00958443284034729, + "step": 181810 + }, + { + "epoch": 25.808374733853796, + "grad_norm": 
19.291791915893555, + "learning_rate": 7.420227111426544e-05, + "loss": 0.032294619083404544, + "step": 181820 + }, + { + "epoch": 25.809794180269694, + "grad_norm": 4.569094657897949, + "learning_rate": 7.420085166784955e-05, + "loss": 0.025536668300628663, + "step": 181830 + }, + { + "epoch": 25.811213626685593, + "grad_norm": 2.405461072921753, + "learning_rate": 7.419943222143364e-05, + "loss": 0.05030026435852051, + "step": 181840 + }, + { + "epoch": 25.81263307310149, + "grad_norm": 0.012625168077647686, + "learning_rate": 7.419801277501775e-05, + "loss": 0.04071837067604065, + "step": 181850 + }, + { + "epoch": 25.81405251951739, + "grad_norm": 10.540587425231934, + "learning_rate": 7.419659332860185e-05, + "loss": 0.08673339486122131, + "step": 181860 + }, + { + "epoch": 25.815471965933288, + "grad_norm": 0.048401083797216415, + "learning_rate": 7.419517388218596e-05, + "loss": 0.0024458322674036025, + "step": 181870 + }, + { + "epoch": 25.816891412349182, + "grad_norm": 5.498880863189697, + "learning_rate": 7.419375443577005e-05, + "loss": 0.02448984086513519, + "step": 181880 + }, + { + "epoch": 25.81831085876508, + "grad_norm": 0.8367999196052551, + "learning_rate": 7.419233498935415e-05, + "loss": 0.009736502170562744, + "step": 181890 + }, + { + "epoch": 25.81973030518098, + "grad_norm": 0.08134724199771881, + "learning_rate": 7.419091554293825e-05, + "loss": 0.01037132740020752, + "step": 181900 + }, + { + "epoch": 25.821149751596877, + "grad_norm": 0.6184737086296082, + "learning_rate": 7.418949609652236e-05, + "loss": 0.0044671095907688144, + "step": 181910 + }, + { + "epoch": 25.822569198012776, + "grad_norm": 0.033824946731328964, + "learning_rate": 7.418807665010647e-05, + "loss": 0.01291331946849823, + "step": 181920 + }, + { + "epoch": 25.823988644428674, + "grad_norm": 0.07523380219936371, + "learning_rate": 7.418665720369057e-05, + "loss": 0.006218673288822174, + "step": 181930 + }, + { + "epoch": 25.825408090844572, + "grad_norm": 
1.667096734046936, + "learning_rate": 7.418523775727467e-05, + "loss": 0.03440771996974945, + "step": 181940 + }, + { + "epoch": 25.826827537260467, + "grad_norm": 0.051069483160972595, + "learning_rate": 7.418381831085876e-05, + "loss": 0.015532281994819642, + "step": 181950 + }, + { + "epoch": 25.828246983676365, + "grad_norm": 0.10860639065504074, + "learning_rate": 7.418239886444287e-05, + "loss": 0.003501966968178749, + "step": 181960 + }, + { + "epoch": 25.829666430092264, + "grad_norm": 0.27826717495918274, + "learning_rate": 7.418097941802697e-05, + "loss": 0.025777608156204224, + "step": 181970 + }, + { + "epoch": 25.831085876508162, + "grad_norm": 14.952784538269043, + "learning_rate": 7.417955997161108e-05, + "loss": 0.042685702443122864, + "step": 181980 + }, + { + "epoch": 25.83250532292406, + "grad_norm": 0.9453990459442139, + "learning_rate": 7.417814052519517e-05, + "loss": 0.0011568672955036164, + "step": 181990 + }, + { + "epoch": 25.83392476933996, + "grad_norm": 6.420475482940674, + "learning_rate": 7.417672107877928e-05, + "loss": 0.015341141819953918, + "step": 182000 + }, + { + "epoch": 25.83392476933996, + "eval_accuracy": 0.9835950912443568, + "eval_loss": 0.07418441027402878, + "eval_runtime": 33.6139, + "eval_samples_per_second": 467.872, + "eval_steps_per_second": 14.637, + "step": 182000 + }, + { + "epoch": 25.835344215755857, + "grad_norm": 2.797924518585205, + "learning_rate": 7.417530163236339e-05, + "loss": 0.015860782563686372, + "step": 182010 + }, + { + "epoch": 25.83676366217175, + "grad_norm": 12.78602123260498, + "learning_rate": 7.417388218594748e-05, + "loss": 0.03180122971534729, + "step": 182020 + }, + { + "epoch": 25.83818310858765, + "grad_norm": 0.11210576444864273, + "learning_rate": 7.41724627395316e-05, + "loss": 0.0043340839445590975, + "step": 182030 + }, + { + "epoch": 25.839602555003548, + "grad_norm": 0.06072605028748512, + "learning_rate": 7.417104329311568e-05, + "loss": 0.0072499319911003115, + "step": 182040 
+ }, + { + "epoch": 25.841022001419446, + "grad_norm": 0.026019198819994926, + "learning_rate": 7.416962384669979e-05, + "loss": 0.003016340360045433, + "step": 182050 + }, + { + "epoch": 25.842441447835345, + "grad_norm": 2.0016305446624756, + "learning_rate": 7.416820440028389e-05, + "loss": 0.12738150358200073, + "step": 182060 + }, + { + "epoch": 25.843860894251243, + "grad_norm": 0.09400361776351929, + "learning_rate": 7.4166784953868e-05, + "loss": 0.03425784707069397, + "step": 182070 + }, + { + "epoch": 25.84528034066714, + "grad_norm": 0.023346692323684692, + "learning_rate": 7.41653655074521e-05, + "loss": 0.03787416517734528, + "step": 182080 + }, + { + "epoch": 25.846699787083036, + "grad_norm": 0.8512740731239319, + "learning_rate": 7.416394606103621e-05, + "loss": 0.0012184519320726395, + "step": 182090 + }, + { + "epoch": 25.848119233498934, + "grad_norm": 1.0517945289611816, + "learning_rate": 7.41625266146203e-05, + "loss": 0.0423381119966507, + "step": 182100 + }, + { + "epoch": 25.849538679914833, + "grad_norm": 0.06989353150129318, + "learning_rate": 7.41611071682044e-05, + "loss": 0.028103455901145935, + "step": 182110 + }, + { + "epoch": 25.85095812633073, + "grad_norm": 0.30882710218429565, + "learning_rate": 7.415968772178851e-05, + "loss": 0.001858191192150116, + "step": 182120 + }, + { + "epoch": 25.85237757274663, + "grad_norm": 0.03753085806965828, + "learning_rate": 7.415826827537261e-05, + "loss": 0.006228188425302506, + "step": 182130 + }, + { + "epoch": 25.853797019162528, + "grad_norm": 4.320182800292969, + "learning_rate": 7.415684882895672e-05, + "loss": 0.0033666577190160753, + "step": 182140 + }, + { + "epoch": 25.855216465578426, + "grad_norm": 2.625394344329834, + "learning_rate": 7.41554293825408e-05, + "loss": 0.0159807026386261, + "step": 182150 + }, + { + "epoch": 25.85663591199432, + "grad_norm": 4.34943962097168, + "learning_rate": 7.415400993612492e-05, + "loss": 0.030489757657051086, + "step": 182160 + }, + { + 
"epoch": 25.85805535841022, + "grad_norm": 0.007747141644358635, + "learning_rate": 7.415259048970901e-05, + "loss": 0.013479173183441162, + "step": 182170 + }, + { + "epoch": 25.859474804826117, + "grad_norm": 0.05320015549659729, + "learning_rate": 7.415117104329312e-05, + "loss": 0.008832027018070222, + "step": 182180 + }, + { + "epoch": 25.860894251242016, + "grad_norm": 0.7455971837043762, + "learning_rate": 7.414989354151881e-05, + "loss": 0.021108701825141907, + "step": 182190 + }, + { + "epoch": 25.862313697657914, + "grad_norm": 0.04624160751700401, + "learning_rate": 7.414847409510292e-05, + "loss": 0.02266530990600586, + "step": 182200 + }, + { + "epoch": 25.863733144073812, + "grad_norm": 0.01805158704519272, + "learning_rate": 7.414705464868702e-05, + "loss": 0.020498384535312653, + "step": 182210 + }, + { + "epoch": 25.86515259048971, + "grad_norm": 0.2062365561723709, + "learning_rate": 7.414563520227111e-05, + "loss": 0.028120845556259155, + "step": 182220 + }, + { + "epoch": 25.866572036905605, + "grad_norm": 0.06060515344142914, + "learning_rate": 7.414421575585521e-05, + "loss": 0.025897637009620667, + "step": 182230 + }, + { + "epoch": 25.867991483321504, + "grad_norm": 0.4364734888076782, + "learning_rate": 7.414279630943932e-05, + "loss": 0.04351780414581299, + "step": 182240 + }, + { + "epoch": 25.869410929737402, + "grad_norm": 0.44115567207336426, + "learning_rate": 7.414137686302342e-05, + "loss": 0.047239115834236144, + "step": 182250 + }, + { + "epoch": 25.8708303761533, + "grad_norm": 1.355193018913269, + "learning_rate": 7.413995741660753e-05, + "loss": 0.011905653774738312, + "step": 182260 + }, + { + "epoch": 25.8722498225692, + "grad_norm": 2.049482822418213, + "learning_rate": 7.413853797019163e-05, + "loss": 0.009963244199752808, + "step": 182270 + }, + { + "epoch": 25.873669268985097, + "grad_norm": 0.39308401942253113, + "learning_rate": 7.413711852377573e-05, + "loss": 0.027703410387039183, + "step": 182280 + }, + { + "epoch": 
25.875088715400995, + "grad_norm": 0.5474950075149536, + "learning_rate": 7.413569907735984e-05, + "loss": 0.0008932899683713913, + "step": 182290 + }, + { + "epoch": 25.87650816181689, + "grad_norm": 13.972710609436035, + "learning_rate": 7.413427963094393e-05, + "loss": 0.03018895983695984, + "step": 182300 + }, + { + "epoch": 25.87792760823279, + "grad_norm": 0.24021422863006592, + "learning_rate": 7.413286018452805e-05, + "loss": 0.029331699013710022, + "step": 182310 + }, + { + "epoch": 25.879347054648687, + "grad_norm": 0.9731734395027161, + "learning_rate": 7.413144073811213e-05, + "loss": 0.025038081407546996, + "step": 182320 + }, + { + "epoch": 25.880766501064585, + "grad_norm": 0.13746650516986847, + "learning_rate": 7.413002129169624e-05, + "loss": 0.018882088363170624, + "step": 182330 + }, + { + "epoch": 25.882185947480483, + "grad_norm": 0.025698335841298103, + "learning_rate": 7.412860184528034e-05, + "loss": 0.0020218852907419203, + "step": 182340 + }, + { + "epoch": 25.88360539389638, + "grad_norm": 0.9271196126937866, + "learning_rate": 7.412718239886445e-05, + "loss": 0.03542667925357819, + "step": 182350 + }, + { + "epoch": 25.88502484031228, + "grad_norm": 1.4351253509521484, + "learning_rate": 7.412576295244855e-05, + "loss": 0.029222682118415833, + "step": 182360 + }, + { + "epoch": 25.886444286728175, + "grad_norm": 0.032461345195770264, + "learning_rate": 7.412434350603266e-05, + "loss": 0.035482007265090945, + "step": 182370 + }, + { + "epoch": 25.887863733144073, + "grad_norm": 12.990357398986816, + "learning_rate": 7.412292405961675e-05, + "loss": 0.03822300732135773, + "step": 182380 + }, + { + "epoch": 25.88928317955997, + "grad_norm": 0.012165614403784275, + "learning_rate": 7.412150461320085e-05, + "loss": 0.0019414242357015609, + "step": 182390 + }, + { + "epoch": 25.89070262597587, + "grad_norm": 1.1320135593414307, + "learning_rate": 7.412008516678496e-05, + "loss": 0.020562881231307985, + "step": 182400 + }, + { + "epoch": 
25.892122072391768, + "grad_norm": 0.07621171325445175, + "learning_rate": 7.411866572036906e-05, + "loss": 0.011768876761198043, + "step": 182410 + }, + { + "epoch": 25.893541518807666, + "grad_norm": 0.24478782713413239, + "learning_rate": 7.411724627395317e-05, + "loss": 0.03588279485702515, + "step": 182420 + }, + { + "epoch": 25.894960965223564, + "grad_norm": 0.023743903264403343, + "learning_rate": 7.411582682753725e-05, + "loss": 0.0025848109275102616, + "step": 182430 + }, + { + "epoch": 25.89638041163946, + "grad_norm": 9.931859970092773, + "learning_rate": 7.411440738112137e-05, + "loss": 0.015128797292709351, + "step": 182440 + }, + { + "epoch": 25.897799858055357, + "grad_norm": 2.1261560916900635, + "learning_rate": 7.411298793470546e-05, + "loss": 0.004695490747690201, + "step": 182450 + }, + { + "epoch": 25.899219304471256, + "grad_norm": 0.014003855176270008, + "learning_rate": 7.411156848828957e-05, + "loss": 0.05878941416740417, + "step": 182460 + }, + { + "epoch": 25.900638750887154, + "grad_norm": 0.03657088428735733, + "learning_rate": 7.411014904187367e-05, + "loss": 0.014867982268333435, + "step": 182470 + }, + { + "epoch": 25.902058197303052, + "grad_norm": 15.879858016967773, + "learning_rate": 7.410872959545777e-05, + "loss": 0.03394612669944763, + "step": 182480 + }, + { + "epoch": 25.90347764371895, + "grad_norm": 0.024481555446982384, + "learning_rate": 7.410731014904188e-05, + "loss": 0.011305215954780578, + "step": 182490 + }, + { + "epoch": 25.90489709013485, + "grad_norm": 6.700937271118164, + "learning_rate": 7.410589070262598e-05, + "loss": 0.014017327129840851, + "step": 182500 + }, + { + "epoch": 25.90489709013485, + "eval_accuracy": 0.9859477331976855, + "eval_loss": 0.060092292726039886, + "eval_runtime": 31.633, + "eval_samples_per_second": 497.171, + "eval_steps_per_second": 15.553, + "step": 182500 + }, + { + "epoch": 25.906316536550744, + "grad_norm": 0.08095698803663254, + "learning_rate": 7.410447125621009e-05, + 
"loss": 0.0025466863065958024, + "step": 182510 + }, + { + "epoch": 25.907735982966642, + "grad_norm": 2.8683793544769287, + "learning_rate": 7.410305180979419e-05, + "loss": 0.012289933860301971, + "step": 182520 + }, + { + "epoch": 25.90915542938254, + "grad_norm": 0.598667562007904, + "learning_rate": 7.410163236337828e-05, + "loss": 0.009825052320957183, + "step": 182530 + }, + { + "epoch": 25.91057487579844, + "grad_norm": 0.04160289838910103, + "learning_rate": 7.410021291696238e-05, + "loss": 0.013257338106632233, + "step": 182540 + }, + { + "epoch": 25.911994322214337, + "grad_norm": 0.03824185952544212, + "learning_rate": 7.409879347054649e-05, + "loss": 0.0017321981489658357, + "step": 182550 + }, + { + "epoch": 25.913413768630235, + "grad_norm": 0.0911448672413826, + "learning_rate": 7.409737402413059e-05, + "loss": 0.002647039666771889, + "step": 182560 + }, + { + "epoch": 25.914833215046134, + "grad_norm": 0.06498662382364273, + "learning_rate": 7.40959545777147e-05, + "loss": 0.0029141634702682497, + "step": 182570 + }, + { + "epoch": 25.91625266146203, + "grad_norm": 0.125472754240036, + "learning_rate": 7.40945351312988e-05, + "loss": 0.043365365266799925, + "step": 182580 + }, + { + "epoch": 25.917672107877927, + "grad_norm": 0.25213465094566345, + "learning_rate": 7.40931156848829e-05, + "loss": 0.014418220520019532, + "step": 182590 + }, + { + "epoch": 25.919091554293825, + "grad_norm": 0.03894510120153427, + "learning_rate": 7.4091696238467e-05, + "loss": 0.0031946513801813124, + "step": 182600 + }, + { + "epoch": 25.920511000709723, + "grad_norm": 0.2249029129743576, + "learning_rate": 7.40902767920511e-05, + "loss": 0.01936246156692505, + "step": 182610 + }, + { + "epoch": 25.92193044712562, + "grad_norm": 0.00967717170715332, + "learning_rate": 7.408885734563521e-05, + "loss": 0.001641826331615448, + "step": 182620 + }, + { + "epoch": 25.92334989354152, + "grad_norm": 0.6991938948631287, + "learning_rate": 7.40874378992193e-05, + "loss": 
0.01826585829257965, + "step": 182630 + }, + { + "epoch": 25.924769339957418, + "grad_norm": 1.7823221683502197, + "learning_rate": 7.408601845280341e-05, + "loss": 0.042367637157440186, + "step": 182640 + }, + { + "epoch": 25.926188786373313, + "grad_norm": 1.6676915884017944, + "learning_rate": 7.40845990063875e-05, + "loss": 0.009232044219970703, + "step": 182650 + }, + { + "epoch": 25.92760823278921, + "grad_norm": 0.11420632898807526, + "learning_rate": 7.408317955997162e-05, + "loss": 0.013116797804832459, + "step": 182660 + }, + { + "epoch": 25.92902767920511, + "grad_norm": 0.2333434522151947, + "learning_rate": 7.408176011355573e-05, + "loss": 0.009703756868839264, + "step": 182670 + }, + { + "epoch": 25.930447125621008, + "grad_norm": 0.18618333339691162, + "learning_rate": 7.408034066713981e-05, + "loss": 0.0166924387216568, + "step": 182680 + }, + { + "epoch": 25.931866572036906, + "grad_norm": 2.4819319248199463, + "learning_rate": 7.407892122072392e-05, + "loss": 0.04339458644390106, + "step": 182690 + }, + { + "epoch": 25.933286018452804, + "grad_norm": 0.09084921330213547, + "learning_rate": 7.407750177430802e-05, + "loss": 0.0018082816153764726, + "step": 182700 + }, + { + "epoch": 25.934705464868703, + "grad_norm": 0.21567562222480774, + "learning_rate": 7.407608232789213e-05, + "loss": 0.004091301932930946, + "step": 182710 + }, + { + "epoch": 25.936124911284598, + "grad_norm": 15.654061317443848, + "learning_rate": 7.407466288147623e-05, + "loss": 0.04718263149261474, + "step": 182720 + }, + { + "epoch": 25.937544357700496, + "grad_norm": 9.564797401428223, + "learning_rate": 7.407324343506034e-05, + "loss": 0.013506737351417542, + "step": 182730 + }, + { + "epoch": 25.938963804116394, + "grad_norm": 0.12193374335765839, + "learning_rate": 7.407182398864442e-05, + "loss": 0.008239512145519257, + "step": 182740 + }, + { + "epoch": 25.940383250532292, + "grad_norm": 16.03081703186035, + "learning_rate": 7.407040454222853e-05, + "loss": 
0.018369293212890624, + "step": 182750 + }, + { + "epoch": 25.94180269694819, + "grad_norm": 0.09167572855949402, + "learning_rate": 7.406898509581264e-05, + "loss": 0.0025456957519054413, + "step": 182760 + }, + { + "epoch": 25.94322214336409, + "grad_norm": 0.07925423979759216, + "learning_rate": 7.406756564939674e-05, + "loss": 0.03974187970161438, + "step": 182770 + }, + { + "epoch": 25.944641589779987, + "grad_norm": 0.7465562224388123, + "learning_rate": 7.406614620298085e-05, + "loss": 0.041642165184020995, + "step": 182780 + }, + { + "epoch": 25.946061036195882, + "grad_norm": 0.0605219341814518, + "learning_rate": 7.406472675656494e-05, + "loss": 0.026855272054672242, + "step": 182790 + }, + { + "epoch": 25.94748048261178, + "grad_norm": 11.880960464477539, + "learning_rate": 7.406330731014905e-05, + "loss": 0.015202383697032928, + "step": 182800 + }, + { + "epoch": 25.94889992902768, + "grad_norm": 0.24506203830242157, + "learning_rate": 7.406188786373314e-05, + "loss": 0.004094628617167473, + "step": 182810 + }, + { + "epoch": 25.950319375443577, + "grad_norm": 0.10587123036384583, + "learning_rate": 7.406046841731726e-05, + "loss": 0.01804138720035553, + "step": 182820 + }, + { + "epoch": 25.951738821859475, + "grad_norm": 0.3023914098739624, + "learning_rate": 7.405904897090135e-05, + "loss": 0.01903342604637146, + "step": 182830 + }, + { + "epoch": 25.953158268275374, + "grad_norm": 0.21383629739284515, + "learning_rate": 7.405762952448545e-05, + "loss": 0.0013116117566823958, + "step": 182840 + }, + { + "epoch": 25.954577714691272, + "grad_norm": 0.25566184520721436, + "learning_rate": 7.405621007806955e-05, + "loss": 0.013333827257156372, + "step": 182850 + }, + { + "epoch": 25.955997161107167, + "grad_norm": 0.040156446397304535, + "learning_rate": 7.405479063165366e-05, + "loss": 0.02429501712322235, + "step": 182860 + }, + { + "epoch": 25.957416607523065, + "grad_norm": 12.300079345703125, + "learning_rate": 7.405337118523777e-05, + "loss": 
0.020304258167743682, + "step": 182870 + }, + { + "epoch": 25.958836053938963, + "grad_norm": 0.029767291620373726, + "learning_rate": 7.405195173882187e-05, + "loss": 0.01451994627714157, + "step": 182880 + }, + { + "epoch": 25.96025550035486, + "grad_norm": 3.777977228164673, + "learning_rate": 7.405053229240596e-05, + "loss": 0.00483471043407917, + "step": 182890 + }, + { + "epoch": 25.96167494677076, + "grad_norm": 0.056128039956092834, + "learning_rate": 7.404911284599006e-05, + "loss": 0.013836896419525147, + "step": 182900 + }, + { + "epoch": 25.96309439318666, + "grad_norm": 7.229593276977539, + "learning_rate": 7.404769339957417e-05, + "loss": 0.008050240576267242, + "step": 182910 + }, + { + "epoch": 25.964513839602557, + "grad_norm": 1.3015624284744263, + "learning_rate": 7.404627395315827e-05, + "loss": 0.022721394896507263, + "step": 182920 + }, + { + "epoch": 25.96593328601845, + "grad_norm": 0.5484659075737, + "learning_rate": 7.404485450674238e-05, + "loss": 0.009732043743133545, + "step": 182930 + }, + { + "epoch": 25.96735273243435, + "grad_norm": 0.2696712613105774, + "learning_rate": 7.404343506032646e-05, + "loss": 0.0596104085445404, + "step": 182940 + }, + { + "epoch": 25.968772178850248, + "grad_norm": 4.112942695617676, + "learning_rate": 7.404201561391058e-05, + "loss": 0.00673166811466217, + "step": 182950 + }, + { + "epoch": 25.970191625266146, + "grad_norm": 1.227079153060913, + "learning_rate": 7.404059616749469e-05, + "loss": 0.0015543844550848008, + "step": 182960 + }, + { + "epoch": 25.971611071682045, + "grad_norm": 0.04353504255414009, + "learning_rate": 7.403917672107878e-05, + "loss": 0.02184876948595047, + "step": 182970 + }, + { + "epoch": 25.973030518097943, + "grad_norm": 6.654874324798584, + "learning_rate": 7.40377572746629e-05, + "loss": 0.01366931051015854, + "step": 182980 + }, + { + "epoch": 25.97444996451384, + "grad_norm": 9.260459899902344, + "learning_rate": 7.403633782824698e-05, + "loss": 0.03515231013298035, + 
"step": 182990 + }, + { + "epoch": 25.975869410929736, + "grad_norm": 1.183670163154602, + "learning_rate": 7.403491838183109e-05, + "loss": 0.040931490063667295, + "step": 183000 + }, + { + "epoch": 25.975869410929736, + "eval_accuracy": 0.984930374515165, + "eval_loss": 0.058381158858537674, + "eval_runtime": 31.4831, + "eval_samples_per_second": 499.538, + "eval_steps_per_second": 15.627, + "step": 183000 + }, + { + "epoch": 25.977288857345634, + "grad_norm": 13.118865013122559, + "learning_rate": 7.403349893541519e-05, + "loss": 0.03204590082168579, + "step": 183010 + }, + { + "epoch": 25.978708303761533, + "grad_norm": 0.4736606180667877, + "learning_rate": 7.40320794889993e-05, + "loss": 0.0062385469675064085, + "step": 183020 + }, + { + "epoch": 25.98012775017743, + "grad_norm": 0.008075454272329807, + "learning_rate": 7.40306600425834e-05, + "loss": 0.052863866090774536, + "step": 183030 + }, + { + "epoch": 25.98154719659333, + "grad_norm": 5.304965972900391, + "learning_rate": 7.402924059616749e-05, + "loss": 0.005596532672643662, + "step": 183040 + }, + { + "epoch": 25.982966643009227, + "grad_norm": 0.9012603163719177, + "learning_rate": 7.40278211497516e-05, + "loss": 0.01650320738554001, + "step": 183050 + }, + { + "epoch": 25.984386089425126, + "grad_norm": 0.014220334589481354, + "learning_rate": 7.40264017033357e-05, + "loss": 0.023412179946899415, + "step": 183060 + }, + { + "epoch": 25.98580553584102, + "grad_norm": 0.16151906549930573, + "learning_rate": 7.402498225691981e-05, + "loss": 0.007209156453609466, + "step": 183070 + }, + { + "epoch": 25.98722498225692, + "grad_norm": 0.33631572127342224, + "learning_rate": 7.402356281050391e-05, + "loss": 0.0008922237902879715, + "step": 183080 + }, + { + "epoch": 25.988644428672817, + "grad_norm": 0.08992263674736023, + "learning_rate": 7.402214336408802e-05, + "loss": 0.002841905876994133, + "step": 183090 + }, + { + "epoch": 25.990063875088715, + "grad_norm": 0.03369695320725441, + "learning_rate": 
7.40207239176721e-05, + "loss": 0.0029342386871576307, + "step": 183100 + }, + { + "epoch": 25.991483321504614, + "grad_norm": 0.08476461470127106, + "learning_rate": 7.401930447125621e-05, + "loss": 0.01070745587348938, + "step": 183110 + }, + { + "epoch": 25.992902767920512, + "grad_norm": 4.025931358337402, + "learning_rate": 7.401788502484031e-05, + "loss": 0.032642072439193724, + "step": 183120 + }, + { + "epoch": 25.99432221433641, + "grad_norm": 13.44384765625, + "learning_rate": 7.401646557842442e-05, + "loss": 0.02092517912387848, + "step": 183130 + }, + { + "epoch": 25.995741660752305, + "grad_norm": 0.046679865568876266, + "learning_rate": 7.401504613200852e-05, + "loss": 0.03528848886489868, + "step": 183140 + }, + { + "epoch": 25.997161107168203, + "grad_norm": 0.04742085561156273, + "learning_rate": 7.401362668559262e-05, + "loss": 0.01056586131453514, + "step": 183150 + }, + { + "epoch": 25.9985805535841, + "grad_norm": 0.12444628775119781, + "learning_rate": 7.401220723917673e-05, + "loss": 0.02360062599182129, + "step": 183160 + }, + { + "epoch": 26.0, + "grad_norm": 0.13015875220298767, + "learning_rate": 7.401078779276083e-05, + "loss": 0.02360498011112213, + "step": 183170 + }, + { + "epoch": 26.0014194464159, + "grad_norm": 10.788741111755371, + "learning_rate": 7.400936834634494e-05, + "loss": 0.021838706731796265, + "step": 183180 + }, + { + "epoch": 26.002838892831797, + "grad_norm": 0.11747226119041443, + "learning_rate": 7.400794889992903e-05, + "loss": 0.0048577550798654555, + "step": 183190 + }, + { + "epoch": 26.004258339247695, + "grad_norm": 1.9744071960449219, + "learning_rate": 7.400652945351313e-05, + "loss": 0.0020045511424541473, + "step": 183200 + }, + { + "epoch": 26.00567778566359, + "grad_norm": 0.12270019948482513, + "learning_rate": 7.400511000709723e-05, + "loss": 0.018512681126594543, + "step": 183210 + }, + { + "epoch": 26.007097232079488, + "grad_norm": 0.11087559163570404, + "learning_rate": 7.400369056068134e-05, + 
"loss": 0.006076197326183319, + "step": 183220 + }, + { + "epoch": 26.008516678495386, + "grad_norm": 0.03739660233259201, + "learning_rate": 7.400227111426544e-05, + "loss": 0.015680195391178132, + "step": 183230 + }, + { + "epoch": 26.009936124911285, + "grad_norm": 0.03730905428528786, + "learning_rate": 7.400085166784955e-05, + "loss": 0.012417271733283997, + "step": 183240 + }, + { + "epoch": 26.011355571327183, + "grad_norm": 0.020780738443136215, + "learning_rate": 7.399943222143365e-05, + "loss": 0.0012714568525552749, + "step": 183250 + }, + { + "epoch": 26.01277501774308, + "grad_norm": 0.007937679998576641, + "learning_rate": 7.399801277501774e-05, + "loss": 0.003024228662252426, + "step": 183260 + }, + { + "epoch": 26.01419446415898, + "grad_norm": 12.003710746765137, + "learning_rate": 7.399659332860185e-05, + "loss": 0.029048088192939758, + "step": 183270 + }, + { + "epoch": 26.015613910574874, + "grad_norm": 0.013102258555591106, + "learning_rate": 7.399517388218595e-05, + "loss": 0.0038975227624177934, + "step": 183280 + }, + { + "epoch": 26.017033356990773, + "grad_norm": 10.534189224243164, + "learning_rate": 7.399375443577006e-05, + "loss": 0.03702815175056458, + "step": 183290 + }, + { + "epoch": 26.01845280340667, + "grad_norm": 7.874550819396973, + "learning_rate": 7.399233498935415e-05, + "loss": 0.015293526649475097, + "step": 183300 + }, + { + "epoch": 26.01987224982257, + "grad_norm": 4.947314739227295, + "learning_rate": 7.399091554293826e-05, + "loss": 0.04244297742843628, + "step": 183310 + }, + { + "epoch": 26.021291696238467, + "grad_norm": 0.09300188720226288, + "learning_rate": 7.398949609652235e-05, + "loss": 0.0011481013149023055, + "step": 183320 + }, + { + "epoch": 26.022711142654366, + "grad_norm": 3.2100324630737305, + "learning_rate": 7.398807665010647e-05, + "loss": 0.003083512932062149, + "step": 183330 + }, + { + "epoch": 26.024130589070264, + "grad_norm": 8.895515441894531, + "learning_rate": 7.398665720369056e-05, + 
"loss": 0.03792979121208191, + "step": 183340 + }, + { + "epoch": 26.02555003548616, + "grad_norm": 0.006103934720158577, + "learning_rate": 7.398523775727466e-05, + "loss": 0.038037219643592836, + "step": 183350 + }, + { + "epoch": 26.026969481902057, + "grad_norm": 0.013287430629134178, + "learning_rate": 7.398381831085877e-05, + "loss": 0.013951146602630615, + "step": 183360 + }, + { + "epoch": 26.028388928317955, + "grad_norm": 10.018046379089355, + "learning_rate": 7.398239886444287e-05, + "loss": 0.010149547457695007, + "step": 183370 + }, + { + "epoch": 26.029808374733854, + "grad_norm": 0.5069403052330017, + "learning_rate": 7.398097941802698e-05, + "loss": 0.0006826542317867279, + "step": 183380 + }, + { + "epoch": 26.031227821149752, + "grad_norm": 0.36952850222587585, + "learning_rate": 7.397955997161108e-05, + "loss": 0.024126410484313965, + "step": 183390 + }, + { + "epoch": 26.03264726756565, + "grad_norm": 0.004950480069965124, + "learning_rate": 7.397814052519517e-05, + "loss": 0.010340959578752518, + "step": 183400 + }, + { + "epoch": 26.03406671398155, + "grad_norm": 0.1558225154876709, + "learning_rate": 7.397672107877927e-05, + "loss": 0.0017730869352817536, + "step": 183410 + }, + { + "epoch": 26.035486160397443, + "grad_norm": 0.5099972486495972, + "learning_rate": 7.397530163236338e-05, + "loss": 0.00968848168849945, + "step": 183420 + }, + { + "epoch": 26.03690560681334, + "grad_norm": 10.546296119689941, + "learning_rate": 7.397388218594748e-05, + "loss": 0.020409677922725678, + "step": 183430 + }, + { + "epoch": 26.03832505322924, + "grad_norm": 0.023672740906476974, + "learning_rate": 7.397246273953159e-05, + "loss": 0.0157056525349617, + "step": 183440 + }, + { + "epoch": 26.03974449964514, + "grad_norm": 0.09608198702335358, + "learning_rate": 7.397104329311569e-05, + "loss": 0.017443245649337767, + "step": 183450 + }, + { + "epoch": 26.041163946061037, + "grad_norm": 0.10958583652973175, + "learning_rate": 7.396962384669979e-05, + 
"loss": 0.00520542599260807, + "step": 183460 + }, + { + "epoch": 26.042583392476935, + "grad_norm": 12.953692436218262, + "learning_rate": 7.39682044002839e-05, + "loss": 0.02192756235599518, + "step": 183470 + }, + { + "epoch": 26.044002838892833, + "grad_norm": 0.3042963445186615, + "learning_rate": 7.3966784953868e-05, + "loss": 0.003009422495961189, + "step": 183480 + }, + { + "epoch": 26.045422285308728, + "grad_norm": 0.5177048444747925, + "learning_rate": 7.39653655074521e-05, + "loss": 0.001644083485007286, + "step": 183490 + }, + { + "epoch": 26.046841731724626, + "grad_norm": 0.3077037036418915, + "learning_rate": 7.39639460610362e-05, + "loss": 0.0038151491433382036, + "step": 183500 + }, + { + "epoch": 26.046841731724626, + "eval_accuracy": 0.9890633941629046, + "eval_loss": 0.0420011542737484, + "eval_runtime": 32.0979, + "eval_samples_per_second": 489.969, + "eval_steps_per_second": 15.328, + "step": 183500 + }, + { + "epoch": 26.048261178140525, + "grad_norm": 8.249155044555664, + "learning_rate": 7.39625266146203e-05, + "loss": 0.005245016142725945, + "step": 183510 + }, + { + "epoch": 26.049680624556423, + "grad_norm": 0.4869801104068756, + "learning_rate": 7.39611071682044e-05, + "loss": 0.031156882643699646, + "step": 183520 + }, + { + "epoch": 26.05110007097232, + "grad_norm": 0.6172145009040833, + "learning_rate": 7.395968772178851e-05, + "loss": 0.008197250962257385, + "step": 183530 + }, + { + "epoch": 26.05251951738822, + "grad_norm": 0.02749018929898739, + "learning_rate": 7.39582682753726e-05, + "loss": 0.013861705362796784, + "step": 183540 + }, + { + "epoch": 26.053938963804118, + "grad_norm": 0.6048078536987305, + "learning_rate": 7.395684882895672e-05, + "loss": 0.034680360555648805, + "step": 183550 + }, + { + "epoch": 26.055358410220013, + "grad_norm": 0.003067345591261983, + "learning_rate": 7.395542938254081e-05, + "loss": 0.014530543982982636, + "step": 183560 + }, + { + "epoch": 26.05677785663591, + "grad_norm": 
0.19871000945568085, + "learning_rate": 7.395400993612491e-05, + "loss": 0.00943761020898819, + "step": 183570 + }, + { + "epoch": 26.05819730305181, + "grad_norm": 0.6851097941398621, + "learning_rate": 7.395259048970902e-05, + "loss": 0.005794172361493111, + "step": 183580 + }, + { + "epoch": 26.059616749467708, + "grad_norm": 0.1443086862564087, + "learning_rate": 7.395117104329312e-05, + "loss": 0.020941361784934998, + "step": 183590 + }, + { + "epoch": 26.061036195883606, + "grad_norm": 0.12780022621154785, + "learning_rate": 7.394975159687723e-05, + "loss": 0.009748287498950958, + "step": 183600 + }, + { + "epoch": 26.062455642299504, + "grad_norm": 0.08469681441783905, + "learning_rate": 7.394833215046131e-05, + "loss": 0.004023326188325882, + "step": 183610 + }, + { + "epoch": 26.063875088715402, + "grad_norm": 15.409126281738281, + "learning_rate": 7.394691270404543e-05, + "loss": 0.007172147929668427, + "step": 183620 + }, + { + "epoch": 26.065294535131297, + "grad_norm": 0.010838734917342663, + "learning_rate": 7.394549325762952e-05, + "loss": 0.0021254803985357286, + "step": 183630 + }, + { + "epoch": 26.066713981547196, + "grad_norm": 0.01945671997964382, + "learning_rate": 7.394407381121363e-05, + "loss": 0.02933220863342285, + "step": 183640 + }, + { + "epoch": 26.068133427963094, + "grad_norm": 0.04665001109242439, + "learning_rate": 7.394265436479773e-05, + "loss": 0.003899688646197319, + "step": 183650 + }, + { + "epoch": 26.069552874378992, + "grad_norm": 0.05640785023570061, + "learning_rate": 7.394123491838183e-05, + "loss": 0.025949397683143617, + "step": 183660 + }, + { + "epoch": 26.07097232079489, + "grad_norm": 10.324722290039062, + "learning_rate": 7.393981547196594e-05, + "loss": 0.005002400651574135, + "step": 183670 + }, + { + "epoch": 26.07239176721079, + "grad_norm": 0.007364907767623663, + "learning_rate": 7.393839602555004e-05, + "loss": 0.004577473551034927, + "step": 183680 + }, + { + "epoch": 26.073811213626687, + "grad_norm": 
0.43592560291290283, + "learning_rate": 7.393697657913415e-05, + "loss": 0.023503924906253814, + "step": 183690 + }, + { + "epoch": 26.075230660042582, + "grad_norm": 0.7567896246910095, + "learning_rate": 7.393555713271824e-05, + "loss": 0.0015078194439411164, + "step": 183700 + }, + { + "epoch": 26.07665010645848, + "grad_norm": 0.11699097603559494, + "learning_rate": 7.393413768630234e-05, + "loss": 0.006677595525979995, + "step": 183710 + }, + { + "epoch": 26.07806955287438, + "grad_norm": 0.0174578744918108, + "learning_rate": 7.393271823988644e-05, + "loss": 0.005651940032839775, + "step": 183720 + }, + { + "epoch": 26.079488999290277, + "grad_norm": 0.5735676884651184, + "learning_rate": 7.393129879347055e-05, + "loss": 0.0325795829296112, + "step": 183730 + }, + { + "epoch": 26.080908445706175, + "grad_norm": 0.06417012214660645, + "learning_rate": 7.392987934705465e-05, + "loss": 0.006683145463466644, + "step": 183740 + }, + { + "epoch": 26.082327892122073, + "grad_norm": 7.796240329742432, + "learning_rate": 7.392845990063876e-05, + "loss": 0.016056892275810242, + "step": 183750 + }, + { + "epoch": 26.08374733853797, + "grad_norm": 0.005177214741706848, + "learning_rate": 7.392704045422286e-05, + "loss": 0.02728145718574524, + "step": 183760 + }, + { + "epoch": 26.085166784953866, + "grad_norm": 0.15786749124526978, + "learning_rate": 7.392562100780695e-05, + "loss": 0.012506267428398133, + "step": 183770 + }, + { + "epoch": 26.086586231369765, + "grad_norm": 0.08470668643712997, + "learning_rate": 7.392420156139106e-05, + "loss": 0.013493019342422485, + "step": 183780 + }, + { + "epoch": 26.088005677785663, + "grad_norm": 0.007081010844558477, + "learning_rate": 7.392278211497516e-05, + "loss": 0.006247552111744881, + "step": 183790 + }, + { + "epoch": 26.08942512420156, + "grad_norm": 0.09657178819179535, + "learning_rate": 7.392136266855927e-05, + "loss": 0.01038820594549179, + "step": 183800 + }, + { + "epoch": 26.09084457061746, + "grad_norm": 
15.04168701171875, + "learning_rate": 7.391994322214337e-05, + "loss": 0.03737404942512512, + "step": 183810 + }, + { + "epoch": 26.092264017033358, + "grad_norm": 0.16999591886997223, + "learning_rate": 7.391852377572747e-05, + "loss": 0.01458769291639328, + "step": 183820 + }, + { + "epoch": 26.093683463449256, + "grad_norm": 0.05672222748398781, + "learning_rate": 7.391710432931156e-05, + "loss": 0.009527239203453063, + "step": 183830 + }, + { + "epoch": 26.09510290986515, + "grad_norm": 0.5110501646995544, + "learning_rate": 7.391568488289568e-05, + "loss": 0.0007375482469797135, + "step": 183840 + }, + { + "epoch": 26.09652235628105, + "grad_norm": 2.2654612064361572, + "learning_rate": 7.391426543647977e-05, + "loss": 0.0016574598848819733, + "step": 183850 + }, + { + "epoch": 26.097941802696948, + "grad_norm": 0.15073886513710022, + "learning_rate": 7.391284599006388e-05, + "loss": 0.003994186595082283, + "step": 183860 + }, + { + "epoch": 26.099361249112846, + "grad_norm": 0.007318752817809582, + "learning_rate": 7.391142654364798e-05, + "loss": 0.0011989984661340714, + "step": 183870 + }, + { + "epoch": 26.100780695528744, + "grad_norm": 0.11590830981731415, + "learning_rate": 7.391000709723208e-05, + "loss": 0.008280227333307267, + "step": 183880 + }, + { + "epoch": 26.102200141944643, + "grad_norm": 0.07503677904605865, + "learning_rate": 7.390858765081619e-05, + "loss": 0.011586429923772812, + "step": 183890 + }, + { + "epoch": 26.10361958836054, + "grad_norm": 4.32110071182251, + "learning_rate": 7.390716820440029e-05, + "loss": 0.0038990940898656844, + "step": 183900 + }, + { + "epoch": 26.105039034776436, + "grad_norm": 8.976907730102539, + "learning_rate": 7.39057487579844e-05, + "loss": 0.019383983314037324, + "step": 183910 + }, + { + "epoch": 26.106458481192334, + "grad_norm": 0.006808376871049404, + "learning_rate": 7.390432931156848e-05, + "loss": 0.008423861116170883, + "step": 183920 + }, + { + "epoch": 26.107877927608232, + "grad_norm": 
3.402869939804077, + "learning_rate": 7.390290986515259e-05, + "loss": 0.003170362859964371, + "step": 183930 + }, + { + "epoch": 26.10929737402413, + "grad_norm": 0.04741450771689415, + "learning_rate": 7.390149041873669e-05, + "loss": 0.01856931298971176, + "step": 183940 + }, + { + "epoch": 26.11071682044003, + "grad_norm": 0.0139995776116848, + "learning_rate": 7.39000709723208e-05, + "loss": 0.04033329486846924, + "step": 183950 + }, + { + "epoch": 26.112136266855927, + "grad_norm": 1.120937705039978, + "learning_rate": 7.38986515259049e-05, + "loss": 0.01630972623825073, + "step": 183960 + }, + { + "epoch": 26.113555713271825, + "grad_norm": 0.15999341011047363, + "learning_rate": 7.3897232079489e-05, + "loss": 0.0502269446849823, + "step": 183970 + }, + { + "epoch": 26.11497515968772, + "grad_norm": 3.366875171661377, + "learning_rate": 7.389581263307311e-05, + "loss": 0.010100477933883667, + "step": 183980 + }, + { + "epoch": 26.11639460610362, + "grad_norm": 0.1484975963830948, + "learning_rate": 7.38943931866572e-05, + "loss": 0.00876191109418869, + "step": 183990 + }, + { + "epoch": 26.117814052519517, + "grad_norm": 0.26973387598991394, + "learning_rate": 7.389297374024132e-05, + "loss": 0.005481665953993797, + "step": 184000 + }, + { + "epoch": 26.117814052519517, + "eval_accuracy": 0.9863292427036306, + "eval_loss": 0.054624609649181366, + "eval_runtime": 32.592, + "eval_samples_per_second": 482.542, + "eval_steps_per_second": 15.096, + "step": 184000 + }, + { + "epoch": 26.119233498935415, + "grad_norm": 1.4017066955566406, + "learning_rate": 7.389155429382541e-05, + "loss": 0.021295350790023804, + "step": 184010 + }, + { + "epoch": 26.120652945351313, + "grad_norm": 0.25777944922447205, + "learning_rate": 7.389013484740951e-05, + "loss": 0.021484993398189545, + "step": 184020 + }, + { + "epoch": 26.12207239176721, + "grad_norm": 1.6629531383514404, + "learning_rate": 7.388871540099361e-05, + "loss": 0.03165159523487091, + "step": 184030 + }, + { + 
"epoch": 26.12349183818311, + "grad_norm": 0.02142961695790291, + "learning_rate": 7.388729595457772e-05, + "loss": 0.05142177939414978, + "step": 184040 + }, + { + "epoch": 26.124911284599005, + "grad_norm": 1.1923511028289795, + "learning_rate": 7.388587650816182e-05, + "loss": 0.001960355043411255, + "step": 184050 + }, + { + "epoch": 26.126330731014903, + "grad_norm": 3.3159379959106445, + "learning_rate": 7.388445706174593e-05, + "loss": 0.011824023723602296, + "step": 184060 + }, + { + "epoch": 26.1277501774308, + "grad_norm": 9.032801628112793, + "learning_rate": 7.388303761533002e-05, + "loss": 0.02361668348312378, + "step": 184070 + }, + { + "epoch": 26.1291696238467, + "grad_norm": 0.010690851137042046, + "learning_rate": 7.388161816891412e-05, + "loss": 0.0011725611984729767, + "step": 184080 + }, + { + "epoch": 26.130589070262598, + "grad_norm": 1.2630473375320435, + "learning_rate": 7.388019872249823e-05, + "loss": 0.00988658368587494, + "step": 184090 + }, + { + "epoch": 26.132008516678496, + "grad_norm": 0.5449743866920471, + "learning_rate": 7.387877927608233e-05, + "loss": 0.023012802004814148, + "step": 184100 + }, + { + "epoch": 26.133427963094395, + "grad_norm": 1.9463233947753906, + "learning_rate": 7.387735982966644e-05, + "loss": 0.022236505150794984, + "step": 184110 + }, + { + "epoch": 26.13484740951029, + "grad_norm": 0.03189481049776077, + "learning_rate": 7.387594038325052e-05, + "loss": 0.01105847880244255, + "step": 184120 + }, + { + "epoch": 26.136266855926188, + "grad_norm": 0.09058975428342819, + "learning_rate": 7.387452093683464e-05, + "loss": 0.009184753894805909, + "step": 184130 + }, + { + "epoch": 26.137686302342086, + "grad_norm": 0.7466179728507996, + "learning_rate": 7.387310149041873e-05, + "loss": 0.039346760511398314, + "step": 184140 + }, + { + "epoch": 26.139105748757984, + "grad_norm": 0.3607329726219177, + "learning_rate": 7.387168204400284e-05, + "loss": 0.016282416880130768, + "step": 184150 + }, + { + "epoch": 
26.140525195173883, + "grad_norm": 3.084336042404175, + "learning_rate": 7.387026259758695e-05, + "loss": 0.01022809073328972, + "step": 184160 + }, + { + "epoch": 26.14194464158978, + "grad_norm": 0.5961232781410217, + "learning_rate": 7.386884315117105e-05, + "loss": 0.0036249659955501557, + "step": 184170 + }, + { + "epoch": 26.14336408800568, + "grad_norm": 0.02202095463871956, + "learning_rate": 7.386742370475515e-05, + "loss": 0.00831594318151474, + "step": 184180 + }, + { + "epoch": 26.144783534421574, + "grad_norm": 1.8926854133605957, + "learning_rate": 7.386600425833925e-05, + "loss": 0.004197090119123459, + "step": 184190 + }, + { + "epoch": 26.146202980837472, + "grad_norm": 0.37755268812179565, + "learning_rate": 7.386458481192336e-05, + "loss": 0.0009796518832445144, + "step": 184200 + }, + { + "epoch": 26.14762242725337, + "grad_norm": 2.1853926181793213, + "learning_rate": 7.386316536550745e-05, + "loss": 0.019043274223804474, + "step": 184210 + }, + { + "epoch": 26.14904187366927, + "grad_norm": 0.2749074399471283, + "learning_rate": 7.386174591909157e-05, + "loss": 0.0023919031023979186, + "step": 184220 + }, + { + "epoch": 26.150461320085167, + "grad_norm": 0.26760417222976685, + "learning_rate": 7.386032647267565e-05, + "loss": 0.029241687059402464, + "step": 184230 + }, + { + "epoch": 26.151880766501066, + "grad_norm": 0.4997667968273163, + "learning_rate": 7.385890702625976e-05, + "loss": 0.01035483330488205, + "step": 184240 + }, + { + "epoch": 26.153300212916964, + "grad_norm": 0.997648298740387, + "learning_rate": 7.385748757984387e-05, + "loss": 0.006905896961688996, + "step": 184250 + }, + { + "epoch": 26.15471965933286, + "grad_norm": 6.176813125610352, + "learning_rate": 7.385606813342797e-05, + "loss": 0.005587688460946083, + "step": 184260 + }, + { + "epoch": 26.156139105748757, + "grad_norm": 0.03476956859230995, + "learning_rate": 7.385464868701208e-05, + "loss": 0.0013323772698640823, + "step": 184270 + }, + { + "epoch": 
26.157558552164655, + "grad_norm": 0.006198673043400049, + "learning_rate": 7.385322924059616e-05, + "loss": 0.01849823594093323, + "step": 184280 + }, + { + "epoch": 26.158977998580554, + "grad_norm": 0.07879167050123215, + "learning_rate": 7.385180979418027e-05, + "loss": 0.02317288815975189, + "step": 184290 + }, + { + "epoch": 26.160397444996452, + "grad_norm": 0.02473353035748005, + "learning_rate": 7.385039034776437e-05, + "loss": 0.002321012318134308, + "step": 184300 + }, + { + "epoch": 26.16181689141235, + "grad_norm": 0.36795705556869507, + "learning_rate": 7.384897090134848e-05, + "loss": 0.0038444578647613527, + "step": 184310 + }, + { + "epoch": 26.16323633782825, + "grad_norm": 0.038107242435216904, + "learning_rate": 7.384755145493258e-05, + "loss": 0.005904395505785942, + "step": 184320 + }, + { + "epoch": 26.164655784244143, + "grad_norm": 2.1546318531036377, + "learning_rate": 7.384613200851668e-05, + "loss": 0.02142348885536194, + "step": 184330 + }, + { + "epoch": 26.16607523066004, + "grad_norm": 0.04774980992078781, + "learning_rate": 7.384471256210079e-05, + "loss": 0.001890970766544342, + "step": 184340 + }, + { + "epoch": 26.16749467707594, + "grad_norm": 0.49041885137557983, + "learning_rate": 7.384329311568489e-05, + "loss": 0.01698015332221985, + "step": 184350 + }, + { + "epoch": 26.168914123491838, + "grad_norm": 1.0192134380340576, + "learning_rate": 7.3841873669269e-05, + "loss": 0.014211487770080567, + "step": 184360 + }, + { + "epoch": 26.170333569907736, + "grad_norm": 12.436442375183105, + "learning_rate": 7.38404542228531e-05, + "loss": 0.03799489736557007, + "step": 184370 + }, + { + "epoch": 26.171753016323635, + "grad_norm": 16.317834854125977, + "learning_rate": 7.383903477643719e-05, + "loss": 0.008270035684108733, + "step": 184380 + }, + { + "epoch": 26.173172462739533, + "grad_norm": 0.04950987175107002, + "learning_rate": 7.383761533002129e-05, + "loss": 0.06682473421096802, + "step": 184390 + }, + { + "epoch": 
26.174591909155428, + "grad_norm": 0.543780505657196, + "learning_rate": 7.38361958836054e-05, + "loss": 0.016939987242221833, + "step": 184400 + }, + { + "epoch": 26.176011355571326, + "grad_norm": 0.5939234495162964, + "learning_rate": 7.38347764371895e-05, + "loss": 0.05154969692230225, + "step": 184410 + }, + { + "epoch": 26.177430801987224, + "grad_norm": 0.1701822131872177, + "learning_rate": 7.383335699077361e-05, + "loss": 0.012877354025840759, + "step": 184420 + }, + { + "epoch": 26.178850248403123, + "grad_norm": 17.054779052734375, + "learning_rate": 7.38319375443577e-05, + "loss": 0.030532556772232055, + "step": 184430 + }, + { + "epoch": 26.18026969481902, + "grad_norm": 0.01531074196100235, + "learning_rate": 7.38305180979418e-05, + "loss": 0.02657327950000763, + "step": 184440 + }, + { + "epoch": 26.18168914123492, + "grad_norm": 0.2979150414466858, + "learning_rate": 7.382909865152591e-05, + "loss": 0.0028955597430467606, + "step": 184450 + }, + { + "epoch": 26.183108587650818, + "grad_norm": 0.12251783162355423, + "learning_rate": 7.382767920511001e-05, + "loss": 0.014662323892116547, + "step": 184460 + }, + { + "epoch": 26.184528034066712, + "grad_norm": 6.830258369445801, + "learning_rate": 7.382625975869412e-05, + "loss": 0.009748994559049606, + "step": 184470 + }, + { + "epoch": 26.18594748048261, + "grad_norm": 0.2782101035118103, + "learning_rate": 7.38248403122782e-05, + "loss": 0.013789930939674377, + "step": 184480 + }, + { + "epoch": 26.18736692689851, + "grad_norm": 10.132781982421875, + "learning_rate": 7.382342086586232e-05, + "loss": 0.04912788867950439, + "step": 184490 + }, + { + "epoch": 26.188786373314407, + "grad_norm": 1.4574706554412842, + "learning_rate": 7.382200141944641e-05, + "loss": 0.02637142837047577, + "step": 184500 + }, + { + "epoch": 26.188786373314407, + "eval_accuracy": 0.9859477331976855, + "eval_loss": 0.06016118824481964, + "eval_runtime": 31.5167, + "eval_samples_per_second": 499.005, + 
"eval_steps_per_second": 15.611, + "step": 184500 + }, + { + "epoch": 26.190205819730306, + "grad_norm": 1.8633427619934082, + "learning_rate": 7.382058197303053e-05, + "loss": 0.05052157044410706, + "step": 184510 + }, + { + "epoch": 26.191625266146204, + "grad_norm": 6.298516750335693, + "learning_rate": 7.381916252661462e-05, + "loss": 0.03129304051399231, + "step": 184520 + }, + { + "epoch": 26.193044712562102, + "grad_norm": 0.01304350234568119, + "learning_rate": 7.381774308019873e-05, + "loss": 0.028051820397377015, + "step": 184530 + }, + { + "epoch": 26.194464158977997, + "grad_norm": 0.13281132280826569, + "learning_rate": 7.381632363378283e-05, + "loss": 0.004647288843989373, + "step": 184540 + }, + { + "epoch": 26.195883605393895, + "grad_norm": 1.293464183807373, + "learning_rate": 7.381490418736693e-05, + "loss": 0.011637023091316223, + "step": 184550 + }, + { + "epoch": 26.197303051809794, + "grad_norm": 0.22358852624893188, + "learning_rate": 7.381348474095104e-05, + "loss": 0.021714147925376893, + "step": 184560 + }, + { + "epoch": 26.198722498225692, + "grad_norm": 0.0457102507352829, + "learning_rate": 7.381206529453514e-05, + "loss": 0.012100283801555634, + "step": 184570 + }, + { + "epoch": 26.20014194464159, + "grad_norm": 12.135912895202637, + "learning_rate": 7.381078779276082e-05, + "loss": 0.031923803687095645, + "step": 184580 + }, + { + "epoch": 26.20156139105749, + "grad_norm": 6.0857110023498535, + "learning_rate": 7.380936834634493e-05, + "loss": 0.012692299485206605, + "step": 184590 + }, + { + "epoch": 26.202980837473387, + "grad_norm": 0.11880803108215332, + "learning_rate": 7.380794889992903e-05, + "loss": 0.0042447056621313095, + "step": 184600 + }, + { + "epoch": 26.20440028388928, + "grad_norm": 0.08639328181743622, + "learning_rate": 7.380652945351313e-05, + "loss": 0.00617358423769474, + "step": 184610 + }, + { + "epoch": 26.20581973030518, + "grad_norm": 0.4085674285888672, + "learning_rate": 7.380511000709724e-05, + "loss": 
0.033511051535606386, + "step": 184620 + }, + { + "epoch": 26.207239176721078, + "grad_norm": 0.07244665175676346, + "learning_rate": 7.380369056068134e-05, + "loss": 0.007279226928949356, + "step": 184630 + }, + { + "epoch": 26.208658623136976, + "grad_norm": 3.5806918144226074, + "learning_rate": 7.380227111426545e-05, + "loss": 0.028893351554870605, + "step": 184640 + }, + { + "epoch": 26.210078069552875, + "grad_norm": 0.18039610981941223, + "learning_rate": 7.380085166784954e-05, + "loss": 0.017935852706432342, + "step": 184650 + }, + { + "epoch": 26.211497515968773, + "grad_norm": 0.09556759148836136, + "learning_rate": 7.379943222143364e-05, + "loss": 0.022977480292320253, + "step": 184660 + }, + { + "epoch": 26.21291696238467, + "grad_norm": 2.2760350704193115, + "learning_rate": 7.379801277501774e-05, + "loss": 0.015171225368976592, + "step": 184670 + }, + { + "epoch": 26.214336408800566, + "grad_norm": 0.016004404053092003, + "learning_rate": 7.379659332860185e-05, + "loss": 0.001689939945936203, + "step": 184680 + }, + { + "epoch": 26.215755855216464, + "grad_norm": 4.48115873336792, + "learning_rate": 7.379517388218595e-05, + "loss": 0.003541301563382149, + "step": 184690 + }, + { + "epoch": 26.217175301632363, + "grad_norm": 19.323917388916016, + "learning_rate": 7.379375443577006e-05, + "loss": 0.028596460819244385, + "step": 184700 + }, + { + "epoch": 26.21859474804826, + "grad_norm": 5.3679327964782715, + "learning_rate": 7.379233498935416e-05, + "loss": 0.01941956877708435, + "step": 184710 + }, + { + "epoch": 26.22001419446416, + "grad_norm": 0.33227890729904175, + "learning_rate": 7.379091554293825e-05, + "loss": 0.010048449039459229, + "step": 184720 + }, + { + "epoch": 26.221433640880058, + "grad_norm": 0.301633358001709, + "learning_rate": 7.378949609652236e-05, + "loss": 0.01560499370098114, + "step": 184730 + }, + { + "epoch": 26.222853087295956, + "grad_norm": 0.008023141883313656, + "learning_rate": 7.378807665010646e-05, + "loss": 
0.01796157956123352, + "step": 184740 + }, + { + "epoch": 26.22427253371185, + "grad_norm": 0.10872285068035126, + "learning_rate": 7.378665720369057e-05, + "loss": 0.010085164755582809, + "step": 184750 + }, + { + "epoch": 26.22569198012775, + "grad_norm": 0.007022215984761715, + "learning_rate": 7.378523775727466e-05, + "loss": 0.024917921423912047, + "step": 184760 + }, + { + "epoch": 26.227111426543647, + "grad_norm": 10.503836631774902, + "learning_rate": 7.378381831085877e-05, + "loss": 0.011169569194316864, + "step": 184770 + }, + { + "epoch": 26.228530872959546, + "grad_norm": 0.014461837708950043, + "learning_rate": 7.378239886444286e-05, + "loss": 0.024939756095409393, + "step": 184780 + }, + { + "epoch": 26.229950319375444, + "grad_norm": 0.06270354986190796, + "learning_rate": 7.378097941802697e-05, + "loss": 0.008554065227508545, + "step": 184790 + }, + { + "epoch": 26.231369765791342, + "grad_norm": 0.03123842366039753, + "learning_rate": 7.377955997161107e-05, + "loss": 0.038437068462371826, + "step": 184800 + }, + { + "epoch": 26.23278921220724, + "grad_norm": 0.641308069229126, + "learning_rate": 7.377814052519518e-05, + "loss": 0.002106175944209099, + "step": 184810 + }, + { + "epoch": 26.234208658623135, + "grad_norm": 0.29526522755622864, + "learning_rate": 7.377672107877928e-05, + "loss": 0.004230520874261856, + "step": 184820 + }, + { + "epoch": 26.235628105039034, + "grad_norm": 0.028513310477137566, + "learning_rate": 7.377530163236338e-05, + "loss": 0.0033574145287275313, + "step": 184830 + }, + { + "epoch": 26.237047551454932, + "grad_norm": 0.3956826627254486, + "learning_rate": 7.377388218594749e-05, + "loss": 0.002624880149960518, + "step": 184840 + }, + { + "epoch": 26.23846699787083, + "grad_norm": 4.37926721572876, + "learning_rate": 7.377246273953159e-05, + "loss": 0.004473377019166946, + "step": 184850 + }, + { + "epoch": 26.23988644428673, + "grad_norm": 2.473263740539551, + "learning_rate": 7.37710432931157e-05, + "loss": 
0.0029740925878286363, + "step": 184860 + }, + { + "epoch": 26.241305890702627, + "grad_norm": 0.038411326706409454, + "learning_rate": 7.376962384669978e-05, + "loss": 0.021022553741931915, + "step": 184870 + }, + { + "epoch": 26.242725337118525, + "grad_norm": 3.1101856231689453, + "learning_rate": 7.376820440028389e-05, + "loss": 0.010681232810020447, + "step": 184880 + }, + { + "epoch": 26.24414478353442, + "grad_norm": 0.1012149453163147, + "learning_rate": 7.376678495386799e-05, + "loss": 0.03075593411922455, + "step": 184890 + }, + { + "epoch": 26.24556422995032, + "grad_norm": 0.1774710863828659, + "learning_rate": 7.37653655074521e-05, + "loss": 0.022092045843601228, + "step": 184900 + }, + { + "epoch": 26.246983676366217, + "grad_norm": 0.13196063041687012, + "learning_rate": 7.376394606103621e-05, + "loss": 0.009542499482631684, + "step": 184910 + }, + { + "epoch": 26.248403122782115, + "grad_norm": 0.00973777286708355, + "learning_rate": 7.37625266146203e-05, + "loss": 0.04358325302600861, + "step": 184920 + }, + { + "epoch": 26.249822569198013, + "grad_norm": 0.010159406810998917, + "learning_rate": 7.37611071682044e-05, + "loss": 0.025490158796310426, + "step": 184930 + }, + { + "epoch": 26.25124201561391, + "grad_norm": 5.495299816131592, + "learning_rate": 7.37596877217885e-05, + "loss": 0.025832265615463257, + "step": 184940 + }, + { + "epoch": 26.25266146202981, + "grad_norm": 3.5420539379119873, + "learning_rate": 7.375826827537261e-05, + "loss": 0.013268698751926423, + "step": 184950 + }, + { + "epoch": 26.254080908445705, + "grad_norm": 0.18698039650917053, + "learning_rate": 7.375684882895671e-05, + "loss": 0.008921249210834504, + "step": 184960 + }, + { + "epoch": 26.255500354861603, + "grad_norm": 0.1474723070859909, + "learning_rate": 7.375542938254081e-05, + "loss": 0.003880874812602997, + "step": 184970 + }, + { + "epoch": 26.2569198012775, + "grad_norm": 0.028817197307944298, + "learning_rate": 7.37540099361249e-05, + "loss": 
0.002471631020307541, + "step": 184980 + }, + { + "epoch": 26.2583392476934, + "grad_norm": 0.7333594560623169, + "learning_rate": 7.375259048970902e-05, + "loss": 0.004227428883314133, + "step": 184990 + }, + { + "epoch": 26.259758694109298, + "grad_norm": 0.04642467200756073, + "learning_rate": 7.375117104329311e-05, + "loss": 0.021403390169143676, + "step": 185000 + }, + { + "epoch": 26.259758694109298, + "eval_accuracy": 0.9892541489158772, + "eval_loss": 0.04816208407282829, + "eval_runtime": 31.5569, + "eval_samples_per_second": 498.369, + "eval_steps_per_second": 15.591, + "step": 185000 + }, + { + "epoch": 26.261178140525196, + "grad_norm": 10.70080852508545, + "learning_rate": 7.374975159687723e-05, + "loss": 0.012696191668510437, + "step": 185010 + }, + { + "epoch": 26.262597586941094, + "grad_norm": 0.008780214004218578, + "learning_rate": 7.374833215046132e-05, + "loss": 0.030699312686920166, + "step": 185020 + }, + { + "epoch": 26.26401703335699, + "grad_norm": 0.027556531131267548, + "learning_rate": 7.374691270404542e-05, + "loss": 0.004156223684549332, + "step": 185030 + }, + { + "epoch": 26.265436479772887, + "grad_norm": 0.009346378967165947, + "learning_rate": 7.374549325762953e-05, + "loss": 0.008067401498556137, + "step": 185040 + }, + { + "epoch": 26.266855926188786, + "grad_norm": 1.0227258205413818, + "learning_rate": 7.374407381121363e-05, + "loss": 0.011782585084438324, + "step": 185050 + }, + { + "epoch": 26.268275372604684, + "grad_norm": 4.250153064727783, + "learning_rate": 7.374265436479774e-05, + "loss": 0.009886081516742706, + "step": 185060 + }, + { + "epoch": 26.269694819020582, + "grad_norm": 1.2519447803497314, + "learning_rate": 7.374123491838182e-05, + "loss": 0.008790290355682373, + "step": 185070 + }, + { + "epoch": 26.27111426543648, + "grad_norm": 0.25269949436187744, + "learning_rate": 7.373981547196593e-05, + "loss": 0.006213304027915001, + "step": 185080 + }, + { + "epoch": 26.27253371185238, + "grad_norm": 
0.17747043073177338, + "learning_rate": 7.373839602555003e-05, + "loss": 0.013549353182315826, + "step": 185090 + }, + { + "epoch": 26.273953158268274, + "grad_norm": 0.05491011589765549, + "learning_rate": 7.373697657913414e-05, + "loss": 0.03550673723220825, + "step": 185100 + }, + { + "epoch": 26.275372604684172, + "grad_norm": 0.04167938977479935, + "learning_rate": 7.373555713271825e-05, + "loss": 0.008231434226036071, + "step": 185110 + }, + { + "epoch": 26.27679205110007, + "grad_norm": 14.092864990234375, + "learning_rate": 7.373413768630234e-05, + "loss": 0.03515152931213379, + "step": 185120 + }, + { + "epoch": 26.27821149751597, + "grad_norm": 1.4639110565185547, + "learning_rate": 7.373271823988645e-05, + "loss": 0.00917278379201889, + "step": 185130 + }, + { + "epoch": 26.279630943931867, + "grad_norm": 0.003116186009719968, + "learning_rate": 7.373129879347055e-05, + "loss": 0.0013883888721466064, + "step": 185140 + }, + { + "epoch": 26.281050390347765, + "grad_norm": 0.011794686317443848, + "learning_rate": 7.372987934705466e-05, + "loss": 0.03481429517269134, + "step": 185150 + }, + { + "epoch": 26.282469836763664, + "grad_norm": 0.04325925186276436, + "learning_rate": 7.372845990063875e-05, + "loss": 0.017799718677997588, + "step": 185160 + }, + { + "epoch": 26.28388928317956, + "grad_norm": 9.590524673461914, + "learning_rate": 7.372704045422286e-05, + "loss": 0.00506819486618042, + "step": 185170 + }, + { + "epoch": 26.285308729595457, + "grad_norm": 2.0638937950134277, + "learning_rate": 7.372562100780695e-05, + "loss": 0.0029776226729154587, + "step": 185180 + }, + { + "epoch": 26.286728176011355, + "grad_norm": 13.610956192016602, + "learning_rate": 7.372420156139106e-05, + "loss": 0.03397278785705567, + "step": 185190 + }, + { + "epoch": 26.288147622427253, + "grad_norm": 1.553086757659912, + "learning_rate": 7.372278211497517e-05, + "loss": 0.011890456080436707, + "step": 185200 + }, + { + "epoch": 26.28956706884315, + "grad_norm": 
6.196134567260742, + "learning_rate": 7.372136266855927e-05, + "loss": 0.007199671864509582, + "step": 185210 + }, + { + "epoch": 26.29098651525905, + "grad_norm": 0.2108820229768753, + "learning_rate": 7.371994322214338e-05, + "loss": 0.021299007534980773, + "step": 185220 + }, + { + "epoch": 26.292405961674948, + "grad_norm": 0.13553056120872498, + "learning_rate": 7.371852377572746e-05, + "loss": 0.06820040345191955, + "step": 185230 + }, + { + "epoch": 26.293825408090843, + "grad_norm": 0.12938940525054932, + "learning_rate": 7.371710432931157e-05, + "loss": 0.0097171351313591, + "step": 185240 + }, + { + "epoch": 26.29524485450674, + "grad_norm": 0.003733355551958084, + "learning_rate": 7.371568488289567e-05, + "loss": 0.010588126629590989, + "step": 185250 + }, + { + "epoch": 26.29666430092264, + "grad_norm": 10.34849739074707, + "learning_rate": 7.371426543647978e-05, + "loss": 0.01931910216808319, + "step": 185260 + }, + { + "epoch": 26.298083747338538, + "grad_norm": 0.356342613697052, + "learning_rate": 7.371284599006388e-05, + "loss": 0.005565590411424637, + "step": 185270 + }, + { + "epoch": 26.299503193754436, + "grad_norm": 9.761873245239258, + "learning_rate": 7.371142654364798e-05, + "loss": 0.009288103878498077, + "step": 185280 + }, + { + "epoch": 26.300922640170334, + "grad_norm": 0.34845805168151855, + "learning_rate": 7.371000709723209e-05, + "loss": 0.005480292439460755, + "step": 185290 + }, + { + "epoch": 26.302342086586233, + "grad_norm": 0.04067667946219444, + "learning_rate": 7.370858765081618e-05, + "loss": 0.017999491095542906, + "step": 185300 + }, + { + "epoch": 26.303761533002127, + "grad_norm": 0.2408369481563568, + "learning_rate": 7.37071682044003e-05, + "loss": 0.005985639244318009, + "step": 185310 + }, + { + "epoch": 26.305180979418026, + "grad_norm": 0.009274369105696678, + "learning_rate": 7.370574875798439e-05, + "loss": 0.0026252716779708862, + "step": 185320 + }, + { + "epoch": 26.306600425833924, + "grad_norm": 
0.19905032217502594, + "learning_rate": 7.370432931156849e-05, + "loss": 0.007606388628482818, + "step": 185330 + }, + { + "epoch": 26.308019872249822, + "grad_norm": 1.246777057647705, + "learning_rate": 7.370290986515259e-05, + "loss": 0.026966693997383117, + "step": 185340 + }, + { + "epoch": 26.30943931866572, + "grad_norm": 0.0737718790769577, + "learning_rate": 7.37014904187367e-05, + "loss": 0.009262524545192719, + "step": 185350 + }, + { + "epoch": 26.31085876508162, + "grad_norm": 0.008949129842221737, + "learning_rate": 7.37000709723208e-05, + "loss": 0.0009356290102005005, + "step": 185360 + }, + { + "epoch": 26.312278211497517, + "grad_norm": 0.19915853440761566, + "learning_rate": 7.369865152590491e-05, + "loss": 0.012062560766935349, + "step": 185370 + }, + { + "epoch": 26.313697657913412, + "grad_norm": 0.6373600363731384, + "learning_rate": 7.3697232079489e-05, + "loss": 0.0010539937764406204, + "step": 185380 + }, + { + "epoch": 26.31511710432931, + "grad_norm": 2.735361099243164, + "learning_rate": 7.36958126330731e-05, + "loss": 0.008680413663387298, + "step": 185390 + }, + { + "epoch": 26.31653655074521, + "grad_norm": 6.560279846191406, + "learning_rate": 7.369439318665721e-05, + "loss": 0.02447073757648468, + "step": 185400 + }, + { + "epoch": 26.317955997161107, + "grad_norm": 0.02329465188086033, + "learning_rate": 7.369297374024131e-05, + "loss": 0.008048436045646668, + "step": 185410 + }, + { + "epoch": 26.319375443577005, + "grad_norm": 1.3923299312591553, + "learning_rate": 7.369155429382542e-05, + "loss": 0.0038559455424547195, + "step": 185420 + }, + { + "epoch": 26.320794889992904, + "grad_norm": 17.76538848876953, + "learning_rate": 7.36901348474095e-05, + "loss": 0.025103604793548583, + "step": 185430 + }, + { + "epoch": 26.322214336408802, + "grad_norm": 0.9311656951904297, + "learning_rate": 7.368871540099362e-05, + "loss": 0.0018746614456176757, + "step": 185440 + }, + { + "epoch": 26.323633782824697, + "grad_norm": 
4.119106292724609, + "learning_rate": 7.368729595457771e-05, + "loss": 0.05502949953079224, + "step": 185450 + }, + { + "epoch": 26.325053229240595, + "grad_norm": 2.002295970916748, + "learning_rate": 7.368587650816182e-05, + "loss": 0.030594706535339355, + "step": 185460 + }, + { + "epoch": 26.326472675656493, + "grad_norm": 0.5241455435752869, + "learning_rate": 7.368445706174592e-05, + "loss": 0.04083030223846436, + "step": 185470 + }, + { + "epoch": 26.32789212207239, + "grad_norm": 6.970800876617432, + "learning_rate": 7.368303761533002e-05, + "loss": 0.07771967649459839, + "step": 185480 + }, + { + "epoch": 26.32931156848829, + "grad_norm": 0.18519417941570282, + "learning_rate": 7.368161816891413e-05, + "loss": 0.01068648397922516, + "step": 185490 + }, + { + "epoch": 26.330731014904188, + "grad_norm": 0.18255914747714996, + "learning_rate": 7.368019872249823e-05, + "loss": 0.008278429508209229, + "step": 185500 + }, + { + "epoch": 26.330731014904188, + "eval_accuracy": 0.9860749030330006, + "eval_loss": 0.0583660714328289, + "eval_runtime": 31.9502, + "eval_samples_per_second": 492.235, + "eval_steps_per_second": 15.399, + "step": 185500 + }, + { + "epoch": 26.332150461320087, + "grad_norm": 0.26128241419792175, + "learning_rate": 7.367877927608234e-05, + "loss": 0.005008697509765625, + "step": 185510 + }, + { + "epoch": 26.33356990773598, + "grad_norm": 0.04760384559631348, + "learning_rate": 7.367735982966644e-05, + "loss": 0.0010403305292129516, + "step": 185520 + }, + { + "epoch": 26.33498935415188, + "grad_norm": 0.055734459310770035, + "learning_rate": 7.367594038325055e-05, + "loss": 0.05472882390022278, + "step": 185530 + }, + { + "epoch": 26.336408800567778, + "grad_norm": 0.5349218249320984, + "learning_rate": 7.367452093683463e-05, + "loss": 0.012010229378938675, + "step": 185540 + }, + { + "epoch": 26.337828246983676, + "grad_norm": 3.2898306846618652, + "learning_rate": 7.367310149041874e-05, + "loss": 0.009547671675682068, + "step": 185550 + 
}, + { + "epoch": 26.339247693399575, + "grad_norm": 1.2523845434188843, + "learning_rate": 7.367168204400284e-05, + "loss": 0.014304380118846893, + "step": 185560 + }, + { + "epoch": 26.340667139815473, + "grad_norm": 0.11693774908781052, + "learning_rate": 7.367026259758695e-05, + "loss": 0.011364903301000595, + "step": 185570 + }, + { + "epoch": 26.34208658623137, + "grad_norm": 0.0088630560785532, + "learning_rate": 7.366884315117105e-05, + "loss": 0.0022801902145147324, + "step": 185580 + }, + { + "epoch": 26.343506032647266, + "grad_norm": 0.34860771894454956, + "learning_rate": 7.366742370475514e-05, + "loss": 0.019647878408432008, + "step": 185590 + }, + { + "epoch": 26.344925479063164, + "grad_norm": 2.0048084259033203, + "learning_rate": 7.366600425833926e-05, + "loss": 0.0025117002427577973, + "step": 185600 + }, + { + "epoch": 26.346344925479062, + "grad_norm": 1.3591150045394897, + "learning_rate": 7.366458481192335e-05, + "loss": 0.021052075922489165, + "step": 185610 + }, + { + "epoch": 26.34776437189496, + "grad_norm": 11.956613540649414, + "learning_rate": 7.366316536550746e-05, + "loss": 0.04649336934089661, + "step": 185620 + }, + { + "epoch": 26.34918381831086, + "grad_norm": 6.438827991485596, + "learning_rate": 7.366174591909156e-05, + "loss": 0.02432842701673508, + "step": 185630 + }, + { + "epoch": 26.350603264726757, + "grad_norm": 12.976755142211914, + "learning_rate": 7.366032647267566e-05, + "loss": 0.030032038688659668, + "step": 185640 + }, + { + "epoch": 26.352022711142656, + "grad_norm": 0.029744205996394157, + "learning_rate": 7.365890702625976e-05, + "loss": 0.008355945348739624, + "step": 185650 + }, + { + "epoch": 26.35344215755855, + "grad_norm": 0.2410801351070404, + "learning_rate": 7.365748757984387e-05, + "loss": 0.002596767619252205, + "step": 185660 + }, + { + "epoch": 26.35486160397445, + "grad_norm": 3.141378164291382, + "learning_rate": 7.365606813342796e-05, + "loss": 0.010746054351329803, + "step": 185670 + }, + { + 
"epoch": 26.356281050390347, + "grad_norm": 2.674225091934204, + "learning_rate": 7.365464868701207e-05, + "loss": 0.00278434120118618, + "step": 185680 + }, + { + "epoch": 26.357700496806245, + "grad_norm": 0.12482249736785889, + "learning_rate": 7.365322924059617e-05, + "loss": 0.0032265182584524156, + "step": 185690 + }, + { + "epoch": 26.359119943222144, + "grad_norm": 0.47837790846824646, + "learning_rate": 7.365180979418027e-05, + "loss": 0.0076306134462356566, + "step": 185700 + }, + { + "epoch": 26.360539389638042, + "grad_norm": 0.06671160459518433, + "learning_rate": 7.365039034776438e-05, + "loss": 0.011296860873699188, + "step": 185710 + }, + { + "epoch": 26.36195883605394, + "grad_norm": 0.07664469629526138, + "learning_rate": 7.364897090134848e-05, + "loss": 0.09311719536781311, + "step": 185720 + }, + { + "epoch": 26.363378282469835, + "grad_norm": 0.8621576428413391, + "learning_rate": 7.364755145493259e-05, + "loss": 0.005290477722883225, + "step": 185730 + }, + { + "epoch": 26.364797728885733, + "grad_norm": 1.3778166770935059, + "learning_rate": 7.364613200851667e-05, + "loss": 0.02523401379585266, + "step": 185740 + }, + { + "epoch": 26.36621717530163, + "grad_norm": 0.01620257832109928, + "learning_rate": 7.364471256210078e-05, + "loss": 0.009639772772789, + "step": 185750 + }, + { + "epoch": 26.36763662171753, + "grad_norm": 0.21509350836277008, + "learning_rate": 7.364329311568488e-05, + "loss": 0.029006454348564147, + "step": 185760 + }, + { + "epoch": 26.36905606813343, + "grad_norm": 0.9007256031036377, + "learning_rate": 7.364187366926899e-05, + "loss": 0.005473754927515983, + "step": 185770 + }, + { + "epoch": 26.370475514549327, + "grad_norm": 2.886791467666626, + "learning_rate": 7.364045422285309e-05, + "loss": 0.013672125339508057, + "step": 185780 + }, + { + "epoch": 26.371894960965225, + "grad_norm": 1.035396933555603, + "learning_rate": 7.363903477643719e-05, + "loss": 0.0018527869135141373, + "step": 185790 + }, + { + "epoch": 
26.37331440738112, + "grad_norm": 1.6224644184112549, + "learning_rate": 7.36376153300213e-05, + "loss": 0.07783627510070801, + "step": 185800 + }, + { + "epoch": 26.374733853797018, + "grad_norm": 0.7493976354598999, + "learning_rate": 7.36361958836054e-05, + "loss": 0.0027294475585222244, + "step": 185810 + }, + { + "epoch": 26.376153300212916, + "grad_norm": 7.783509731292725, + "learning_rate": 7.36347764371895e-05, + "loss": 0.02635921835899353, + "step": 185820 + }, + { + "epoch": 26.377572746628815, + "grad_norm": 4.019241809844971, + "learning_rate": 7.36333569907736e-05, + "loss": 0.0024406518787145615, + "step": 185830 + }, + { + "epoch": 26.378992193044713, + "grad_norm": 0.21529285609722137, + "learning_rate": 7.36319375443577e-05, + "loss": 0.03174015581607818, + "step": 185840 + }, + { + "epoch": 26.38041163946061, + "grad_norm": 1.530470609664917, + "learning_rate": 7.36305180979418e-05, + "loss": 0.020180048048496248, + "step": 185850 + }, + { + "epoch": 26.38183108587651, + "grad_norm": 0.07037585973739624, + "learning_rate": 7.362909865152591e-05, + "loss": 0.017058944702148436, + "step": 185860 + }, + { + "epoch": 26.383250532292404, + "grad_norm": 0.1646389663219452, + "learning_rate": 7.362767920511e-05, + "loss": 0.03936585783958435, + "step": 185870 + }, + { + "epoch": 26.384669978708303, + "grad_norm": 11.275287628173828, + "learning_rate": 7.362625975869412e-05, + "loss": 0.030217647552490234, + "step": 185880 + }, + { + "epoch": 26.3860894251242, + "grad_norm": 0.16208511590957642, + "learning_rate": 7.362484031227821e-05, + "loss": 0.009040801227092743, + "step": 185890 + }, + { + "epoch": 26.3875088715401, + "grad_norm": 0.04412378370761871, + "learning_rate": 7.362342086586231e-05, + "loss": 0.006353928148746491, + "step": 185900 + }, + { + "epoch": 26.388928317955997, + "grad_norm": 0.11191380023956299, + "learning_rate": 7.362200141944642e-05, + "loss": 0.02469276487827301, + "step": 185910 + }, + { + "epoch": 26.390347764371896, + 
"grad_norm": 0.050530992448329926, + "learning_rate": 7.362058197303052e-05, + "loss": 0.007769478857517243, + "step": 185920 + }, + { + "epoch": 26.391767210787794, + "grad_norm": 0.0131605826318264, + "learning_rate": 7.361916252661463e-05, + "loss": 0.006165988370776176, + "step": 185930 + }, + { + "epoch": 26.39318665720369, + "grad_norm": 5.190620422363281, + "learning_rate": 7.361774308019873e-05, + "loss": 0.06639398336410522, + "step": 185940 + }, + { + "epoch": 26.394606103619587, + "grad_norm": 4.076643466949463, + "learning_rate": 7.361632363378283e-05, + "loss": 0.018614162504673005, + "step": 185950 + }, + { + "epoch": 26.396025550035485, + "grad_norm": 0.6777787208557129, + "learning_rate": 7.361490418736692e-05, + "loss": 0.0035717353224754334, + "step": 185960 + }, + { + "epoch": 26.397444996451384, + "grad_norm": 3.5399787425994873, + "learning_rate": 7.361348474095103e-05, + "loss": 0.009322479367256165, + "step": 185970 + }, + { + "epoch": 26.398864442867282, + "grad_norm": 0.04201832786202431, + "learning_rate": 7.361206529453513e-05, + "loss": 0.001983555406332016, + "step": 185980 + }, + { + "epoch": 26.40028388928318, + "grad_norm": 1.5129941701889038, + "learning_rate": 7.361064584811924e-05, + "loss": 0.010160159319639206, + "step": 185990 + }, + { + "epoch": 26.40170333569908, + "grad_norm": 0.05333923548460007, + "learning_rate": 7.360922640170334e-05, + "loss": 0.015899233520030975, + "step": 186000 + }, + { + "epoch": 26.40170333569908, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.06392236053943634, + "eval_runtime": 30.8433, + "eval_samples_per_second": 509.9, + "eval_steps_per_second": 15.952, + "step": 186000 + }, + { + "epoch": 26.403122782114973, + "grad_norm": 0.03333721309900284, + "learning_rate": 7.360780695528744e-05, + "loss": 0.013498367369174957, + "step": 186010 + }, + { + "epoch": 26.40454222853087, + "grad_norm": 13.64572811126709, + "learning_rate": 7.360638750887155e-05, + "loss": 0.06043493151664734, + 
"step": 186020 + }, + { + "epoch": 26.40596167494677, + "grad_norm": 0.015192785300314426, + "learning_rate": 7.360496806245565e-05, + "loss": 0.0022812213748693464, + "step": 186030 + }, + { + "epoch": 26.40738112136267, + "grad_norm": 0.15450873970985413, + "learning_rate": 7.360354861603976e-05, + "loss": 0.0027406126260757445, + "step": 186040 + }, + { + "epoch": 26.408800567778567, + "grad_norm": 0.049003925174474716, + "learning_rate": 7.360212916962384e-05, + "loss": 0.002086610719561577, + "step": 186050 + }, + { + "epoch": 26.410220014194465, + "grad_norm": 1.4188547134399414, + "learning_rate": 7.360070972320795e-05, + "loss": 0.005754034221172333, + "step": 186060 + }, + { + "epoch": 26.411639460610363, + "grad_norm": 0.22541512548923492, + "learning_rate": 7.359929027679205e-05, + "loss": 0.018232859671115875, + "step": 186070 + }, + { + "epoch": 26.413058907026258, + "grad_norm": 0.03352675586938858, + "learning_rate": 7.359787083037616e-05, + "loss": 0.04069254994392395, + "step": 186080 + }, + { + "epoch": 26.414478353442156, + "grad_norm": 18.055036544799805, + "learning_rate": 7.359645138396026e-05, + "loss": 0.014264219999313354, + "step": 186090 + }, + { + "epoch": 26.415897799858055, + "grad_norm": 0.01466344390064478, + "learning_rate": 7.359503193754435e-05, + "loss": 0.013482490181922912, + "step": 186100 + }, + { + "epoch": 26.417317246273953, + "grad_norm": 6.20540189743042, + "learning_rate": 7.359361249112847e-05, + "loss": 0.01686638593673706, + "step": 186110 + }, + { + "epoch": 26.41873669268985, + "grad_norm": 0.033270079642534256, + "learning_rate": 7.359219304471256e-05, + "loss": 0.022359293699264527, + "step": 186120 + }, + { + "epoch": 26.42015613910575, + "grad_norm": 0.18060441315174103, + "learning_rate": 7.359077359829667e-05, + "loss": 0.03604276776313782, + "step": 186130 + }, + { + "epoch": 26.421575585521648, + "grad_norm": 2.2689197063446045, + "learning_rate": 7.358935415188077e-05, + "loss": 0.01664589047431946, + 
"step": 186140 + }, + { + "epoch": 26.422995031937543, + "grad_norm": 1.0888596773147583, + "learning_rate": 7.358793470546487e-05, + "loss": 0.014701120555400848, + "step": 186150 + }, + { + "epoch": 26.42441447835344, + "grad_norm": 0.15951749682426453, + "learning_rate": 7.358651525904897e-05, + "loss": 0.01974617540836334, + "step": 186160 + }, + { + "epoch": 26.42583392476934, + "grad_norm": 0.020309895277023315, + "learning_rate": 7.358509581263308e-05, + "loss": 0.021918630599975585, + "step": 186170 + }, + { + "epoch": 26.427253371185238, + "grad_norm": 0.023755228146910667, + "learning_rate": 7.358367636621717e-05, + "loss": 0.0018196739256381989, + "step": 186180 + }, + { + "epoch": 26.428672817601136, + "grad_norm": 12.213748931884766, + "learning_rate": 7.358225691980129e-05, + "loss": 0.017186929285526276, + "step": 186190 + }, + { + "epoch": 26.430092264017034, + "grad_norm": 0.4376166760921478, + "learning_rate": 7.358083747338538e-05, + "loss": 0.0010480977594852447, + "step": 186200 + }, + { + "epoch": 26.431511710432932, + "grad_norm": 1.404850721359253, + "learning_rate": 7.357941802696948e-05, + "loss": 0.001607433706521988, + "step": 186210 + }, + { + "epoch": 26.432931156848827, + "grad_norm": 0.09868898242712021, + "learning_rate": 7.357799858055359e-05, + "loss": 0.010129539668560028, + "step": 186220 + }, + { + "epoch": 26.434350603264726, + "grad_norm": 0.9866778254508972, + "learning_rate": 7.357657913413769e-05, + "loss": 0.008717909455299377, + "step": 186230 + }, + { + "epoch": 26.435770049680624, + "grad_norm": 10.56622314453125, + "learning_rate": 7.35751596877218e-05, + "loss": 0.0457323431968689, + "step": 186240 + }, + { + "epoch": 26.437189496096522, + "grad_norm": 0.18385837972164154, + "learning_rate": 7.35737402413059e-05, + "loss": 0.0038725633174180986, + "step": 186250 + }, + { + "epoch": 26.43860894251242, + "grad_norm": 5.551150798797607, + "learning_rate": 7.357232079489e-05, + "loss": 0.02162366211414337, + "step": 
186260 + }, + { + "epoch": 26.44002838892832, + "grad_norm": 0.10577097535133362, + "learning_rate": 7.357090134847409e-05, + "loss": 0.04189826846122742, + "step": 186270 + }, + { + "epoch": 26.441447835344217, + "grad_norm": 0.11372650414705276, + "learning_rate": 7.35694819020582e-05, + "loss": 0.02750731408596039, + "step": 186280 + }, + { + "epoch": 26.442867281760112, + "grad_norm": 1.8986040353775024, + "learning_rate": 7.35680624556423e-05, + "loss": 0.0342750608921051, + "step": 186290 + }, + { + "epoch": 26.44428672817601, + "grad_norm": 26.244585037231445, + "learning_rate": 7.356664300922641e-05, + "loss": 0.02013804018497467, + "step": 186300 + }, + { + "epoch": 26.44570617459191, + "grad_norm": 0.023707306012511253, + "learning_rate": 7.356522356281051e-05, + "loss": 0.03878544569015503, + "step": 186310 + }, + { + "epoch": 26.447125621007807, + "grad_norm": 1.9518895149230957, + "learning_rate": 7.35638041163946e-05, + "loss": 0.005630603060126305, + "step": 186320 + }, + { + "epoch": 26.448545067423705, + "grad_norm": 0.032214537262916565, + "learning_rate": 7.356238466997872e-05, + "loss": 0.010818743705749511, + "step": 186330 + }, + { + "epoch": 26.449964513839603, + "grad_norm": 14.885580062866211, + "learning_rate": 7.356096522356281e-05, + "loss": 0.044491696357727054, + "step": 186340 + }, + { + "epoch": 26.4513839602555, + "grad_norm": 0.3704417645931244, + "learning_rate": 7.355954577714692e-05, + "loss": 0.0018510442227125169, + "step": 186350 + }, + { + "epoch": 26.4528034066714, + "grad_norm": 0.03359532356262207, + "learning_rate": 7.355812633073101e-05, + "loss": 0.008859743922948837, + "step": 186360 + }, + { + "epoch": 26.454222853087295, + "grad_norm": 4.59602165222168, + "learning_rate": 7.355670688431512e-05, + "loss": 0.014044463634490967, + "step": 186370 + }, + { + "epoch": 26.455642299503193, + "grad_norm": 0.8525452017784119, + "learning_rate": 7.355528743789922e-05, + "loss": 0.014632968604564667, + "step": 186380 + }, + { + 
"epoch": 26.45706174591909, + "grad_norm": 8.394326210021973, + "learning_rate": 7.355386799148333e-05, + "loss": 0.004787336662411689, + "step": 186390 + }, + { + "epoch": 26.45848119233499, + "grad_norm": 8.700714111328125, + "learning_rate": 7.355244854506744e-05, + "loss": 0.02960101664066315, + "step": 186400 + }, + { + "epoch": 26.459900638750888, + "grad_norm": 0.18395671248435974, + "learning_rate": 7.355102909865152e-05, + "loss": 0.011622699350118637, + "step": 186410 + }, + { + "epoch": 26.461320085166786, + "grad_norm": 0.5135088562965393, + "learning_rate": 7.354960965223563e-05, + "loss": 0.05078744888305664, + "step": 186420 + }, + { + "epoch": 26.462739531582685, + "grad_norm": 13.113768577575684, + "learning_rate": 7.354819020581973e-05, + "loss": 0.02945522665977478, + "step": 186430 + }, + { + "epoch": 26.46415897799858, + "grad_norm": 0.004286912269890308, + "learning_rate": 7.354677075940384e-05, + "loss": 0.003958478942513466, + "step": 186440 + }, + { + "epoch": 26.465578424414478, + "grad_norm": 0.01834517903625965, + "learning_rate": 7.354535131298794e-05, + "loss": 0.012998630106449128, + "step": 186450 + }, + { + "epoch": 26.466997870830376, + "grad_norm": 0.04004067927598953, + "learning_rate": 7.354393186657204e-05, + "loss": 0.008030912280082703, + "step": 186460 + }, + { + "epoch": 26.468417317246274, + "grad_norm": 0.03632790222764015, + "learning_rate": 7.354251242015613e-05, + "loss": 0.022511669993400575, + "step": 186470 + }, + { + "epoch": 26.469836763662173, + "grad_norm": 0.025592179968953133, + "learning_rate": 7.354109297374024e-05, + "loss": 0.0005944069474935531, + "step": 186480 + }, + { + "epoch": 26.47125621007807, + "grad_norm": 0.7582745552062988, + "learning_rate": 7.353967352732436e-05, + "loss": 0.0031089331954717637, + "step": 186490 + }, + { + "epoch": 26.47267565649397, + "grad_norm": 0.1979917287826538, + "learning_rate": 7.353825408090845e-05, + "loss": 0.0028180520981550216, + "step": 186500 + }, + { + 
"epoch": 26.47267565649397, + "eval_accuracy": 0.9900171679277675, + "eval_loss": 0.03502142056822777, + "eval_runtime": 31.1224, + "eval_samples_per_second": 505.328, + "eval_steps_per_second": 15.809, + "step": 186500 + }, + { + "epoch": 26.474095102909864, + "grad_norm": 0.21764467656612396, + "learning_rate": 7.353683463449255e-05, + "loss": 0.0036386813968420028, + "step": 186510 + }, + { + "epoch": 26.475514549325762, + "grad_norm": 0.017733043059706688, + "learning_rate": 7.353541518807665e-05, + "loss": 0.013045087456703186, + "step": 186520 + }, + { + "epoch": 26.47693399574166, + "grad_norm": 1.8834627866744995, + "learning_rate": 7.353399574166076e-05, + "loss": 0.0377467155456543, + "step": 186530 + }, + { + "epoch": 26.47835344215756, + "grad_norm": 0.9231944680213928, + "learning_rate": 7.353257629524486e-05, + "loss": 0.008476538211107254, + "step": 186540 + }, + { + "epoch": 26.479772888573457, + "grad_norm": 0.23009705543518066, + "learning_rate": 7.353115684882897e-05, + "loss": 0.02452366501092911, + "step": 186550 + }, + { + "epoch": 26.481192334989355, + "grad_norm": 2.013540267944336, + "learning_rate": 7.352973740241305e-05, + "loss": 0.0026718713343143463, + "step": 186560 + }, + { + "epoch": 26.482611781405254, + "grad_norm": 16.619680404663086, + "learning_rate": 7.352831795599716e-05, + "loss": 0.030244520306587218, + "step": 186570 + }, + { + "epoch": 26.48403122782115, + "grad_norm": 0.13640497624874115, + "learning_rate": 7.352689850958127e-05, + "loss": 0.023229685425758363, + "step": 186580 + }, + { + "epoch": 26.485450674237047, + "grad_norm": 0.02870710752904415, + "learning_rate": 7.352547906316537e-05, + "loss": 0.005366319790482521, + "step": 186590 + }, + { + "epoch": 26.486870120652945, + "grad_norm": 6.049400806427002, + "learning_rate": 7.352405961674948e-05, + "loss": 0.039882874488830565, + "step": 186600 + }, + { + "epoch": 26.488289567068843, + "grad_norm": 0.10954950749874115, + "learning_rate": 7.352264017033358e-05, + 
"loss": 0.003020814806222916, + "step": 186610 + }, + { + "epoch": 26.48970901348474, + "grad_norm": 0.09195968508720398, + "learning_rate": 7.352122072391768e-05, + "loss": 0.03874207437038422, + "step": 186620 + }, + { + "epoch": 26.49112845990064, + "grad_norm": 3.8620786666870117, + "learning_rate": 7.351980127750177e-05, + "loss": 0.010641635954380035, + "step": 186630 + }, + { + "epoch": 26.49254790631654, + "grad_norm": 0.6264975070953369, + "learning_rate": 7.351838183108588e-05, + "loss": 0.009137057512998582, + "step": 186640 + }, + { + "epoch": 26.493967352732433, + "grad_norm": 0.22944748401641846, + "learning_rate": 7.351696238466998e-05, + "loss": 0.02089664340019226, + "step": 186650 + }, + { + "epoch": 26.49538679914833, + "grad_norm": 2.5765247344970703, + "learning_rate": 7.351554293825409e-05, + "loss": 0.039310950040817264, + "step": 186660 + }, + { + "epoch": 26.49680624556423, + "grad_norm": 0.14304354786872864, + "learning_rate": 7.351412349183819e-05, + "loss": 0.008943051099777222, + "step": 186670 + }, + { + "epoch": 26.498225691980128, + "grad_norm": 1.8229115009307861, + "learning_rate": 7.351270404542229e-05, + "loss": 0.007130999863147735, + "step": 186680 + }, + { + "epoch": 26.499645138396026, + "grad_norm": 7.356447219848633, + "learning_rate": 7.35112845990064e-05, + "loss": 0.01725427955389023, + "step": 186690 + }, + { + "epoch": 26.501064584811925, + "grad_norm": 0.20110031962394714, + "learning_rate": 7.35098651525905e-05, + "loss": 0.002873242273926735, + "step": 186700 + }, + { + "epoch": 26.502484031227823, + "grad_norm": 0.005063197109848261, + "learning_rate": 7.35084457061746e-05, + "loss": 0.013153080642223359, + "step": 186710 + }, + { + "epoch": 26.503903477643718, + "grad_norm": 0.014375776052474976, + "learning_rate": 7.350702625975869e-05, + "loss": 0.03469110429286957, + "step": 186720 + }, + { + "epoch": 26.505322924059616, + "grad_norm": 0.3990916311740875, + "learning_rate": 7.35056068133428e-05, + "loss": 
0.0028648562729358673, + "step": 186730 + }, + { + "epoch": 26.506742370475514, + "grad_norm": 0.03297169506549835, + "learning_rate": 7.35041873669269e-05, + "loss": 0.003746340423822403, + "step": 186740 + }, + { + "epoch": 26.508161816891413, + "grad_norm": 8.736231803894043, + "learning_rate": 7.350276792051101e-05, + "loss": 0.061881381273269656, + "step": 186750 + }, + { + "epoch": 26.50958126330731, + "grad_norm": 0.1806955188512802, + "learning_rate": 7.35013484740951e-05, + "loss": 0.0010636717081069947, + "step": 186760 + }, + { + "epoch": 26.51100070972321, + "grad_norm": 1.0942550897598267, + "learning_rate": 7.34999290276792e-05, + "loss": 0.02549983263015747, + "step": 186770 + }, + { + "epoch": 26.512420156139108, + "grad_norm": 1.3371232748031616, + "learning_rate": 7.349850958126331e-05, + "loss": 0.016201870143413545, + "step": 186780 + }, + { + "epoch": 26.513839602555002, + "grad_norm": 0.13042639195919037, + "learning_rate": 7.349709013484741e-05, + "loss": 0.011256366968154907, + "step": 186790 + }, + { + "epoch": 26.5152590489709, + "grad_norm": 0.06293094158172607, + "learning_rate": 7.349567068843152e-05, + "loss": 0.0035890169441699983, + "step": 186800 + }, + { + "epoch": 26.5166784953868, + "grad_norm": 0.06546209007501602, + "learning_rate": 7.349425124201562e-05, + "loss": 0.014071501791477203, + "step": 186810 + }, + { + "epoch": 26.518097941802697, + "grad_norm": 0.04115975648164749, + "learning_rate": 7.349283179559972e-05, + "loss": 0.009070324897766113, + "step": 186820 + }, + { + "epoch": 26.519517388218595, + "grad_norm": 0.0437438040971756, + "learning_rate": 7.349141234918382e-05, + "loss": 0.03559253513813019, + "step": 186830 + }, + { + "epoch": 26.520936834634494, + "grad_norm": 16.265518188476562, + "learning_rate": 7.348999290276793e-05, + "loss": 0.0366477370262146, + "step": 186840 + }, + { + "epoch": 26.522356281050392, + "grad_norm": 0.21089045703411102, + "learning_rate": 7.348857345635202e-05, + "loss": 
0.013750053942203522, + "step": 186850 + }, + { + "epoch": 26.523775727466287, + "grad_norm": 6.262032985687256, + "learning_rate": 7.348715400993613e-05, + "loss": 0.03262893557548523, + "step": 186860 + }, + { + "epoch": 26.525195173882185, + "grad_norm": 0.07935374975204468, + "learning_rate": 7.348573456352023e-05, + "loss": 0.003208857774734497, + "step": 186870 + }, + { + "epoch": 26.526614620298083, + "grad_norm": 0.2105017453432083, + "learning_rate": 7.348431511710433e-05, + "loss": 0.013661330938339234, + "step": 186880 + }, + { + "epoch": 26.528034066713982, + "grad_norm": 26.090145111083984, + "learning_rate": 7.348289567068844e-05, + "loss": 0.034649378061294554, + "step": 186890 + }, + { + "epoch": 26.52945351312988, + "grad_norm": 12.466241836547852, + "learning_rate": 7.348147622427254e-05, + "loss": 0.038115900754928586, + "step": 186900 + }, + { + "epoch": 26.53087295954578, + "grad_norm": 0.053186990320682526, + "learning_rate": 7.348005677785665e-05, + "loss": 0.025875136256217957, + "step": 186910 + }, + { + "epoch": 26.532292405961677, + "grad_norm": 0.32158806920051575, + "learning_rate": 7.347863733144073e-05, + "loss": 0.0281094491481781, + "step": 186920 + }, + { + "epoch": 26.53371185237757, + "grad_norm": 22.976572036743164, + "learning_rate": 7.347721788502484e-05, + "loss": 0.044937634468078615, + "step": 186930 + }, + { + "epoch": 26.53513129879347, + "grad_norm": 0.1289796233177185, + "learning_rate": 7.347579843860894e-05, + "loss": 0.04056633114814758, + "step": 186940 + }, + { + "epoch": 26.536550745209368, + "grad_norm": 20.660526275634766, + "learning_rate": 7.347437899219305e-05, + "loss": 0.06407415866851807, + "step": 186950 + }, + { + "epoch": 26.537970191625266, + "grad_norm": 0.5497305989265442, + "learning_rate": 7.347295954577715e-05, + "loss": 0.07114406228065491, + "step": 186960 + }, + { + "epoch": 26.539389638041165, + "grad_norm": 0.4716319441795349, + "learning_rate": 7.347154009936126e-05, + "loss": 
0.015671339631080628, + "step": 186970 + }, + { + "epoch": 26.540809084457063, + "grad_norm": 0.23035556077957153, + "learning_rate": 7.347012065294536e-05, + "loss": 0.008833847939968109, + "step": 186980 + }, + { + "epoch": 26.54222853087296, + "grad_norm": 4.435577392578125, + "learning_rate": 7.346870120652945e-05, + "loss": 0.0514735996723175, + "step": 186990 + }, + { + "epoch": 26.543647977288856, + "grad_norm": 13.916330337524414, + "learning_rate": 7.346728176011357e-05, + "loss": 0.03656842112541199, + "step": 187000 + }, + { + "epoch": 26.543647977288856, + "eval_accuracy": 0.9771094296432886, + "eval_loss": 0.09349644929170609, + "eval_runtime": 31.5501, + "eval_samples_per_second": 498.476, + "eval_steps_per_second": 15.594, + "step": 187000 + }, + { + "epoch": 26.545067423704754, + "grad_norm": 15.157449722290039, + "learning_rate": 7.346586231369766e-05, + "loss": 0.040513014793395995, + "step": 187010 + }, + { + "epoch": 26.546486870120653, + "grad_norm": 0.385690301656723, + "learning_rate": 7.346444286728177e-05, + "loss": 0.01646818071603775, + "step": 187020 + }, + { + "epoch": 26.54790631653655, + "grad_norm": 0.18771736323833466, + "learning_rate": 7.346302342086586e-05, + "loss": 0.022457094490528108, + "step": 187030 + }, + { + "epoch": 26.54932576295245, + "grad_norm": 0.05263993889093399, + "learning_rate": 7.346174591909156e-05, + "loss": 0.014117535948753358, + "step": 187040 + }, + { + "epoch": 26.550745209368348, + "grad_norm": 0.025993475690484047, + "learning_rate": 7.346032647267565e-05, + "loss": 0.058175408840179445, + "step": 187050 + }, + { + "epoch": 26.552164655784246, + "grad_norm": 0.01962387189269066, + "learning_rate": 7.345890702625976e-05, + "loss": 0.02167389541864395, + "step": 187060 + }, + { + "epoch": 26.55358410220014, + "grad_norm": 0.015427844598889351, + "learning_rate": 7.345748757984386e-05, + "loss": 0.019847363233566284, + "step": 187070 + }, + { + "epoch": 26.55500354861604, + "grad_norm": 
0.0405634269118309, + "learning_rate": 7.345606813342797e-05, + "loss": 0.002233690395951271, + "step": 187080 + }, + { + "epoch": 26.556422995031937, + "grad_norm": 0.17142683267593384, + "learning_rate": 7.345464868701207e-05, + "loss": 0.010604397207498551, + "step": 187090 + }, + { + "epoch": 26.557842441447836, + "grad_norm": 0.04387868195772171, + "learning_rate": 7.345322924059617e-05, + "loss": 0.013574697077274323, + "step": 187100 + }, + { + "epoch": 26.559261887863734, + "grad_norm": 9.085929870605469, + "learning_rate": 7.345180979418026e-05, + "loss": 0.01048770770430565, + "step": 187110 + }, + { + "epoch": 26.560681334279632, + "grad_norm": 0.04001900553703308, + "learning_rate": 7.345039034776438e-05, + "loss": 0.01538444459438324, + "step": 187120 + }, + { + "epoch": 26.56210078069553, + "grad_norm": 0.08003544807434082, + "learning_rate": 7.344897090134847e-05, + "loss": 0.03279804587364197, + "step": 187130 + }, + { + "epoch": 26.563520227111425, + "grad_norm": 0.7920377254486084, + "learning_rate": 7.344755145493258e-05, + "loss": 0.0033142808824777603, + "step": 187140 + }, + { + "epoch": 26.564939673527324, + "grad_norm": 0.027896564453840256, + "learning_rate": 7.344613200851668e-05, + "loss": 0.00381629653275013, + "step": 187150 + }, + { + "epoch": 26.566359119943222, + "grad_norm": 0.24487587809562683, + "learning_rate": 7.344471256210078e-05, + "loss": 0.012527695298194886, + "step": 187160 + }, + { + "epoch": 26.56777856635912, + "grad_norm": 4.837042808532715, + "learning_rate": 7.344329311568489e-05, + "loss": 0.022226181626319886, + "step": 187170 + }, + { + "epoch": 26.56919801277502, + "grad_norm": 0.02711654268205166, + "learning_rate": 7.344187366926899e-05, + "loss": 0.0029070228338241577, + "step": 187180 + }, + { + "epoch": 26.570617459190917, + "grad_norm": 2.9125823974609375, + "learning_rate": 7.34404542228531e-05, + "loss": 0.012016794085502625, + "step": 187190 + }, + { + "epoch": 26.572036905606815, + "grad_norm": 
0.11208943277597427, + "learning_rate": 7.343903477643718e-05, + "loss": 0.009389051795005798, + "step": 187200 + }, + { + "epoch": 26.57345635202271, + "grad_norm": 0.2107323408126831, + "learning_rate": 7.343761533002129e-05, + "loss": 0.015909388661384583, + "step": 187210 + }, + { + "epoch": 26.574875798438608, + "grad_norm": 6.6756744384765625, + "learning_rate": 7.343619588360539e-05, + "loss": 0.0070163488388061525, + "step": 187220 + }, + { + "epoch": 26.576295244854506, + "grad_norm": 0.10553637892007828, + "learning_rate": 7.34347764371895e-05, + "loss": 0.030267465114593505, + "step": 187230 + }, + { + "epoch": 26.577714691270405, + "grad_norm": 1.6098649501800537, + "learning_rate": 7.34333569907736e-05, + "loss": 0.015417490899562836, + "step": 187240 + }, + { + "epoch": 26.579134137686303, + "grad_norm": 0.26628410816192627, + "learning_rate": 7.343193754435771e-05, + "loss": 0.006569421291351319, + "step": 187250 + }, + { + "epoch": 26.5805535841022, + "grad_norm": 0.008119795471429825, + "learning_rate": 7.34305180979418e-05, + "loss": 0.006676267832517624, + "step": 187260 + }, + { + "epoch": 26.5819730305181, + "grad_norm": 0.9735649824142456, + "learning_rate": 7.34290986515259e-05, + "loss": 0.007107360661029816, + "step": 187270 + }, + { + "epoch": 26.583392476933994, + "grad_norm": 4.574164390563965, + "learning_rate": 7.342767920511002e-05, + "loss": 0.015383177995681762, + "step": 187280 + }, + { + "epoch": 26.584811923349893, + "grad_norm": 2.6749753952026367, + "learning_rate": 7.342625975869411e-05, + "loss": 0.0171529084444046, + "step": 187290 + }, + { + "epoch": 26.58623136976579, + "grad_norm": 2.536522626876831, + "learning_rate": 7.342484031227822e-05, + "loss": 0.05314785838127136, + "step": 187300 + }, + { + "epoch": 26.58765081618169, + "grad_norm": 18.922754287719727, + "learning_rate": 7.342342086586231e-05, + "loss": 0.01529897153377533, + "step": 187310 + }, + { + "epoch": 26.589070262597588, + "grad_norm": 
0.5468068718910217, + "learning_rate": 7.342200141944642e-05, + "loss": 0.04063740074634552, + "step": 187320 + }, + { + "epoch": 26.590489709013486, + "grad_norm": 0.48769041895866394, + "learning_rate": 7.342058197303052e-05, + "loss": 0.007721133530139923, + "step": 187330 + }, + { + "epoch": 26.591909155429384, + "grad_norm": 0.7960296273231506, + "learning_rate": 7.341916252661463e-05, + "loss": 0.04451020359992981, + "step": 187340 + }, + { + "epoch": 26.59332860184528, + "grad_norm": 0.04680338874459267, + "learning_rate": 7.341774308019874e-05, + "loss": 0.02864699363708496, + "step": 187350 + }, + { + "epoch": 26.594748048261177, + "grad_norm": 1.7442355155944824, + "learning_rate": 7.341632363378282e-05, + "loss": 0.023955085873603822, + "step": 187360 + }, + { + "epoch": 26.596167494677076, + "grad_norm": 3.4547338485717773, + "learning_rate": 7.341490418736693e-05, + "loss": 0.04419172704219818, + "step": 187370 + }, + { + "epoch": 26.597586941092974, + "grad_norm": 1.824131727218628, + "learning_rate": 7.341348474095103e-05, + "loss": 0.04681034982204437, + "step": 187380 + }, + { + "epoch": 26.599006387508872, + "grad_norm": 0.023186497390270233, + "learning_rate": 7.341206529453514e-05, + "loss": 0.01559489220380783, + "step": 187390 + }, + { + "epoch": 26.60042583392477, + "grad_norm": 0.4776712954044342, + "learning_rate": 7.341064584811924e-05, + "loss": 0.029521465301513672, + "step": 187400 + }, + { + "epoch": 26.60184528034067, + "grad_norm": 0.020351536571979523, + "learning_rate": 7.340922640170334e-05, + "loss": 0.00454833060503006, + "step": 187410 + }, + { + "epoch": 26.603264726756564, + "grad_norm": 3.729581117630005, + "learning_rate": 7.340780695528743e-05, + "loss": 0.006207090243697167, + "step": 187420 + }, + { + "epoch": 26.604684173172462, + "grad_norm": 8.471291542053223, + "learning_rate": 7.340638750887154e-05, + "loss": 0.006292803585529328, + "step": 187430 + }, + { + "epoch": 26.60610361958836, + "grad_norm": 
0.5368989706039429, + "learning_rate": 7.340496806245565e-05, + "loss": 0.007100984454154968, + "step": 187440 + }, + { + "epoch": 26.60752306600426, + "grad_norm": 0.026923906058073044, + "learning_rate": 7.340354861603975e-05, + "loss": 0.01119643747806549, + "step": 187450 + }, + { + "epoch": 26.608942512420157, + "grad_norm": 0.17730426788330078, + "learning_rate": 7.340212916962385e-05, + "loss": 0.005740134790539741, + "step": 187460 + }, + { + "epoch": 26.610361958836055, + "grad_norm": 2.410504102706909, + "learning_rate": 7.340070972320795e-05, + "loss": 0.03318266868591309, + "step": 187470 + }, + { + "epoch": 26.611781405251953, + "grad_norm": 0.13485155999660492, + "learning_rate": 7.339929027679206e-05, + "loss": 0.01533338725566864, + "step": 187480 + }, + { + "epoch": 26.613200851667848, + "grad_norm": 0.29260241985321045, + "learning_rate": 7.339787083037615e-05, + "loss": 0.0313270777463913, + "step": 187490 + }, + { + "epoch": 26.614620298083747, + "grad_norm": 0.2643528878688812, + "learning_rate": 7.339645138396027e-05, + "loss": 0.001842646673321724, + "step": 187500 + }, + { + "epoch": 26.614620298083747, + "eval_accuracy": 0.9895084885865073, + "eval_loss": 0.03887506201863289, + "eval_runtime": 34.2455, + "eval_samples_per_second": 459.242, + "eval_steps_per_second": 14.367, + "step": 187500 + }, + { + "epoch": 26.616039744499645, + "grad_norm": 0.032550230622291565, + "learning_rate": 7.339503193754435e-05, + "loss": 0.011184832453727723, + "step": 187510 + }, + { + "epoch": 26.617459190915543, + "grad_norm": 0.06969565153121948, + "learning_rate": 7.339361249112846e-05, + "loss": 0.017166340351104738, + "step": 187520 + }, + { + "epoch": 26.61887863733144, + "grad_norm": 0.1668543964624405, + "learning_rate": 7.339219304471257e-05, + "loss": 0.0015940971672534943, + "step": 187530 + }, + { + "epoch": 26.62029808374734, + "grad_norm": 0.062286317348480225, + "learning_rate": 7.339077359829667e-05, + "loss": 0.027431046962738036, + "step": 
187540 + }, + { + "epoch": 26.621717530163238, + "grad_norm": 0.052853234112262726, + "learning_rate": 7.338935415188078e-05, + "loss": 0.0033203311264514922, + "step": 187550 + }, + { + "epoch": 26.623136976579133, + "grad_norm": 0.028827067464590073, + "learning_rate": 7.338793470546486e-05, + "loss": 0.008676151931285857, + "step": 187560 + }, + { + "epoch": 26.62455642299503, + "grad_norm": 0.8969391584396362, + "learning_rate": 7.338651525904897e-05, + "loss": 0.031139302253723144, + "step": 187570 + }, + { + "epoch": 26.62597586941093, + "grad_norm": 0.013324350118637085, + "learning_rate": 7.338509581263307e-05, + "loss": 0.0030543293803930284, + "step": 187580 + }, + { + "epoch": 26.627395315826828, + "grad_norm": 0.13581666350364685, + "learning_rate": 7.338367636621718e-05, + "loss": 0.008576932549476623, + "step": 187590 + }, + { + "epoch": 26.628814762242726, + "grad_norm": 2.1571667194366455, + "learning_rate": 7.338225691980128e-05, + "loss": 0.007740342617034912, + "step": 187600 + }, + { + "epoch": 26.630234208658624, + "grad_norm": 6.407258033752441, + "learning_rate": 7.338083747338539e-05, + "loss": 0.006663599610328674, + "step": 187610 + }, + { + "epoch": 26.631653655074523, + "grad_norm": 1.7370452880859375, + "learning_rate": 7.337941802696949e-05, + "loss": 0.022349701821804048, + "step": 187620 + }, + { + "epoch": 26.633073101490417, + "grad_norm": 0.10487902164459229, + "learning_rate": 7.337799858055359e-05, + "loss": 0.02918761074542999, + "step": 187630 + }, + { + "epoch": 26.634492547906316, + "grad_norm": 0.12429778277873993, + "learning_rate": 7.33765791341377e-05, + "loss": 0.07908987998962402, + "step": 187640 + }, + { + "epoch": 26.635911994322214, + "grad_norm": 1.8821569681167603, + "learning_rate": 7.33751596877218e-05, + "loss": 0.0704179584980011, + "step": 187650 + }, + { + "epoch": 26.637331440738112, + "grad_norm": 12.217181205749512, + "learning_rate": 7.33737402413059e-05, + "loss": 0.017322908341884612, + "step": 187660 
+ }, + { + "epoch": 26.63875088715401, + "grad_norm": 0.004214731976389885, + "learning_rate": 7.337232079488999e-05, + "loss": 0.003533269092440605, + "step": 187670 + }, + { + "epoch": 26.64017033356991, + "grad_norm": 0.2746047377586365, + "learning_rate": 7.33709013484741e-05, + "loss": 0.019543492794036867, + "step": 187680 + }, + { + "epoch": 26.641589779985807, + "grad_norm": 1.233067274093628, + "learning_rate": 7.33694819020582e-05, + "loss": 0.008617883920669556, + "step": 187690 + }, + { + "epoch": 26.643009226401702, + "grad_norm": 0.11047253012657166, + "learning_rate": 7.336806245564231e-05, + "loss": 0.029004138708114625, + "step": 187700 + }, + { + "epoch": 26.6444286728176, + "grad_norm": 4.785527229309082, + "learning_rate": 7.33666430092264e-05, + "loss": 0.006606794893741608, + "step": 187710 + }, + { + "epoch": 26.6458481192335, + "grad_norm": 0.050840284675359726, + "learning_rate": 7.33652235628105e-05, + "loss": 0.008026013523340226, + "step": 187720 + }, + { + "epoch": 26.647267565649397, + "grad_norm": 0.7833130955696106, + "learning_rate": 7.336380411639461e-05, + "loss": 0.03502286076545715, + "step": 187730 + }, + { + "epoch": 26.648687012065295, + "grad_norm": 0.06441321223974228, + "learning_rate": 7.336238466997871e-05, + "loss": 0.0016042642295360566, + "step": 187740 + }, + { + "epoch": 26.650106458481194, + "grad_norm": 4.011074542999268, + "learning_rate": 7.336096522356282e-05, + "loss": 0.012786972522735595, + "step": 187750 + }, + { + "epoch": 26.651525904897092, + "grad_norm": 0.05059684067964554, + "learning_rate": 7.335954577714692e-05, + "loss": 0.0027969956398010256, + "step": 187760 + }, + { + "epoch": 26.652945351312987, + "grad_norm": 0.049357082694768906, + "learning_rate": 7.335812633073102e-05, + "loss": 0.030567225813865662, + "step": 187770 + }, + { + "epoch": 26.654364797728885, + "grad_norm": 0.15770426392555237, + "learning_rate": 7.335670688431511e-05, + "loss": 0.018731872737407684, + "step": 187780 + }, + { 
+ "epoch": 26.655784244144783, + "grad_norm": 0.19285565614700317, + "learning_rate": 7.335528743789923e-05, + "loss": 0.034409791231155396, + "step": 187790 + }, + { + "epoch": 26.65720369056068, + "grad_norm": 0.011073091998696327, + "learning_rate": 7.335386799148332e-05, + "loss": 0.0029382631182670593, + "step": 187800 + }, + { + "epoch": 26.65862313697658, + "grad_norm": 7.783898830413818, + "learning_rate": 7.335244854506743e-05, + "loss": 0.0168064147233963, + "step": 187810 + }, + { + "epoch": 26.660042583392478, + "grad_norm": 0.11797091364860535, + "learning_rate": 7.335102909865153e-05, + "loss": 0.038950249552726746, + "step": 187820 + }, + { + "epoch": 26.661462029808376, + "grad_norm": 0.5215746760368347, + "learning_rate": 7.334960965223563e-05, + "loss": 0.005537724122405052, + "step": 187830 + }, + { + "epoch": 26.66288147622427, + "grad_norm": 0.00385418557561934, + "learning_rate": 7.334819020581974e-05, + "loss": 0.01319928765296936, + "step": 187840 + }, + { + "epoch": 26.66430092264017, + "grad_norm": 0.011149434372782707, + "learning_rate": 7.334677075940384e-05, + "loss": 0.002995927631855011, + "step": 187850 + }, + { + "epoch": 26.665720369056068, + "grad_norm": 0.06397092342376709, + "learning_rate": 7.334535131298795e-05, + "loss": 0.022271454334259033, + "step": 187860 + }, + { + "epoch": 26.667139815471966, + "grad_norm": 0.049292244017124176, + "learning_rate": 7.334393186657203e-05, + "loss": 0.0052106622606515884, + "step": 187870 + }, + { + "epoch": 26.668559261887864, + "grad_norm": 0.29146242141723633, + "learning_rate": 7.334251242015614e-05, + "loss": 0.008117727935314178, + "step": 187880 + }, + { + "epoch": 26.669978708303763, + "grad_norm": 1.3380223512649536, + "learning_rate": 7.334109297374024e-05, + "loss": 0.028761667013168336, + "step": 187890 + }, + { + "epoch": 26.67139815471966, + "grad_norm": 0.31745249032974243, + "learning_rate": 7.333967352732435e-05, + "loss": 0.020203942060470582, + "step": 187900 + }, + { + 
"epoch": 26.672817601135556, + "grad_norm": 0.9593662023544312, + "learning_rate": 7.333825408090845e-05, + "loss": 0.04420211613178253, + "step": 187910 + }, + { + "epoch": 26.674237047551454, + "grad_norm": 0.15034893155097961, + "learning_rate": 7.333683463449255e-05, + "loss": 0.005492578074336052, + "step": 187920 + }, + { + "epoch": 26.675656493967352, + "grad_norm": 0.03192856162786484, + "learning_rate": 7.333541518807666e-05, + "loss": 0.011295275390148162, + "step": 187930 + }, + { + "epoch": 26.67707594038325, + "grad_norm": 2.271683692932129, + "learning_rate": 7.333399574166075e-05, + "loss": 0.030432549118995667, + "step": 187940 + }, + { + "epoch": 26.67849538679915, + "grad_norm": 0.7835888266563416, + "learning_rate": 7.333257629524486e-05, + "loss": 0.003641408309340477, + "step": 187950 + }, + { + "epoch": 26.679914833215047, + "grad_norm": 0.2430177479982376, + "learning_rate": 7.333115684882896e-05, + "loss": 0.03166298568248749, + "step": 187960 + }, + { + "epoch": 26.681334279630946, + "grad_norm": 0.16370585560798645, + "learning_rate": 7.332973740241307e-05, + "loss": 0.022794282436370848, + "step": 187970 + }, + { + "epoch": 26.68275372604684, + "grad_norm": 0.059641409665346146, + "learning_rate": 7.332831795599716e-05, + "loss": 0.0018623284995555878, + "step": 187980 + }, + { + "epoch": 26.68417317246274, + "grad_norm": 0.016690537333488464, + "learning_rate": 7.332689850958127e-05, + "loss": 0.006790892779827118, + "step": 187990 + }, + { + "epoch": 26.685592618878637, + "grad_norm": 0.5746123790740967, + "learning_rate": 7.332547906316536e-05, + "loss": 0.006010774523019791, + "step": 188000 + }, + { + "epoch": 26.685592618878637, + "eval_accuracy": 0.9895720735041648, + "eval_loss": 0.0370098277926445, + "eval_runtime": 31.835, + "eval_samples_per_second": 494.016, + "eval_steps_per_second": 15.455, + "step": 188000 + }, + { + "epoch": 26.687012065294535, + "grad_norm": 0.09517305344343185, + "learning_rate": 7.332405961674948e-05, + 
"loss": 0.00669003427028656, + "step": 188010 + }, + { + "epoch": 26.688431511710434, + "grad_norm": 0.14018070697784424, + "learning_rate": 7.332264017033357e-05, + "loss": 0.006291373074054718, + "step": 188020 + }, + { + "epoch": 26.689850958126332, + "grad_norm": 12.756889343261719, + "learning_rate": 7.332122072391767e-05, + "loss": 0.04175263047218323, + "step": 188030 + }, + { + "epoch": 26.69127040454223, + "grad_norm": 0.2030816376209259, + "learning_rate": 7.331980127750178e-05, + "loss": 0.010281038284301759, + "step": 188040 + }, + { + "epoch": 26.692689850958125, + "grad_norm": 0.1799834817647934, + "learning_rate": 7.331838183108588e-05, + "loss": 0.020020276308059692, + "step": 188050 + }, + { + "epoch": 26.694109297374023, + "grad_norm": 0.001269051106646657, + "learning_rate": 7.331696238466999e-05, + "loss": 0.004287093877792358, + "step": 188060 + }, + { + "epoch": 26.69552874378992, + "grad_norm": 10.934380531311035, + "learning_rate": 7.331554293825409e-05, + "loss": 0.011470809578895569, + "step": 188070 + }, + { + "epoch": 26.69694819020582, + "grad_norm": 0.015177497640252113, + "learning_rate": 7.331412349183818e-05, + "loss": 0.010675089061260223, + "step": 188080 + }, + { + "epoch": 26.698367636621718, + "grad_norm": 0.02462729439139366, + "learning_rate": 7.331270404542228e-05, + "loss": 0.029491376876831055, + "step": 188090 + }, + { + "epoch": 26.699787083037616, + "grad_norm": 1.180158019065857, + "learning_rate": 7.331128459900639e-05, + "loss": 0.018494173884391785, + "step": 188100 + }, + { + "epoch": 26.701206529453515, + "grad_norm": 0.16965286433696747, + "learning_rate": 7.330986515259049e-05, + "loss": 0.011497084796428681, + "step": 188110 + }, + { + "epoch": 26.70262597586941, + "grad_norm": 0.038389384746551514, + "learning_rate": 7.33084457061746e-05, + "loss": 0.01169542819261551, + "step": 188120 + }, + { + "epoch": 26.704045422285308, + "grad_norm": 4.385623931884766, + "learning_rate": 7.33070262597587e-05, + "loss": 
0.019249793887138367, + "step": 188130 + }, + { + "epoch": 26.705464868701206, + "grad_norm": 0.029065119102597237, + "learning_rate": 7.33056068133428e-05, + "loss": 0.01381070762872696, + "step": 188140 + }, + { + "epoch": 26.706884315117104, + "grad_norm": 0.011003145948052406, + "learning_rate": 7.330418736692691e-05, + "loss": 0.0045494608581066135, + "step": 188150 + }, + { + "epoch": 26.708303761533003, + "grad_norm": 8.071669578552246, + "learning_rate": 7.3302767920511e-05, + "loss": 0.013244056701660156, + "step": 188160 + }, + { + "epoch": 26.7097232079489, + "grad_norm": 0.0890679880976677, + "learning_rate": 7.330134847409512e-05, + "loss": 0.018499897420406343, + "step": 188170 + }, + { + "epoch": 26.7111426543648, + "grad_norm": 1.4450228214263916, + "learning_rate": 7.32999290276792e-05, + "loss": 0.005136832967400551, + "step": 188180 + }, + { + "epoch": 26.712562100780694, + "grad_norm": 0.764386773109436, + "learning_rate": 7.329850958126331e-05, + "loss": 0.02540639042854309, + "step": 188190 + }, + { + "epoch": 26.713981547196592, + "grad_norm": 0.03412451595067978, + "learning_rate": 7.329709013484741e-05, + "loss": 0.011815245449542999, + "step": 188200 + }, + { + "epoch": 26.71540099361249, + "grad_norm": 0.5121376514434814, + "learning_rate": 7.329567068843152e-05, + "loss": 0.009456634521484375, + "step": 188210 + }, + { + "epoch": 26.71682044002839, + "grad_norm": 0.07311234623193741, + "learning_rate": 7.329425124201562e-05, + "loss": 0.0028085306286811827, + "step": 188220 + }, + { + "epoch": 26.718239886444287, + "grad_norm": 0.052992090582847595, + "learning_rate": 7.329283179559971e-05, + "loss": 0.005201851576566696, + "step": 188230 + }, + { + "epoch": 26.719659332860186, + "grad_norm": 24.78182601928711, + "learning_rate": 7.329141234918382e-05, + "loss": 0.02027573138475418, + "step": 188240 + }, + { + "epoch": 26.721078779276084, + "grad_norm": 1.1512938737869263, + "learning_rate": 7.328999290276792e-05, + "loss": 
0.029468512535095213, + "step": 188250 + }, + { + "epoch": 26.72249822569198, + "grad_norm": 0.021208547055721283, + "learning_rate": 7.328857345635203e-05, + "loss": 0.002678105980157852, + "step": 188260 + }, + { + "epoch": 26.723917672107877, + "grad_norm": 0.011695347726345062, + "learning_rate": 7.328715400993613e-05, + "loss": 0.009458798170089721, + "step": 188270 + }, + { + "epoch": 26.725337118523775, + "grad_norm": 0.060376882553100586, + "learning_rate": 7.328573456352023e-05, + "loss": 0.0018534407019615174, + "step": 188280 + }, + { + "epoch": 26.726756564939674, + "grad_norm": 0.17809784412384033, + "learning_rate": 7.328431511710432e-05, + "loss": 0.02051375061273575, + "step": 188290 + }, + { + "epoch": 26.728176011355572, + "grad_norm": 0.006092201918363571, + "learning_rate": 7.328289567068844e-05, + "loss": 0.016334617137908937, + "step": 188300 + }, + { + "epoch": 26.72959545777147, + "grad_norm": 0.010518181137740612, + "learning_rate": 7.328147622427253e-05, + "loss": 0.002416614070534706, + "step": 188310 + }, + { + "epoch": 26.73101490418737, + "grad_norm": 15.289582252502441, + "learning_rate": 7.328005677785664e-05, + "loss": 0.07445269823074341, + "step": 188320 + }, + { + "epoch": 26.732434350603263, + "grad_norm": 9.944464683532715, + "learning_rate": 7.327863733144074e-05, + "loss": 0.016535747051239013, + "step": 188330 + }, + { + "epoch": 26.73385379701916, + "grad_norm": 13.462506294250488, + "learning_rate": 7.327721788502484e-05, + "loss": 0.014541253447532654, + "step": 188340 + }, + { + "epoch": 26.73527324343506, + "grad_norm": 1.5416743755340576, + "learning_rate": 7.327579843860895e-05, + "loss": 0.01921941488981247, + "step": 188350 + }, + { + "epoch": 26.73669268985096, + "grad_norm": 0.0642755925655365, + "learning_rate": 7.327437899219305e-05, + "loss": 0.04003239274024963, + "step": 188360 + }, + { + "epoch": 26.738112136266857, + "grad_norm": 0.01986604556441307, + "learning_rate": 7.327295954577716e-05, + "loss": 
0.03404629826545715, + "step": 188370 + }, + { + "epoch": 26.739531582682755, + "grad_norm": 0.052182380110025406, + "learning_rate": 7.327154009936125e-05, + "loss": 0.001974167302250862, + "step": 188380 + }, + { + "epoch": 26.740951029098653, + "grad_norm": 13.864272117614746, + "learning_rate": 7.327012065294535e-05, + "loss": 0.007714889198541641, + "step": 188390 + }, + { + "epoch": 26.742370475514548, + "grad_norm": 4.329283714294434, + "learning_rate": 7.326870120652945e-05, + "loss": 0.01639443337917328, + "step": 188400 + }, + { + "epoch": 26.743789921930446, + "grad_norm": 9.661582946777344, + "learning_rate": 7.326728176011356e-05, + "loss": 0.012142999470233918, + "step": 188410 + }, + { + "epoch": 26.745209368346345, + "grad_norm": 0.2186303734779358, + "learning_rate": 7.326586231369766e-05, + "loss": 0.028564122319221497, + "step": 188420 + }, + { + "epoch": 26.746628814762243, + "grad_norm": 0.06014927104115486, + "learning_rate": 7.326444286728177e-05, + "loss": 0.048870065808296205, + "step": 188430 + }, + { + "epoch": 26.74804826117814, + "grad_norm": 4.261373996734619, + "learning_rate": 7.326302342086587e-05, + "loss": 0.023696142435073852, + "step": 188440 + }, + { + "epoch": 26.74946770759404, + "grad_norm": 7.83797550201416, + "learning_rate": 7.326160397444996e-05, + "loss": 0.02800889015197754, + "step": 188450 + }, + { + "epoch": 26.750887154009938, + "grad_norm": 5.565289497375488, + "learning_rate": 7.326018452803407e-05, + "loss": 0.030678954720497132, + "step": 188460 + }, + { + "epoch": 26.752306600425833, + "grad_norm": 1.1860551834106445, + "learning_rate": 7.325876508161817e-05, + "loss": 0.025014716386795043, + "step": 188470 + }, + { + "epoch": 26.75372604684173, + "grad_norm": 1.654003620147705, + "learning_rate": 7.325734563520228e-05, + "loss": 0.03852218091487884, + "step": 188480 + }, + { + "epoch": 26.75514549325763, + "grad_norm": 14.657230377197266, + "learning_rate": 7.325592618878637e-05, + "loss": 
0.024325238168239595, + "step": 188490 + }, + { + "epoch": 26.756564939673527, + "grad_norm": 0.15956756472587585, + "learning_rate": 7.325450674237048e-05, + "loss": 0.012810032069683074, + "step": 188500 + }, + { + "epoch": 26.756564939673527, + "eval_accuracy": 0.9835315063266993, + "eval_loss": 0.06524772197008133, + "eval_runtime": 32.1149, + "eval_samples_per_second": 489.71, + "eval_steps_per_second": 15.32, + "step": 188500 + }, + { + "epoch": 26.757984386089426, + "grad_norm": 0.03423471376299858, + "learning_rate": 7.325308729595458e-05, + "loss": 0.02359835058450699, + "step": 188510 + }, + { + "epoch": 26.759403832505324, + "grad_norm": 0.8989615440368652, + "learning_rate": 7.325166784953869e-05, + "loss": 0.027366384863853455, + "step": 188520 + }, + { + "epoch": 26.760823278921222, + "grad_norm": 19.4143009185791, + "learning_rate": 7.325024840312278e-05, + "loss": 0.045195150375366214, + "step": 188530 + }, + { + "epoch": 26.762242725337117, + "grad_norm": 0.5083664059638977, + "learning_rate": 7.324882895670688e-05, + "loss": 0.004037468507885933, + "step": 188540 + }, + { + "epoch": 26.763662171753015, + "grad_norm": 3.555624485015869, + "learning_rate": 7.324740951029099e-05, + "loss": 0.014779219031333923, + "step": 188550 + }, + { + "epoch": 26.765081618168914, + "grad_norm": 6.3489227294921875, + "learning_rate": 7.324599006387509e-05, + "loss": 0.02743072509765625, + "step": 188560 + }, + { + "epoch": 26.766501064584812, + "grad_norm": 0.4514230191707611, + "learning_rate": 7.32445706174592e-05, + "loss": 0.0019258208572864532, + "step": 188570 + }, + { + "epoch": 26.76792051100071, + "grad_norm": 8.067468643188477, + "learning_rate": 7.32431511710433e-05, + "loss": 0.0258170485496521, + "step": 188580 + }, + { + "epoch": 26.76933995741661, + "grad_norm": 0.019365450367331505, + "learning_rate": 7.32417317246274e-05, + "loss": 0.011530144512653351, + "step": 188590 + }, + { + "epoch": 26.770759403832507, + "grad_norm": 0.17686904966831207, + 
"learning_rate": 7.324031227821149e-05, + "loss": 0.008080792427062989, + "step": 188600 + }, + { + "epoch": 26.7721788502484, + "grad_norm": 0.9952772855758667, + "learning_rate": 7.32388928317956e-05, + "loss": 0.0021892733871936797, + "step": 188610 + }, + { + "epoch": 26.7735982966643, + "grad_norm": 3.933973789215088, + "learning_rate": 7.32374733853797e-05, + "loss": 0.006074730306863785, + "step": 188620 + }, + { + "epoch": 26.7750177430802, + "grad_norm": 3.834097146987915, + "learning_rate": 7.323605393896381e-05, + "loss": 0.016403187811374665, + "step": 188630 + }, + { + "epoch": 26.776437189496097, + "grad_norm": 0.36729422211647034, + "learning_rate": 7.323463449254791e-05, + "loss": 0.004140164330601692, + "step": 188640 + }, + { + "epoch": 26.777856635911995, + "grad_norm": 0.01768394187092781, + "learning_rate": 7.3233215046132e-05, + "loss": 0.02498486787080765, + "step": 188650 + }, + { + "epoch": 26.779276082327893, + "grad_norm": 0.13923798501491547, + "learning_rate": 7.323179559971612e-05, + "loss": 0.028512534499168397, + "step": 188660 + }, + { + "epoch": 26.78069552874379, + "grad_norm": 0.02883913926780224, + "learning_rate": 7.323037615330021e-05, + "loss": 0.023695164918899538, + "step": 188670 + }, + { + "epoch": 26.782114975159686, + "grad_norm": 0.03864767402410507, + "learning_rate": 7.322895670688433e-05, + "loss": 0.013676604628562928, + "step": 188680 + }, + { + "epoch": 26.783534421575585, + "grad_norm": 0.41485267877578735, + "learning_rate": 7.322753726046842e-05, + "loss": 0.025458160042762756, + "step": 188690 + }, + { + "epoch": 26.784953867991483, + "grad_norm": 3.0127370357513428, + "learning_rate": 7.322611781405252e-05, + "loss": 0.025090917944908142, + "step": 188700 + }, + { + "epoch": 26.78637331440738, + "grad_norm": 0.013870681636035442, + "learning_rate": 7.322469836763662e-05, + "loss": 0.0218958243727684, + "step": 188710 + }, + { + "epoch": 26.78779276082328, + "grad_norm": 6.747122287750244, + "learning_rate": 
7.322327892122073e-05, + "loss": 0.013598959147930145, + "step": 188720 + }, + { + "epoch": 26.789212207239178, + "grad_norm": 1.5780153274536133, + "learning_rate": 7.322185947480484e-05, + "loss": 0.0015523891896009445, + "step": 188730 + }, + { + "epoch": 26.790631653655076, + "grad_norm": 0.6623547077178955, + "learning_rate": 7.322044002838894e-05, + "loss": 0.0056773900985717775, + "step": 188740 + }, + { + "epoch": 26.79205110007097, + "grad_norm": 0.0929054394364357, + "learning_rate": 7.321902058197303e-05, + "loss": 0.02819412648677826, + "step": 188750 + }, + { + "epoch": 26.79347054648687, + "grad_norm": 16.230010986328125, + "learning_rate": 7.321760113555713e-05, + "loss": 0.011232849955558778, + "step": 188760 + }, + { + "epoch": 26.794889992902768, + "grad_norm": 5.821075916290283, + "learning_rate": 7.321618168914124e-05, + "loss": 0.038499706983566286, + "step": 188770 + }, + { + "epoch": 26.796309439318666, + "grad_norm": 0.03317848592996597, + "learning_rate": 7.321476224272534e-05, + "loss": 0.005138024687767029, + "step": 188780 + }, + { + "epoch": 26.797728885734564, + "grad_norm": 0.08211526274681091, + "learning_rate": 7.321334279630945e-05, + "loss": 0.04171091616153717, + "step": 188790 + }, + { + "epoch": 26.799148332150462, + "grad_norm": 10.034165382385254, + "learning_rate": 7.321192334989353e-05, + "loss": 0.009373712539672851, + "step": 188800 + }, + { + "epoch": 26.80056777856636, + "grad_norm": 0.2161841243505478, + "learning_rate": 7.321050390347765e-05, + "loss": 0.005015381798148155, + "step": 188810 + }, + { + "epoch": 26.801987224982255, + "grad_norm": 0.3456258177757263, + "learning_rate": 7.320908445706176e-05, + "loss": 0.01800067871809006, + "step": 188820 + }, + { + "epoch": 26.803406671398154, + "grad_norm": 0.1344594955444336, + "learning_rate": 7.320766501064585e-05, + "loss": 0.019866478443145753, + "step": 188830 + }, + { + "epoch": 26.804826117814052, + "grad_norm": 0.05065298080444336, + "learning_rate": 
7.320624556422996e-05, + "loss": 0.015572462975978852, + "step": 188840 + }, + { + "epoch": 26.80624556422995, + "grad_norm": 6.842191696166992, + "learning_rate": 7.320482611781405e-05, + "loss": 0.01344592571258545, + "step": 188850 + }, + { + "epoch": 26.80766501064585, + "grad_norm": 0.019556038081645966, + "learning_rate": 7.320340667139816e-05, + "loss": 0.03243492841720581, + "step": 188860 + }, + { + "epoch": 26.809084457061747, + "grad_norm": 5.143561840057373, + "learning_rate": 7.320198722498226e-05, + "loss": 0.004134431853890419, + "step": 188870 + }, + { + "epoch": 26.810503903477645, + "grad_norm": 0.1694352924823761, + "learning_rate": 7.320056777856637e-05, + "loss": 0.011232419312000275, + "step": 188880 + }, + { + "epoch": 26.81192334989354, + "grad_norm": 0.02034018747508526, + "learning_rate": 7.319914833215047e-05, + "loss": 0.023965431749820708, + "step": 188890 + }, + { + "epoch": 26.81334279630944, + "grad_norm": 17.50876235961914, + "learning_rate": 7.319772888573456e-05, + "loss": 0.041399520635604856, + "step": 188900 + }, + { + "epoch": 26.814762242725337, + "grad_norm": 0.1126921996474266, + "learning_rate": 7.319630943931867e-05, + "loss": 0.008019477874040604, + "step": 188910 + }, + { + "epoch": 26.816181689141235, + "grad_norm": 0.06552189588546753, + "learning_rate": 7.319488999290277e-05, + "loss": 0.012293799221515656, + "step": 188920 + }, + { + "epoch": 26.817601135557133, + "grad_norm": 18.10056495666504, + "learning_rate": 7.319347054648688e-05, + "loss": 0.032669094204902646, + "step": 188930 + }, + { + "epoch": 26.81902058197303, + "grad_norm": 0.17457278072834015, + "learning_rate": 7.319205110007098e-05, + "loss": 0.00293286070227623, + "step": 188940 + }, + { + "epoch": 26.82044002838893, + "grad_norm": 6.9609456062316895, + "learning_rate": 7.319063165365508e-05, + "loss": 0.003600003197789192, + "step": 188950 + }, + { + "epoch": 26.821859474804825, + "grad_norm": 0.32621777057647705, + "learning_rate": 
7.318921220723917e-05, + "loss": 0.001313912123441696, + "step": 188960 + }, + { + "epoch": 26.823278921220723, + "grad_norm": 0.013023651205003262, + "learning_rate": 7.318779276082328e-05, + "loss": 0.04759776890277863, + "step": 188970 + }, + { + "epoch": 26.82469836763662, + "grad_norm": 0.031396087259054184, + "learning_rate": 7.318637331440738e-05, + "loss": 0.004252446815371513, + "step": 188980 + }, + { + "epoch": 26.82611781405252, + "grad_norm": 8.55611515045166, + "learning_rate": 7.318495386799149e-05, + "loss": 0.015065418183803558, + "step": 188990 + }, + { + "epoch": 26.827537260468418, + "grad_norm": 0.2788658142089844, + "learning_rate": 7.318353442157559e-05, + "loss": 0.03408445417881012, + "step": 189000 + }, + { + "epoch": 26.827537260468418, + "eval_accuracy": 0.9900171679277675, + "eval_loss": 0.03628779202699661, + "eval_runtime": 31.4115, + "eval_samples_per_second": 500.677, + "eval_steps_per_second": 15.663, + "step": 189000 + }, + { + "epoch": 26.828956706884316, + "grad_norm": 0.678207278251648, + "learning_rate": 7.318211497515969e-05, + "loss": 0.041789695620536804, + "step": 189010 + }, + { + "epoch": 26.830376153300215, + "grad_norm": 1.3175474405288696, + "learning_rate": 7.31806955287438e-05, + "loss": 0.021518823504447938, + "step": 189020 + }, + { + "epoch": 26.83179559971611, + "grad_norm": 1.8201295137405396, + "learning_rate": 7.31792760823279e-05, + "loss": 0.012568691372871399, + "step": 189030 + }, + { + "epoch": 26.833215046132008, + "grad_norm": 0.4399189352989197, + "learning_rate": 7.317785663591201e-05, + "loss": 0.01185387521982193, + "step": 189040 + }, + { + "epoch": 26.834634492547906, + "grad_norm": 0.009739754721522331, + "learning_rate": 7.31764371894961e-05, + "loss": 0.019506052136421204, + "step": 189050 + }, + { + "epoch": 26.836053938963804, + "grad_norm": 0.0192350372672081, + "learning_rate": 7.31750177430802e-05, + "loss": 0.016666142642498015, + "step": 189060 + }, + { + "epoch": 26.837473385379703, + 
"grad_norm": 0.26270586252212524, + "learning_rate": 7.31735982966643e-05, + "loss": 0.007002228498458862, + "step": 189070 + }, + { + "epoch": 26.8388928317956, + "grad_norm": 0.1748242974281311, + "learning_rate": 7.317217885024841e-05, + "loss": 0.0260200172662735, + "step": 189080 + }, + { + "epoch": 26.8403122782115, + "grad_norm": 0.12431571632623672, + "learning_rate": 7.317075940383251e-05, + "loss": 0.002550382912158966, + "step": 189090 + }, + { + "epoch": 26.841731724627394, + "grad_norm": 4.019560813903809, + "learning_rate": 7.316933995741662e-05, + "loss": 0.017748671770095825, + "step": 189100 + }, + { + "epoch": 26.843151171043292, + "grad_norm": 1.0739699602127075, + "learning_rate": 7.316792051100072e-05, + "loss": 0.03678792417049408, + "step": 189110 + }, + { + "epoch": 26.84457061745919, + "grad_norm": 0.3797835111618042, + "learning_rate": 7.316650106458481e-05, + "loss": 0.06159224510192871, + "step": 189120 + }, + { + "epoch": 26.84599006387509, + "grad_norm": 0.005844158120453358, + "learning_rate": 7.316508161816892e-05, + "loss": 0.02004968822002411, + "step": 189130 + }, + { + "epoch": 26.847409510290987, + "grad_norm": 0.5213274955749512, + "learning_rate": 7.316366217175302e-05, + "loss": 0.018800088763237, + "step": 189140 + }, + { + "epoch": 26.848828956706885, + "grad_norm": 0.24965515732765198, + "learning_rate": 7.316224272533713e-05, + "loss": 0.006807164847850799, + "step": 189150 + }, + { + "epoch": 26.850248403122784, + "grad_norm": 0.4500533938407898, + "learning_rate": 7.316082327892122e-05, + "loss": 0.024412599205970765, + "step": 189160 + }, + { + "epoch": 26.85166784953868, + "grad_norm": 0.23979371786117554, + "learning_rate": 7.315940383250533e-05, + "loss": 0.011640702188014985, + "step": 189170 + }, + { + "epoch": 26.853087295954577, + "grad_norm": 0.1599941998720169, + "learning_rate": 7.315798438608942e-05, + "loss": 0.00100809708237648, + "step": 189180 + }, + { + "epoch": 26.854506742370475, + "grad_norm": 
0.4046964943408966, + "learning_rate": 7.315656493967354e-05, + "loss": 0.014085778594017028, + "step": 189190 + }, + { + "epoch": 26.855926188786373, + "grad_norm": 0.3576463758945465, + "learning_rate": 7.315514549325763e-05, + "loss": 0.03356168270111084, + "step": 189200 + }, + { + "epoch": 26.85734563520227, + "grad_norm": 0.0311473049223423, + "learning_rate": 7.315372604684173e-05, + "loss": 0.0017621759325265884, + "step": 189210 + }, + { + "epoch": 26.85876508161817, + "grad_norm": 1.7946358919143677, + "learning_rate": 7.315230660042584e-05, + "loss": 0.02091485857963562, + "step": 189220 + }, + { + "epoch": 26.86018452803407, + "grad_norm": 0.009117831476032734, + "learning_rate": 7.315088715400994e-05, + "loss": 0.006025741249322892, + "step": 189230 + }, + { + "epoch": 26.861603974449963, + "grad_norm": 0.8974032402038574, + "learning_rate": 7.314946770759405e-05, + "loss": 0.006789863109588623, + "step": 189240 + }, + { + "epoch": 26.86302342086586, + "grad_norm": 16.40374755859375, + "learning_rate": 7.314804826117815e-05, + "loss": 0.017146663367748262, + "step": 189250 + }, + { + "epoch": 26.86444286728176, + "grad_norm": 0.07117621600627899, + "learning_rate": 7.314662881476224e-05, + "loss": 0.029807117581367493, + "step": 189260 + }, + { + "epoch": 26.865862313697658, + "grad_norm": 1.5023095607757568, + "learning_rate": 7.314520936834634e-05, + "loss": 0.012615567445755005, + "step": 189270 + }, + { + "epoch": 26.867281760113556, + "grad_norm": 0.2269541174173355, + "learning_rate": 7.314378992193045e-05, + "loss": 0.014696714282035828, + "step": 189280 + }, + { + "epoch": 26.868701206529455, + "grad_norm": 12.85835075378418, + "learning_rate": 7.314237047551455e-05, + "loss": 0.029680892825126648, + "step": 189290 + }, + { + "epoch": 26.870120652945353, + "grad_norm": 0.11000434309244156, + "learning_rate": 7.314095102909866e-05, + "loss": 0.0014032527804374694, + "step": 189300 + }, + { + "epoch": 26.871540099361248, + "grad_norm": 
7.263960361480713, + "learning_rate": 7.313953158268276e-05, + "loss": 0.007339473068714142, + "step": 189310 + }, + { + "epoch": 26.872959545777146, + "grad_norm": 0.052711986005306244, + "learning_rate": 7.313811213626686e-05, + "loss": 0.02144011706113815, + "step": 189320 + }, + { + "epoch": 26.874378992193044, + "grad_norm": 0.15134350955486298, + "learning_rate": 7.313669268985097e-05, + "loss": 0.006086874753236771, + "step": 189330 + }, + { + "epoch": 26.875798438608943, + "grad_norm": 0.2780214250087738, + "learning_rate": 7.313527324343506e-05, + "loss": 0.012708163261413575, + "step": 189340 + }, + { + "epoch": 26.87721788502484, + "grad_norm": 6.48891019821167, + "learning_rate": 7.313385379701917e-05, + "loss": 0.005064753815531731, + "step": 189350 + }, + { + "epoch": 26.87863733144074, + "grad_norm": 2.621422290802002, + "learning_rate": 7.313243435060326e-05, + "loss": 0.016575507819652557, + "step": 189360 + }, + { + "epoch": 26.880056777856637, + "grad_norm": 0.2874918580055237, + "learning_rate": 7.313101490418737e-05, + "loss": 0.009697733819484711, + "step": 189370 + }, + { + "epoch": 26.881476224272532, + "grad_norm": 0.4448912441730499, + "learning_rate": 7.312959545777147e-05, + "loss": 0.014798295497894288, + "step": 189380 + }, + { + "epoch": 26.88289567068843, + "grad_norm": 0.03115709498524666, + "learning_rate": 7.312817601135558e-05, + "loss": 0.0077231451869010925, + "step": 189390 + }, + { + "epoch": 26.88431511710433, + "grad_norm": 0.4502294659614563, + "learning_rate": 7.312675656493968e-05, + "loss": 0.011985012143850327, + "step": 189400 + }, + { + "epoch": 26.885734563520227, + "grad_norm": 0.04135891795158386, + "learning_rate": 7.312547906316537e-05, + "loss": 0.023848718404769896, + "step": 189410 + }, + { + "epoch": 26.887154009936125, + "grad_norm": 0.019645944237709045, + "learning_rate": 7.312405961674947e-05, + "loss": 0.008200354874134064, + "step": 189420 + }, + { + "epoch": 26.888573456352024, + "grad_norm": 
0.12066321074962616, + "learning_rate": 7.312264017033358e-05, + "loss": 0.008301748335361481, + "step": 189430 + }, + { + "epoch": 26.889992902767922, + "grad_norm": 0.7534964680671692, + "learning_rate": 7.312122072391767e-05, + "loss": 0.017807821929454803, + "step": 189440 + }, + { + "epoch": 26.891412349183817, + "grad_norm": 0.04180710390210152, + "learning_rate": 7.311980127750178e-05, + "loss": 0.0009963613003492356, + "step": 189450 + }, + { + "epoch": 26.892831795599715, + "grad_norm": 0.3103296756744385, + "learning_rate": 7.311838183108587e-05, + "loss": 0.007639676332473755, + "step": 189460 + }, + { + "epoch": 26.894251242015613, + "grad_norm": 0.007717971224337816, + "learning_rate": 7.311696238466998e-05, + "loss": 0.0047048904001712796, + "step": 189470 + }, + { + "epoch": 26.89567068843151, + "grad_norm": 0.05832973122596741, + "learning_rate": 7.311554293825408e-05, + "loss": 0.006114475801587105, + "step": 189480 + }, + { + "epoch": 26.89709013484741, + "grad_norm": 0.05083521082997322, + "learning_rate": 7.311412349183818e-05, + "loss": 0.018617957830429077, + "step": 189490 + }, + { + "epoch": 26.89850958126331, + "grad_norm": 0.9210319519042969, + "learning_rate": 7.311270404542229e-05, + "loss": 0.0049930233508348465, + "step": 189500 + }, + { + "epoch": 26.89850958126331, + "eval_accuracy": 0.9919883003751511, + "eval_loss": 0.03193502128124237, + "eval_runtime": 31.3916, + "eval_samples_per_second": 500.995, + "eval_steps_per_second": 15.673, + "step": 189500 + }, + { + "epoch": 26.899929027679207, + "grad_norm": 0.2334737926721573, + "learning_rate": 7.311128459900639e-05, + "loss": 0.005908265709877014, + "step": 189510 + }, + { + "epoch": 26.9013484740951, + "grad_norm": 0.08405650407075882, + "learning_rate": 7.31098651525905e-05, + "loss": 0.005118655040860176, + "step": 189520 + }, + { + "epoch": 26.902767920511, + "grad_norm": 0.06460914015769958, + "learning_rate": 7.31084457061746e-05, + "loss": 0.012769202888011932, + "step": 
189530 + }, + { + "epoch": 26.904187366926898, + "grad_norm": 0.035414278507232666, + "learning_rate": 7.31070262597587e-05, + "loss": 0.03583086133003235, + "step": 189540 + }, + { + "epoch": 26.905606813342796, + "grad_norm": 3.5920794010162354, + "learning_rate": 7.310560681334279e-05, + "loss": 0.0023455359041690826, + "step": 189550 + }, + { + "epoch": 26.907026259758695, + "grad_norm": 0.18315298855304718, + "learning_rate": 7.31041873669269e-05, + "loss": 0.011439354717731475, + "step": 189560 + }, + { + "epoch": 26.908445706174593, + "grad_norm": 1.4919438362121582, + "learning_rate": 7.3102767920511e-05, + "loss": 0.022183892130851746, + "step": 189570 + }, + { + "epoch": 26.90986515259049, + "grad_norm": 0.14786766469478607, + "learning_rate": 7.310134847409511e-05, + "loss": 0.009397557377815247, + "step": 189580 + }, + { + "epoch": 26.911284599006386, + "grad_norm": 11.2883939743042, + "learning_rate": 7.309992902767921e-05, + "loss": 0.028970792889595032, + "step": 189590 + }, + { + "epoch": 26.912704045422284, + "grad_norm": 0.2933865785598755, + "learning_rate": 7.30985095812633e-05, + "loss": 0.022501245141029358, + "step": 189600 + }, + { + "epoch": 26.914123491838183, + "grad_norm": 12.474496841430664, + "learning_rate": 7.309709013484742e-05, + "loss": 0.011172473430633545, + "step": 189610 + }, + { + "epoch": 26.91554293825408, + "grad_norm": 0.015203606337308884, + "learning_rate": 7.309567068843151e-05, + "loss": 0.023987928032875062, + "step": 189620 + }, + { + "epoch": 26.91696238466998, + "grad_norm": 0.3995610177516937, + "learning_rate": 7.309425124201562e-05, + "loss": 0.0059918597340583805, + "step": 189630 + }, + { + "epoch": 26.918381831085878, + "grad_norm": 1.0486778020858765, + "learning_rate": 7.309283179559971e-05, + "loss": 0.026215651631355287, + "step": 189640 + }, + { + "epoch": 26.919801277501776, + "grad_norm": 0.18271683156490326, + "learning_rate": 7.309141234918382e-05, + "loss": 0.002969799190759659, + "step": 189650 + 
}, + { + "epoch": 26.92122072391767, + "grad_norm": 0.0035377545282244682, + "learning_rate": 7.308999290276792e-05, + "loss": 0.010035133361816407, + "step": 189660 + }, + { + "epoch": 26.92264017033357, + "grad_norm": 16.967758178710938, + "learning_rate": 7.308857345635203e-05, + "loss": 0.013067707419395447, + "step": 189670 + }, + { + "epoch": 26.924059616749467, + "grad_norm": 0.5764768719673157, + "learning_rate": 7.308715400993614e-05, + "loss": 0.010634054243564606, + "step": 189680 + }, + { + "epoch": 26.925479063165366, + "grad_norm": 2.2871851921081543, + "learning_rate": 7.308573456352022e-05, + "loss": 0.0034970838576555254, + "step": 189690 + }, + { + "epoch": 26.926898509581264, + "grad_norm": 6.825158596038818, + "learning_rate": 7.308431511710433e-05, + "loss": 0.013949987292289735, + "step": 189700 + }, + { + "epoch": 26.928317955997162, + "grad_norm": 0.004913253244012594, + "learning_rate": 7.308289567068843e-05, + "loss": 0.007551711797714233, + "step": 189710 + }, + { + "epoch": 26.92973740241306, + "grad_norm": 0.6563774347305298, + "learning_rate": 7.308147622427254e-05, + "loss": 0.0028549857437610627, + "step": 189720 + }, + { + "epoch": 26.931156848828955, + "grad_norm": 2.383897066116333, + "learning_rate": 7.308005677785664e-05, + "loss": 0.012095504999160766, + "step": 189730 + }, + { + "epoch": 26.932576295244854, + "grad_norm": 0.21172527968883514, + "learning_rate": 7.307863733144075e-05, + "loss": 0.020552679896354675, + "step": 189740 + }, + { + "epoch": 26.933995741660752, + "grad_norm": 1.5524160861968994, + "learning_rate": 7.307721788502483e-05, + "loss": 0.0173823282122612, + "step": 189750 + }, + { + "epoch": 26.93541518807665, + "grad_norm": 0.020539473742246628, + "learning_rate": 7.307579843860894e-05, + "loss": 0.06522330641746521, + "step": 189760 + }, + { + "epoch": 26.93683463449255, + "grad_norm": 0.026387304067611694, + "learning_rate": 7.307437899219306e-05, + "loss": 0.0032629553228616715, + "step": 189770 + }, + 
{ + "epoch": 26.938254080908447, + "grad_norm": 0.17154236137866974, + "learning_rate": 7.307295954577715e-05, + "loss": 0.04615503549575806, + "step": 189780 + }, + { + "epoch": 26.939673527324345, + "grad_norm": 0.14950093626976013, + "learning_rate": 7.307154009936126e-05, + "loss": 0.014542612433433532, + "step": 189790 + }, + { + "epoch": 26.94109297374024, + "grad_norm": 6.93013334274292, + "learning_rate": 7.307012065294535e-05, + "loss": 0.005568959563970566, + "step": 189800 + }, + { + "epoch": 26.942512420156138, + "grad_norm": 6.6198039054870605, + "learning_rate": 7.306870120652946e-05, + "loss": 0.02078878879547119, + "step": 189810 + }, + { + "epoch": 26.943931866572036, + "grad_norm": 0.06405095756053925, + "learning_rate": 7.306728176011356e-05, + "loss": 0.006396336853504181, + "step": 189820 + }, + { + "epoch": 26.945351312987935, + "grad_norm": 1.4942833185195923, + "learning_rate": 7.306586231369767e-05, + "loss": 0.01767081618309021, + "step": 189830 + }, + { + "epoch": 26.946770759403833, + "grad_norm": 0.883314847946167, + "learning_rate": 7.306444286728176e-05, + "loss": 0.014265105128288269, + "step": 189840 + }, + { + "epoch": 26.94819020581973, + "grad_norm": 5.1805853843688965, + "learning_rate": 7.306302342086586e-05, + "loss": 0.0271822988986969, + "step": 189850 + }, + { + "epoch": 26.94960965223563, + "grad_norm": 21.267745971679688, + "learning_rate": 7.306160397444997e-05, + "loss": 0.030523258447647094, + "step": 189860 + }, + { + "epoch": 26.951029098651524, + "grad_norm": 10.170642852783203, + "learning_rate": 7.306018452803407e-05, + "loss": 0.0802939772605896, + "step": 189870 + }, + { + "epoch": 26.952448545067423, + "grad_norm": 0.00623898021876812, + "learning_rate": 7.305876508161818e-05, + "loss": 0.02804630100727081, + "step": 189880 + }, + { + "epoch": 26.95386799148332, + "grad_norm": 1.8390332460403442, + "learning_rate": 7.305734563520228e-05, + "loss": 0.010955916345119476, + "step": 189890 + }, + { + "epoch": 
26.95528743789922, + "grad_norm": 0.1632658690214157, + "learning_rate": 7.305592618878638e-05, + "loss": 0.001447218656539917, + "step": 189900 + }, + { + "epoch": 26.956706884315118, + "grad_norm": 4.635684013366699, + "learning_rate": 7.305450674237047e-05, + "loss": 0.00467139333486557, + "step": 189910 + }, + { + "epoch": 26.958126330731016, + "grad_norm": 1.9341189861297607, + "learning_rate": 7.305308729595458e-05, + "loss": 0.036345621943473815, + "step": 189920 + }, + { + "epoch": 26.959545777146914, + "grad_norm": 1.9904857873916626, + "learning_rate": 7.305166784953868e-05, + "loss": 0.02397596538066864, + "step": 189930 + }, + { + "epoch": 26.96096522356281, + "grad_norm": 0.25566235184669495, + "learning_rate": 7.305024840312279e-05, + "loss": 0.009804637730121612, + "step": 189940 + }, + { + "epoch": 26.962384669978707, + "grad_norm": 0.08098485320806503, + "learning_rate": 7.304882895670689e-05, + "loss": 0.002619272470474243, + "step": 189950 + }, + { + "epoch": 26.963804116394606, + "grad_norm": 0.02058754488825798, + "learning_rate": 7.304740951029099e-05, + "loss": 0.020161154866218566, + "step": 189960 + }, + { + "epoch": 26.965223562810504, + "grad_norm": 0.16339130699634552, + "learning_rate": 7.30459900638751e-05, + "loss": 0.022706815600395204, + "step": 189970 + }, + { + "epoch": 26.966643009226402, + "grad_norm": 0.04152192175388336, + "learning_rate": 7.30445706174592e-05, + "loss": 0.0037408445030450823, + "step": 189980 + }, + { + "epoch": 26.9680624556423, + "grad_norm": 2.1305203437805176, + "learning_rate": 7.30431511710433e-05, + "loss": 0.007586668431758881, + "step": 189990 + }, + { + "epoch": 26.9694819020582, + "grad_norm": 0.21330054104328156, + "learning_rate": 7.304173172462739e-05, + "loss": 0.004882342740893364, + "step": 190000 + }, + { + "epoch": 26.9694819020582, + "eval_accuracy": 0.9891905639982196, + "eval_loss": 0.04136533662676811, + "eval_runtime": 32.3586, + "eval_samples_per_second": 486.022, + 
"eval_steps_per_second": 15.205, + "step": 190000 + }, + { + "epoch": 26.970901348474094, + "grad_norm": 0.07474946230649948, + "learning_rate": 7.30403122782115e-05, + "loss": 0.005545902997255325, + "step": 190010 + }, + { + "epoch": 26.972320794889992, + "grad_norm": 0.04088188335299492, + "learning_rate": 7.30388928317956e-05, + "loss": 0.0012014809995889663, + "step": 190020 + }, + { + "epoch": 26.97374024130589, + "grad_norm": 0.85775226354599, + "learning_rate": 7.303747338537971e-05, + "loss": 0.02331533133983612, + "step": 190030 + }, + { + "epoch": 26.97515968772179, + "grad_norm": 2.7781662940979004, + "learning_rate": 7.30360539389638e-05, + "loss": 0.0027332380414009094, + "step": 190040 + }, + { + "epoch": 26.976579134137687, + "grad_norm": 0.04568341746926308, + "learning_rate": 7.303463449254792e-05, + "loss": 0.0054175123572349545, + "step": 190050 + }, + { + "epoch": 26.977998580553585, + "grad_norm": 24.873756408691406, + "learning_rate": 7.303321504613201e-05, + "loss": 0.08656294941902161, + "step": 190060 + }, + { + "epoch": 26.979418026969483, + "grad_norm": 1.0953096151351929, + "learning_rate": 7.303179559971611e-05, + "loss": 0.011835424602031708, + "step": 190070 + }, + { + "epoch": 26.980837473385378, + "grad_norm": 1.7570234537124634, + "learning_rate": 7.303037615330022e-05, + "loss": 0.02049819827079773, + "step": 190080 + }, + { + "epoch": 26.982256919801276, + "grad_norm": 0.2277624011039734, + "learning_rate": 7.302895670688432e-05, + "loss": 0.006035779044032097, + "step": 190090 + }, + { + "epoch": 26.983676366217175, + "grad_norm": 0.22057947516441345, + "learning_rate": 7.302753726046843e-05, + "loss": 0.033043688535690306, + "step": 190100 + }, + { + "epoch": 26.985095812633073, + "grad_norm": 0.02845805324614048, + "learning_rate": 7.302611781405252e-05, + "loss": 0.004549219831824302, + "step": 190110 + }, + { + "epoch": 26.98651525904897, + "grad_norm": 0.059764064848423004, + "learning_rate": 7.302469836763663e-05, + 
"loss": 0.0034071147441864015, + "step": 190120 + }, + { + "epoch": 26.98793470546487, + "grad_norm": 0.37608271837234497, + "learning_rate": 7.302327892122072e-05, + "loss": 0.013873066008090972, + "step": 190130 + }, + { + "epoch": 26.989354151880768, + "grad_norm": 1.136201024055481, + "learning_rate": 7.302185947480483e-05, + "loss": 0.011026496440172196, + "step": 190140 + }, + { + "epoch": 26.990773598296663, + "grad_norm": 0.0018522769678384066, + "learning_rate": 7.302044002838893e-05, + "loss": 0.0015318792313337326, + "step": 190150 + }, + { + "epoch": 26.99219304471256, + "grad_norm": 0.030058465898036957, + "learning_rate": 7.301902058197303e-05, + "loss": 0.007352723926305771, + "step": 190160 + }, + { + "epoch": 26.99361249112846, + "grad_norm": 10.101192474365234, + "learning_rate": 7.301760113555714e-05, + "loss": 0.04160658419132233, + "step": 190170 + }, + { + "epoch": 26.995031937544358, + "grad_norm": 6.1396484375, + "learning_rate": 7.301618168914124e-05, + "loss": 0.009499992430210113, + "step": 190180 + }, + { + "epoch": 26.996451383960256, + "grad_norm": 0.037005405873060226, + "learning_rate": 7.301476224272535e-05, + "loss": 0.04469865560531616, + "step": 190190 + }, + { + "epoch": 26.997870830376154, + "grad_norm": 0.4418095648288727, + "learning_rate": 7.301334279630945e-05, + "loss": 0.00975717306137085, + "step": 190200 + }, + { + "epoch": 26.999290276792053, + "grad_norm": 0.10016702860593796, + "learning_rate": 7.301192334989354e-05, + "loss": 0.0026993710547685624, + "step": 190210 + }, + { + "epoch": 27.000709723207947, + "grad_norm": 0.06966713815927505, + "learning_rate": 7.301050390347764e-05, + "loss": 0.005028136819601059, + "step": 190220 + }, + { + "epoch": 27.002129169623846, + "grad_norm": 21.379751205444336, + "learning_rate": 7.300908445706175e-05, + "loss": 0.029791766405105592, + "step": 190230 + }, + { + "epoch": 27.003548616039744, + "grad_norm": 2.0384562015533447, + "learning_rate": 7.300766501064585e-05, + "loss": 
0.006724938750267029, + "step": 190240 + }, + { + "epoch": 27.004968062455642, + "grad_norm": 5.174483776092529, + "learning_rate": 7.300624556422996e-05, + "loss": 0.009113608300685883, + "step": 190250 + }, + { + "epoch": 27.00638750887154, + "grad_norm": 0.0426425077021122, + "learning_rate": 7.300482611781406e-05, + "loss": 0.007337945699691773, + "step": 190260 + }, + { + "epoch": 27.00780695528744, + "grad_norm": 1.488190770149231, + "learning_rate": 7.300340667139815e-05, + "loss": 0.01213696449995041, + "step": 190270 + }, + { + "epoch": 27.009226401703337, + "grad_norm": 0.2771105468273163, + "learning_rate": 7.300198722498227e-05, + "loss": 0.01691460460424423, + "step": 190280 + }, + { + "epoch": 27.010645848119232, + "grad_norm": 0.09017574787139893, + "learning_rate": 7.300056777856636e-05, + "loss": 0.04592446684837341, + "step": 190290 + }, + { + "epoch": 27.01206529453513, + "grad_norm": 0.016614161431789398, + "learning_rate": 7.299914833215047e-05, + "loss": 0.011706657707691193, + "step": 190300 + }, + { + "epoch": 27.01348474095103, + "grad_norm": 0.0196833573281765, + "learning_rate": 7.299772888573456e-05, + "loss": 0.0007622580975294113, + "step": 190310 + }, + { + "epoch": 27.014904187366927, + "grad_norm": 0.9090941548347473, + "learning_rate": 7.299630943931867e-05, + "loss": 0.004079666361212731, + "step": 190320 + }, + { + "epoch": 27.016323633782825, + "grad_norm": 3.108656883239746, + "learning_rate": 7.299488999290277e-05, + "loss": 0.021924589574337006, + "step": 190330 + }, + { + "epoch": 27.017743080198724, + "grad_norm": 1.8141950368881226, + "learning_rate": 7.299347054648688e-05, + "loss": 0.0014764942228794099, + "step": 190340 + }, + { + "epoch": 27.019162526614622, + "grad_norm": 1.0359172821044922, + "learning_rate": 7.299205110007097e-05, + "loss": 0.0026554994285106657, + "step": 190350 + }, + { + "epoch": 27.020581973030517, + "grad_norm": 0.39589354395866394, + "learning_rate": 7.299063165365507e-05, + "loss": 
0.013091954588890075, + "step": 190360 + }, + { + "epoch": 27.022001419446415, + "grad_norm": 3.7686965465545654, + "learning_rate": 7.298921220723918e-05, + "loss": 0.01959640681743622, + "step": 190370 + }, + { + "epoch": 27.023420865862313, + "grad_norm": 0.08263858407735825, + "learning_rate": 7.298779276082328e-05, + "loss": 0.007483527809381485, + "step": 190380 + }, + { + "epoch": 27.02484031227821, + "grad_norm": 0.6665281653404236, + "learning_rate": 7.298637331440739e-05, + "loss": 0.002825581282377243, + "step": 190390 + }, + { + "epoch": 27.02625975869411, + "grad_norm": 0.2287253737449646, + "learning_rate": 7.298495386799149e-05, + "loss": 0.02541396617889404, + "step": 190400 + }, + { + "epoch": 27.027679205110008, + "grad_norm": 0.3134785592556, + "learning_rate": 7.29835344215756e-05, + "loss": 0.003209775686264038, + "step": 190410 + }, + { + "epoch": 27.029098651525906, + "grad_norm": 0.02858641929924488, + "learning_rate": 7.298211497515968e-05, + "loss": 0.003565317392349243, + "step": 190420 + }, + { + "epoch": 27.0305180979418, + "grad_norm": 0.009315136820077896, + "learning_rate": 7.29806955287438e-05, + "loss": 0.009657030552625656, + "step": 190430 + }, + { + "epoch": 27.0319375443577, + "grad_norm": 0.07407386600971222, + "learning_rate": 7.297927608232789e-05, + "loss": 0.014524415135383606, + "step": 190440 + }, + { + "epoch": 27.033356990773598, + "grad_norm": 6.046768665313721, + "learning_rate": 7.2977856635912e-05, + "loss": 0.04394986629486084, + "step": 190450 + }, + { + "epoch": 27.034776437189496, + "grad_norm": 3.5749804973602295, + "learning_rate": 7.29764371894961e-05, + "loss": 0.02310638129711151, + "step": 190460 + }, + { + "epoch": 27.036195883605394, + "grad_norm": 2.7364389896392822, + "learning_rate": 7.29750177430802e-05, + "loss": 0.016384579241275787, + "step": 190470 + }, + { + "epoch": 27.037615330021293, + "grad_norm": 0.060251276940107346, + "learning_rate": 7.297359829666431e-05, + "loss": 
0.0027560558170080184, + "step": 190480 + }, + { + "epoch": 27.03903477643719, + "grad_norm": 0.019740397110581398, + "learning_rate": 7.29721788502484e-05, + "loss": 0.029629665613174438, + "step": 190490 + }, + { + "epoch": 27.040454222853086, + "grad_norm": 0.004155039321631193, + "learning_rate": 7.297075940383252e-05, + "loss": 0.0038744907826185225, + "step": 190500 + }, + { + "epoch": 27.040454222853086, + "eval_accuracy": 0.9903986774337127, + "eval_loss": 0.038767747581005096, + "eval_runtime": 32.0702, + "eval_samples_per_second": 490.392, + "eval_steps_per_second": 15.341, + "step": 190500 + }, + { + "epoch": 27.041873669268984, + "grad_norm": 0.7028621435165405, + "learning_rate": 7.296933995741661e-05, + "loss": 0.019236382842063905, + "step": 190510 + }, + { + "epoch": 27.043293115684882, + "grad_norm": 0.02412698231637478, + "learning_rate": 7.296792051100071e-05, + "loss": 0.02139263302087784, + "step": 190520 + }, + { + "epoch": 27.04471256210078, + "grad_norm": 0.05041045323014259, + "learning_rate": 7.296650106458481e-05, + "loss": 0.007019779086112976, + "step": 190530 + }, + { + "epoch": 27.04613200851668, + "grad_norm": 1.0413655042648315, + "learning_rate": 7.296508161816892e-05, + "loss": 0.023050764203071596, + "step": 190540 + }, + { + "epoch": 27.047551454932577, + "grad_norm": 0.15732373297214508, + "learning_rate": 7.296366217175302e-05, + "loss": 0.0024082962423563003, + "step": 190550 + }, + { + "epoch": 27.048970901348476, + "grad_norm": 4.847802639007568, + "learning_rate": 7.296224272533713e-05, + "loss": 0.02096717804670334, + "step": 190560 + }, + { + "epoch": 27.05039034776437, + "grad_norm": 1.8062970638275146, + "learning_rate": 7.296082327892122e-05, + "loss": 0.01569068878889084, + "step": 190570 + }, + { + "epoch": 27.05180979418027, + "grad_norm": 0.1076725646853447, + "learning_rate": 7.295940383250532e-05, + "loss": 0.005478966981172562, + "step": 190580 + }, + { + "epoch": 27.053229240596167, + "grad_norm": 
0.016289861872792244, + "learning_rate": 7.295798438608943e-05, + "loss": 0.018553417921066285, + "step": 190590 + }, + { + "epoch": 27.054648687012065, + "grad_norm": 0.5304680466651917, + "learning_rate": 7.295656493967353e-05, + "loss": 0.001379173994064331, + "step": 190600 + }, + { + "epoch": 27.056068133427964, + "grad_norm": 0.1390530914068222, + "learning_rate": 7.295514549325764e-05, + "loss": 0.004712678864598275, + "step": 190610 + }, + { + "epoch": 27.057487579843862, + "grad_norm": 0.21817746758460999, + "learning_rate": 7.295372604684173e-05, + "loss": 0.020476356148719788, + "step": 190620 + }, + { + "epoch": 27.05890702625976, + "grad_norm": 0.06915868818759918, + "learning_rate": 7.295230660042584e-05, + "loss": 0.006013140454888344, + "step": 190630 + }, + { + "epoch": 27.060326472675655, + "grad_norm": 0.3161304295063019, + "learning_rate": 7.295088715400993e-05, + "loss": 0.010921621322631836, + "step": 190640 + }, + { + "epoch": 27.061745919091553, + "grad_norm": 21.37973403930664, + "learning_rate": 7.294946770759404e-05, + "loss": 0.02122836858034134, + "step": 190650 + }, + { + "epoch": 27.06316536550745, + "grad_norm": 0.7612192630767822, + "learning_rate": 7.294804826117814e-05, + "loss": 0.013109302520751953, + "step": 190660 + }, + { + "epoch": 27.06458481192335, + "grad_norm": 3.6927075386047363, + "learning_rate": 7.294662881476224e-05, + "loss": 0.03947166502475739, + "step": 190670 + }, + { + "epoch": 27.066004258339248, + "grad_norm": 3.121595859527588, + "learning_rate": 7.294520936834635e-05, + "loss": 0.013760532438755035, + "step": 190680 + }, + { + "epoch": 27.067423704755146, + "grad_norm": 0.03771788254380226, + "learning_rate": 7.294378992193045e-05, + "loss": 0.03608244359493255, + "step": 190690 + }, + { + "epoch": 27.068843151171045, + "grad_norm": 3.6242127418518066, + "learning_rate": 7.294237047551456e-05, + "loss": 0.025374853610992433, + "step": 190700 + }, + { + "epoch": 27.07026259758694, + "grad_norm": 
11.555706977844238, + "learning_rate": 7.294095102909866e-05, + "loss": 0.013564574718475341, + "step": 190710 + }, + { + "epoch": 27.071682044002838, + "grad_norm": 0.06795185804367065, + "learning_rate": 7.293953158268275e-05, + "loss": 0.011789542436599732, + "step": 190720 + }, + { + "epoch": 27.073101490418736, + "grad_norm": 19.761680603027344, + "learning_rate": 7.293811213626685e-05, + "loss": 0.0199387788772583, + "step": 190730 + }, + { + "epoch": 27.074520936834634, + "grad_norm": 3.0455949306488037, + "learning_rate": 7.293669268985096e-05, + "loss": 0.020233365893363952, + "step": 190740 + }, + { + "epoch": 27.075940383250533, + "grad_norm": 0.04115818068385124, + "learning_rate": 7.293527324343506e-05, + "loss": 0.01156722754240036, + "step": 190750 + }, + { + "epoch": 27.07735982966643, + "grad_norm": 0.44880253076553345, + "learning_rate": 7.293385379701917e-05, + "loss": 0.003426133468747139, + "step": 190760 + }, + { + "epoch": 27.07877927608233, + "grad_norm": 0.14137986302375793, + "learning_rate": 7.293243435060327e-05, + "loss": 0.020248760282993317, + "step": 190770 + }, + { + "epoch": 27.080198722498224, + "grad_norm": 5.373007774353027, + "learning_rate": 7.293101490418736e-05, + "loss": 0.026418781280517577, + "step": 190780 + }, + { + "epoch": 27.081618168914122, + "grad_norm": 1.8440251350402832, + "learning_rate": 7.292959545777148e-05, + "loss": 0.013500216603279113, + "step": 190790 + }, + { + "epoch": 27.08303761533002, + "grad_norm": 0.0735994428396225, + "learning_rate": 7.292817601135557e-05, + "loss": 0.00676964819431305, + "step": 190800 + }, + { + "epoch": 27.08445706174592, + "grad_norm": 0.290141761302948, + "learning_rate": 7.292675656493968e-05, + "loss": 0.023635977506637575, + "step": 190810 + }, + { + "epoch": 27.085876508161817, + "grad_norm": 0.09833427518606186, + "learning_rate": 7.292533711852378e-05, + "loss": 0.0034241225570440293, + "step": 190820 + }, + { + "epoch": 27.087295954577716, + "grad_norm": 
13.822493553161621, + "learning_rate": 7.292391767210788e-05, + "loss": 0.01098010316491127, + "step": 190830 + }, + { + "epoch": 27.088715400993614, + "grad_norm": 16.826095581054688, + "learning_rate": 7.292249822569198e-05, + "loss": 0.06436796188354492, + "step": 190840 + }, + { + "epoch": 27.09013484740951, + "grad_norm": 3.9041240215301514, + "learning_rate": 7.292107877927609e-05, + "loss": 0.04785264134407043, + "step": 190850 + }, + { + "epoch": 27.091554293825407, + "grad_norm": 0.29727864265441895, + "learning_rate": 7.291965933286018e-05, + "loss": 0.0035093042999505995, + "step": 190860 + }, + { + "epoch": 27.092973740241305, + "grad_norm": 8.17792797088623, + "learning_rate": 7.29182398864443e-05, + "loss": 0.016250143945217132, + "step": 190870 + }, + { + "epoch": 27.094393186657204, + "grad_norm": 0.2940853238105774, + "learning_rate": 7.291682044002839e-05, + "loss": 0.0028091400861740114, + "step": 190880 + }, + { + "epoch": 27.095812633073102, + "grad_norm": 2.8062431812286377, + "learning_rate": 7.291540099361249e-05, + "loss": 0.001753637194633484, + "step": 190890 + }, + { + "epoch": 27.097232079489, + "grad_norm": 3.1623528003692627, + "learning_rate": 7.29139815471966e-05, + "loss": 0.004714594036340713, + "step": 190900 + }, + { + "epoch": 27.0986515259049, + "grad_norm": 0.09320322424173355, + "learning_rate": 7.29125621007807e-05, + "loss": 0.0042792316526174545, + "step": 190910 + }, + { + "epoch": 27.100070972320793, + "grad_norm": 0.5841223001480103, + "learning_rate": 7.291114265436481e-05, + "loss": 0.005404908210039139, + "step": 190920 + }, + { + "epoch": 27.10149041873669, + "grad_norm": 0.07453355938196182, + "learning_rate": 7.290972320794889e-05, + "loss": 0.0015937503427267074, + "step": 190930 + }, + { + "epoch": 27.10290986515259, + "grad_norm": 0.009103303775191307, + "learning_rate": 7.2908303761533e-05, + "loss": 0.009330148249864579, + "step": 190940 + }, + { + "epoch": 27.10432931156849, + "grad_norm": 
0.23937132954597473, + "learning_rate": 7.29068843151171e-05, + "loss": 0.0024670470505952836, + "step": 190950 + }, + { + "epoch": 27.105748757984387, + "grad_norm": 0.6615861654281616, + "learning_rate": 7.290546486870121e-05, + "loss": 0.004988449439406395, + "step": 190960 + }, + { + "epoch": 27.107168204400285, + "grad_norm": 0.005132375285029411, + "learning_rate": 7.290404542228532e-05, + "loss": 0.012666505575180054, + "step": 190970 + }, + { + "epoch": 27.108587650816183, + "grad_norm": 0.42300546169281006, + "learning_rate": 7.290262597586941e-05, + "loss": 0.003406164422631264, + "step": 190980 + }, + { + "epoch": 27.110007097232078, + "grad_norm": 3.9132001399993896, + "learning_rate": 7.290120652945352e-05, + "loss": 0.006708452105522155, + "step": 190990 + }, + { + "epoch": 27.111426543647976, + "grad_norm": 0.00575717119500041, + "learning_rate": 7.289978708303762e-05, + "loss": 0.016863283514976502, + "step": 191000 + }, + { + "epoch": 27.111426543647976, + "eval_accuracy": 0.9892541489158772, + "eval_loss": 0.039571791887283325, + "eval_runtime": 31.9444, + "eval_samples_per_second": 492.324, + "eval_steps_per_second": 15.402, + "step": 191000 + }, + { + "epoch": 27.112845990063875, + "grad_norm": 0.07511303573846817, + "learning_rate": 7.289836763662173e-05, + "loss": 0.007731519639492035, + "step": 191010 + }, + { + "epoch": 27.114265436479773, + "grad_norm": 0.19819971919059753, + "learning_rate": 7.289694819020582e-05, + "loss": 0.05561207532882691, + "step": 191020 + }, + { + "epoch": 27.11568488289567, + "grad_norm": 0.012442406266927719, + "learning_rate": 7.289552874378992e-05, + "loss": 0.009793688356876374, + "step": 191030 + }, + { + "epoch": 27.11710432931157, + "grad_norm": 15.246973991394043, + "learning_rate": 7.289410929737402e-05, + "loss": 0.03686095178127289, + "step": 191040 + }, + { + "epoch": 27.118523775727468, + "grad_norm": 0.19784848392009735, + "learning_rate": 7.289268985095813e-05, + "loss": 0.0005286432802677155, + 
"step": 191050 + }, + { + "epoch": 27.119943222143363, + "grad_norm": 0.12025101482868195, + "learning_rate": 7.289127040454224e-05, + "loss": 0.030495744943618775, + "step": 191060 + }, + { + "epoch": 27.12136266855926, + "grad_norm": 0.04281320795416832, + "learning_rate": 7.288985095812634e-05, + "loss": 0.03690598905086517, + "step": 191070 + }, + { + "epoch": 27.12278211497516, + "grad_norm": 7.66448974609375, + "learning_rate": 7.288843151171043e-05, + "loss": 0.004063408821821213, + "step": 191080 + }, + { + "epoch": 27.124201561391057, + "grad_norm": 3.684537410736084, + "learning_rate": 7.288701206529453e-05, + "loss": 0.004122399911284447, + "step": 191090 + }, + { + "epoch": 27.125621007806956, + "grad_norm": 6.876227855682373, + "learning_rate": 7.288559261887864e-05, + "loss": 0.005297063291072846, + "step": 191100 + }, + { + "epoch": 27.127040454222854, + "grad_norm": 0.16277210414409637, + "learning_rate": 7.288417317246274e-05, + "loss": 0.01753024458885193, + "step": 191110 + }, + { + "epoch": 27.128459900638752, + "grad_norm": 0.3997461795806885, + "learning_rate": 7.288275372604685e-05, + "loss": 0.03114272654056549, + "step": 191120 + }, + { + "epoch": 27.129879347054647, + "grad_norm": 2.0052740573883057, + "learning_rate": 7.288133427963095e-05, + "loss": 0.009379880875349045, + "step": 191130 + }, + { + "epoch": 27.131298793470545, + "grad_norm": 0.24144825339317322, + "learning_rate": 7.287991483321505e-05, + "loss": 0.0010399144142866136, + "step": 191140 + }, + { + "epoch": 27.132718239886444, + "grad_norm": 0.06684955954551697, + "learning_rate": 7.287849538679916e-05, + "loss": 0.014570978283882142, + "step": 191150 + }, + { + "epoch": 27.134137686302342, + "grad_norm": 0.40837106108665466, + "learning_rate": 7.287707594038325e-05, + "loss": 0.01435604989528656, + "step": 191160 + }, + { + "epoch": 27.13555713271824, + "grad_norm": 0.10928481817245483, + "learning_rate": 7.287565649396737e-05, + "loss": 0.009691636264324188, + "step": 
191170 + }, + { + "epoch": 27.13697657913414, + "grad_norm": 0.17414265871047974, + "learning_rate": 7.287423704755146e-05, + "loss": 0.01358020156621933, + "step": 191180 + }, + { + "epoch": 27.138396025550037, + "grad_norm": 0.10036390274763107, + "learning_rate": 7.287281760113556e-05, + "loss": 0.0037049394100904464, + "step": 191190 + }, + { + "epoch": 27.13981547196593, + "grad_norm": 1.7566581964492798, + "learning_rate": 7.287139815471966e-05, + "loss": 0.0030408762395381927, + "step": 191200 + }, + { + "epoch": 27.14123491838183, + "grad_norm": 8.21112060546875, + "learning_rate": 7.286997870830377e-05, + "loss": 0.018000730872154237, + "step": 191210 + }, + { + "epoch": 27.14265436479773, + "grad_norm": 0.022566141560673714, + "learning_rate": 7.286855926188787e-05, + "loss": 0.0011714600026607514, + "step": 191220 + }, + { + "epoch": 27.144073811213627, + "grad_norm": 1.4900633096694946, + "learning_rate": 7.286713981547198e-05, + "loss": 0.0415353924036026, + "step": 191230 + }, + { + "epoch": 27.145493257629525, + "grad_norm": 16.06897735595703, + "learning_rate": 7.286572036905606e-05, + "loss": 0.02426575720310211, + "step": 191240 + }, + { + "epoch": 27.146912704045423, + "grad_norm": 0.020825082436203957, + "learning_rate": 7.286430092264017e-05, + "loss": 0.01692379415035248, + "step": 191250 + }, + { + "epoch": 27.14833215046132, + "grad_norm": 0.048319168388843536, + "learning_rate": 7.286288147622428e-05, + "loss": 0.022628961503505705, + "step": 191260 + }, + { + "epoch": 27.149751596877216, + "grad_norm": 0.008091052062809467, + "learning_rate": 7.286146202980838e-05, + "loss": 0.005841666832566261, + "step": 191270 + }, + { + "epoch": 27.151171043293115, + "grad_norm": 0.12350740283727646, + "learning_rate": 7.286004258339249e-05, + "loss": 0.017672154307365417, + "step": 191280 + }, + { + "epoch": 27.152590489709013, + "grad_norm": 0.14047111570835114, + "learning_rate": 7.285862313697657e-05, + "loss": 0.0028246358036994935, + "step": 
191290 + }, + { + "epoch": 27.15400993612491, + "grad_norm": 1.0076727867126465, + "learning_rate": 7.285720369056069e-05, + "loss": 0.023023207485675812, + "step": 191300 + }, + { + "epoch": 27.15542938254081, + "grad_norm": 0.0076265339739620686, + "learning_rate": 7.285578424414478e-05, + "loss": 0.006774328649044037, + "step": 191310 + }, + { + "epoch": 27.156848828956708, + "grad_norm": 0.055047791451215744, + "learning_rate": 7.28543647977289e-05, + "loss": 0.04213235974311828, + "step": 191320 + }, + { + "epoch": 27.158268275372606, + "grad_norm": 3.459190607070923, + "learning_rate": 7.285294535131299e-05, + "loss": 0.030608350038528444, + "step": 191330 + }, + { + "epoch": 27.1596877217885, + "grad_norm": 0.022169923409819603, + "learning_rate": 7.285152590489709e-05, + "loss": 0.046110183000564575, + "step": 191340 + }, + { + "epoch": 27.1611071682044, + "grad_norm": 0.14559943974018097, + "learning_rate": 7.28501064584812e-05, + "loss": 0.0009686313569545746, + "step": 191350 + }, + { + "epoch": 27.162526614620297, + "grad_norm": 0.7566708922386169, + "learning_rate": 7.28486870120653e-05, + "loss": 0.011867016553878784, + "step": 191360 + }, + { + "epoch": 27.163946061036196, + "grad_norm": 4.718181133270264, + "learning_rate": 7.284726756564941e-05, + "loss": 0.03537431359291077, + "step": 191370 + }, + { + "epoch": 27.165365507452094, + "grad_norm": 0.04275204613804817, + "learning_rate": 7.28458481192335e-05, + "loss": 0.024711066484451295, + "step": 191380 + }, + { + "epoch": 27.166784953867992, + "grad_norm": 0.7030513286590576, + "learning_rate": 7.28444286728176e-05, + "loss": 0.0057178489863872525, + "step": 191390 + }, + { + "epoch": 27.16820440028389, + "grad_norm": 0.004694937728345394, + "learning_rate": 7.28430092264017e-05, + "loss": 0.004941980168223381, + "step": 191400 + }, + { + "epoch": 27.169623846699785, + "grad_norm": 2.7520062923431396, + "learning_rate": 7.284158977998581e-05, + "loss": 0.009537214785814286, + "step": 191410 + }, 
+ { + "epoch": 27.171043293115684, + "grad_norm": 0.029213711619377136, + "learning_rate": 7.284017033356991e-05, + "loss": 0.02362590730190277, + "step": 191420 + }, + { + "epoch": 27.172462739531582, + "grad_norm": 0.06821836531162262, + "learning_rate": 7.283875088715402e-05, + "loss": 0.01345895677804947, + "step": 191430 + }, + { + "epoch": 27.17388218594748, + "grad_norm": 0.011302479542791843, + "learning_rate": 7.283733144073812e-05, + "loss": 0.02245138883590698, + "step": 191440 + }, + { + "epoch": 27.17530163236338, + "grad_norm": 2.002932071685791, + "learning_rate": 7.283591199432221e-05, + "loss": 0.00673588365316391, + "step": 191450 + }, + { + "epoch": 27.176721078779277, + "grad_norm": 0.39228808879852295, + "learning_rate": 7.283449254790632e-05, + "loss": 0.010158324241638183, + "step": 191460 + }, + { + "epoch": 27.178140525195175, + "grad_norm": 4.707523345947266, + "learning_rate": 7.283307310149042e-05, + "loss": 0.007866961508989334, + "step": 191470 + }, + { + "epoch": 27.17955997161107, + "grad_norm": 0.020739568397402763, + "learning_rate": 7.283165365507453e-05, + "loss": 0.016995808482170104, + "step": 191480 + }, + { + "epoch": 27.18097941802697, + "grad_norm": 0.121962770819664, + "learning_rate": 7.283023420865863e-05, + "loss": 0.005230249464511871, + "step": 191490 + }, + { + "epoch": 27.182398864442867, + "grad_norm": 0.06573532521724701, + "learning_rate": 7.282881476224273e-05, + "loss": 0.008110152184963226, + "step": 191500 + }, + { + "epoch": 27.182398864442867, + "eval_accuracy": 0.9871558466331786, + "eval_loss": 0.05078424885869026, + "eval_runtime": 31.5442, + "eval_samples_per_second": 498.57, + "eval_steps_per_second": 15.597, + "step": 191500 + }, + { + "epoch": 27.183818310858765, + "grad_norm": 0.6670852899551392, + "learning_rate": 7.282739531582683e-05, + "loss": 0.0005929749459028244, + "step": 191510 + }, + { + "epoch": 27.185237757274663, + "grad_norm": 0.0717526525259018, + "learning_rate": 
7.282597586941094e-05, + "loss": 0.015395966172218323, + "step": 191520 + }, + { + "epoch": 27.18665720369056, + "grad_norm": 0.2663034498691559, + "learning_rate": 7.282455642299503e-05, + "loss": 0.030570638179779053, + "step": 191530 + }, + { + "epoch": 27.18807665010646, + "grad_norm": 0.03635529428720474, + "learning_rate": 7.282313697657914e-05, + "loss": 0.021543405950069427, + "step": 191540 + }, + { + "epoch": 27.189496096522355, + "grad_norm": 0.36646637320518494, + "learning_rate": 7.282171753016324e-05, + "loss": 0.005037686973810196, + "step": 191550 + }, + { + "epoch": 27.190915542938253, + "grad_norm": 0.003721645101904869, + "learning_rate": 7.282029808374734e-05, + "loss": 0.009990616142749787, + "step": 191560 + }, + { + "epoch": 27.19233498935415, + "grad_norm": 0.6375809907913208, + "learning_rate": 7.281887863733145e-05, + "loss": 0.01558317244052887, + "step": 191570 + }, + { + "epoch": 27.19375443577005, + "grad_norm": 0.012593589723110199, + "learning_rate": 7.281745919091555e-05, + "loss": 0.03950413167476654, + "step": 191580 + }, + { + "epoch": 27.195173882185948, + "grad_norm": 0.12629349529743195, + "learning_rate": 7.281603974449966e-05, + "loss": 0.023264995217323302, + "step": 191590 + }, + { + "epoch": 27.196593328601846, + "grad_norm": 0.0070425961166620255, + "learning_rate": 7.281462029808374e-05, + "loss": 0.0021388839930295946, + "step": 191600 + }, + { + "epoch": 27.198012775017745, + "grad_norm": 1.4395498037338257, + "learning_rate": 7.281320085166785e-05, + "loss": 0.007991529256105422, + "step": 191610 + }, + { + "epoch": 27.19943222143364, + "grad_norm": 0.3317471146583557, + "learning_rate": 7.281178140525195e-05, + "loss": 0.023469412326812746, + "step": 191620 + }, + { + "epoch": 27.200851667849538, + "grad_norm": 0.054901450872421265, + "learning_rate": 7.281036195883606e-05, + "loss": 0.009079059958457947, + "step": 191630 + }, + { + "epoch": 27.202271114265436, + "grad_norm": 2.459756851196289, + "learning_rate": 
7.280894251242016e-05, + "loss": 0.018339964747428893, + "step": 191640 + }, + { + "epoch": 27.203690560681334, + "grad_norm": 21.645111083984375, + "learning_rate": 7.280752306600426e-05, + "loss": 0.03264265060424805, + "step": 191650 + }, + { + "epoch": 27.205110007097232, + "grad_norm": 0.7140429019927979, + "learning_rate": 7.280610361958837e-05, + "loss": 0.01011199802160263, + "step": 191660 + }, + { + "epoch": 27.20652945351313, + "grad_norm": 0.10388354957103729, + "learning_rate": 7.280468417317246e-05, + "loss": 0.006242910772562027, + "step": 191670 + }, + { + "epoch": 27.20794889992903, + "grad_norm": 13.004758834838867, + "learning_rate": 7.280326472675658e-05, + "loss": 0.01620042622089386, + "step": 191680 + }, + { + "epoch": 27.209368346344924, + "grad_norm": 0.8808399438858032, + "learning_rate": 7.280184528034067e-05, + "loss": 0.00786823183298111, + "step": 191690 + }, + { + "epoch": 27.210787792760822, + "grad_norm": 14.583945274353027, + "learning_rate": 7.280042583392477e-05, + "loss": 0.0395379513502121, + "step": 191700 + }, + { + "epoch": 27.21220723917672, + "grad_norm": 11.186941146850586, + "learning_rate": 7.279900638750887e-05, + "loss": 0.025801566243171693, + "step": 191710 + }, + { + "epoch": 27.21362668559262, + "grad_norm": 0.5796197652816772, + "learning_rate": 7.279758694109298e-05, + "loss": 0.013778576254844665, + "step": 191720 + }, + { + "epoch": 27.215046132008517, + "grad_norm": 7.668276309967041, + "learning_rate": 7.279616749467708e-05, + "loss": 0.006299269199371338, + "step": 191730 + }, + { + "epoch": 27.216465578424415, + "grad_norm": 0.02313707023859024, + "learning_rate": 7.279474804826119e-05, + "loss": 0.018675515055656434, + "step": 191740 + }, + { + "epoch": 27.217885024840314, + "grad_norm": 0.04697905480861664, + "learning_rate": 7.279332860184528e-05, + "loss": 0.006418564170598984, + "step": 191750 + }, + { + "epoch": 27.21930447125621, + "grad_norm": 2.179476737976074, + "learning_rate": 
7.279190915542938e-05, + "loss": 0.021026183664798737, + "step": 191760 + }, + { + "epoch": 27.220723917672107, + "grad_norm": 0.5000434517860413, + "learning_rate": 7.279048970901349e-05, + "loss": 0.003215542435646057, + "step": 191770 + }, + { + "epoch": 27.222143364088005, + "grad_norm": 2.692800998687744, + "learning_rate": 7.278907026259759e-05, + "loss": 0.004678188264369965, + "step": 191780 + }, + { + "epoch": 27.223562810503903, + "grad_norm": 0.5846898555755615, + "learning_rate": 7.27876508161817e-05, + "loss": 0.0011133570224046707, + "step": 191790 + }, + { + "epoch": 27.2249822569198, + "grad_norm": 8.804787635803223, + "learning_rate": 7.278623136976578e-05, + "loss": 0.034216052293777464, + "step": 191800 + }, + { + "epoch": 27.2264017033357, + "grad_norm": 0.054185304790735245, + "learning_rate": 7.27848119233499e-05, + "loss": 0.004387175664305687, + "step": 191810 + }, + { + "epoch": 27.2278211497516, + "grad_norm": 0.02936997264623642, + "learning_rate": 7.278339247693399e-05, + "loss": 0.0015100345015525817, + "step": 191820 + }, + { + "epoch": 27.229240596167493, + "grad_norm": 0.05014582350850105, + "learning_rate": 7.27819730305181e-05, + "loss": 0.0033941134810447694, + "step": 191830 + }, + { + "epoch": 27.23066004258339, + "grad_norm": 11.994876861572266, + "learning_rate": 7.27805535841022e-05, + "loss": 0.030228492617607117, + "step": 191840 + }, + { + "epoch": 27.23207948899929, + "grad_norm": 0.2561153173446655, + "learning_rate": 7.277913413768631e-05, + "loss": 0.003010893613100052, + "step": 191850 + }, + { + "epoch": 27.233498935415188, + "grad_norm": 0.4217016398906708, + "learning_rate": 7.277771469127041e-05, + "loss": 0.009272967278957368, + "step": 191860 + }, + { + "epoch": 27.234918381831086, + "grad_norm": 5.388932228088379, + "learning_rate": 7.277629524485451e-05, + "loss": 0.002620214968919754, + "step": 191870 + }, + { + "epoch": 27.236337828246985, + "grad_norm": 1.3735383749008179, + "learning_rate": 
7.277487579843862e-05, + "loss": 0.01397559642791748, + "step": 191880 + }, + { + "epoch": 27.237757274662883, + "grad_norm": 5.27756404876709, + "learning_rate": 7.277345635202272e-05, + "loss": 0.016096514463424683, + "step": 191890 + }, + { + "epoch": 27.239176721078778, + "grad_norm": 0.020602475851774216, + "learning_rate": 7.277203690560683e-05, + "loss": 0.0009720738977193833, + "step": 191900 + }, + { + "epoch": 27.240596167494676, + "grad_norm": 0.38097840547561646, + "learning_rate": 7.277061745919091e-05, + "loss": 0.012950451672077179, + "step": 191910 + }, + { + "epoch": 27.242015613910574, + "grad_norm": 0.4531165361404419, + "learning_rate": 7.276919801277502e-05, + "loss": 0.003508702665567398, + "step": 191920 + }, + { + "epoch": 27.243435060326473, + "grad_norm": 9.191899299621582, + "learning_rate": 7.276777856635912e-05, + "loss": 0.009597922116518021, + "step": 191930 + }, + { + "epoch": 27.24485450674237, + "grad_norm": 0.06469989567995071, + "learning_rate": 7.276635911994323e-05, + "loss": 0.012405528128147126, + "step": 191940 + }, + { + "epoch": 27.24627395315827, + "grad_norm": 4.348508358001709, + "learning_rate": 7.276493967352733e-05, + "loss": 0.0231952041387558, + "step": 191950 + }, + { + "epoch": 27.247693399574167, + "grad_norm": 0.3782586455345154, + "learning_rate": 7.276352022711142e-05, + "loss": 0.023311315476894377, + "step": 191960 + }, + { + "epoch": 27.249112845990062, + "grad_norm": 0.08815713226795197, + "learning_rate": 7.276210078069554e-05, + "loss": 0.00874444842338562, + "step": 191970 + }, + { + "epoch": 27.25053229240596, + "grad_norm": 0.01843853108584881, + "learning_rate": 7.276068133427963e-05, + "loss": 0.0022217508405447004, + "step": 191980 + }, + { + "epoch": 27.25195173882186, + "grad_norm": 3.090183973312378, + "learning_rate": 7.275926188786374e-05, + "loss": 0.0028020404279232025, + "step": 191990 + }, + { + "epoch": 27.253371185237757, + "grad_norm": 0.01635570079088211, + "learning_rate": 
7.275784244144784e-05, + "loss": 0.003599552810192108, + "step": 192000 + }, + { + "epoch": 27.253371185237757, + "eval_accuracy": 0.9894449036688497, + "eval_loss": 0.043725162744522095, + "eval_runtime": 31.3866, + "eval_samples_per_second": 501.073, + "eval_steps_per_second": 15.675, + "step": 192000 + }, + { + "epoch": 27.254790631653655, + "grad_norm": 0.9335953593254089, + "learning_rate": 7.275642299503194e-05, + "loss": 0.006120849028229713, + "step": 192010 + }, + { + "epoch": 27.256210078069554, + "grad_norm": 0.1854640692472458, + "learning_rate": 7.275500354861604e-05, + "loss": 0.01742289662361145, + "step": 192020 + }, + { + "epoch": 27.257629524485452, + "grad_norm": 0.2505531907081604, + "learning_rate": 7.275358410220015e-05, + "loss": 0.05888239741325378, + "step": 192030 + }, + { + "epoch": 27.259048970901347, + "grad_norm": 0.13134899735450745, + "learning_rate": 7.275216465578424e-05, + "loss": 0.033838319778442386, + "step": 192040 + }, + { + "epoch": 27.260468417317245, + "grad_norm": 0.006814971566200256, + "learning_rate": 7.275074520936835e-05, + "loss": 0.010715234279632568, + "step": 192050 + }, + { + "epoch": 27.261887863733143, + "grad_norm": 1.2635633945465088, + "learning_rate": 7.274932576295245e-05, + "loss": 0.017574332654476166, + "step": 192060 + }, + { + "epoch": 27.26330731014904, + "grad_norm": 1.6964399814605713, + "learning_rate": 7.274790631653655e-05, + "loss": 0.016111280024051666, + "step": 192070 + }, + { + "epoch": 27.26472675656494, + "grad_norm": 0.04628168046474457, + "learning_rate": 7.274648687012066e-05, + "loss": 0.01741349995136261, + "step": 192080 + }, + { + "epoch": 27.26614620298084, + "grad_norm": 1.5139299631118774, + "learning_rate": 7.274506742370476e-05, + "loss": 0.00951283574104309, + "step": 192090 + }, + { + "epoch": 27.267565649396737, + "grad_norm": 3.5954463481903076, + "learning_rate": 7.274364797728887e-05, + "loss": 0.024058346450328828, + "step": 192100 + }, + { + "epoch": 
27.26898509581263, + "grad_norm": 5.590195655822754, + "learning_rate": 7.274222853087295e-05, + "loss": 0.0033436667174100874, + "step": 192110 + }, + { + "epoch": 27.27040454222853, + "grad_norm": 3.6502912044525146, + "learning_rate": 7.274080908445706e-05, + "loss": 0.023967276513576507, + "step": 192120 + }, + { + "epoch": 27.271823988644428, + "grad_norm": 5.5545525550842285, + "learning_rate": 7.273938963804116e-05, + "loss": 0.042171865701675415, + "step": 192130 + }, + { + "epoch": 27.273243435060326, + "grad_norm": 1.9402835369110107, + "learning_rate": 7.273797019162527e-05, + "loss": 0.0029782522469758987, + "step": 192140 + }, + { + "epoch": 27.274662881476225, + "grad_norm": 7.670345306396484, + "learning_rate": 7.273655074520937e-05, + "loss": 0.021824659407138826, + "step": 192150 + }, + { + "epoch": 27.276082327892123, + "grad_norm": 10.063202857971191, + "learning_rate": 7.273527324343507e-05, + "loss": 0.06295344829559327, + "step": 192160 + }, + { + "epoch": 27.27750177430802, + "grad_norm": 15.409021377563477, + "learning_rate": 7.273385379701916e-05, + "loss": 0.06572320461273193, + "step": 192170 + }, + { + "epoch": 27.278921220723916, + "grad_norm": 0.07848988473415375, + "learning_rate": 7.273243435060328e-05, + "loss": 0.030690982937812805, + "step": 192180 + }, + { + "epoch": 27.280340667139814, + "grad_norm": 0.29228296875953674, + "learning_rate": 7.273101490418737e-05, + "loss": 0.0058541204780340195, + "step": 192190 + }, + { + "epoch": 27.281760113555713, + "grad_norm": 0.006753263063728809, + "learning_rate": 7.272959545777147e-05, + "loss": 0.0022713787853717803, + "step": 192200 + }, + { + "epoch": 27.28317955997161, + "grad_norm": 0.020971564576029778, + "learning_rate": 7.272817601135558e-05, + "loss": 0.0069107115268707275, + "step": 192210 + }, + { + "epoch": 27.28459900638751, + "grad_norm": 0.1624942421913147, + "learning_rate": 7.272675656493968e-05, + "loss": 0.012769517302513123, + "step": 192220 + }, + { + "epoch": 
27.286018452803408, + "grad_norm": 0.49525171518325806, + "learning_rate": 7.272533711852379e-05, + "loss": 0.0008503921329975128, + "step": 192230 + }, + { + "epoch": 27.287437899219306, + "grad_norm": 0.10035304725170135, + "learning_rate": 7.272391767210787e-05, + "loss": 0.03760080933570862, + "step": 192240 + }, + { + "epoch": 27.2888573456352, + "grad_norm": 0.16340743005275726, + "learning_rate": 7.272249822569198e-05, + "loss": 0.03274021744728088, + "step": 192250 + }, + { + "epoch": 27.2902767920511, + "grad_norm": 12.497358322143555, + "learning_rate": 7.272107877927608e-05, + "loss": 0.03415640294551849, + "step": 192260 + }, + { + "epoch": 27.291696238466997, + "grad_norm": 0.06945726275444031, + "learning_rate": 7.271965933286019e-05, + "loss": 0.0036655433475971224, + "step": 192270 + }, + { + "epoch": 27.293115684882896, + "grad_norm": 0.935482919216156, + "learning_rate": 7.271823988644429e-05, + "loss": 0.03622791469097138, + "step": 192280 + }, + { + "epoch": 27.294535131298794, + "grad_norm": 5.749697685241699, + "learning_rate": 7.271682044002839e-05, + "loss": 0.01534755527973175, + "step": 192290 + }, + { + "epoch": 27.295954577714692, + "grad_norm": 0.24894388020038605, + "learning_rate": 7.27154009936125e-05, + "loss": 0.011195748299360275, + "step": 192300 + }, + { + "epoch": 27.29737402413059, + "grad_norm": 0.5875866413116455, + "learning_rate": 7.27139815471966e-05, + "loss": 0.004829529300332069, + "step": 192310 + }, + { + "epoch": 27.298793470546485, + "grad_norm": 0.35514646768569946, + "learning_rate": 7.271256210078071e-05, + "loss": 0.030643638968467713, + "step": 192320 + }, + { + "epoch": 27.300212916962384, + "grad_norm": 3.8799221515655518, + "learning_rate": 7.27111426543648e-05, + "loss": 0.009128369390964508, + "step": 192330 + }, + { + "epoch": 27.301632363378282, + "grad_norm": 3.541836977005005, + "learning_rate": 7.27097232079489e-05, + "loss": 0.04575211405754089, + "step": 192340 + }, + { + "epoch": 
27.30305180979418, + "grad_norm": 10.362959861755371, + "learning_rate": 7.2708303761533e-05, + "loss": 0.016270115971565247, + "step": 192350 + }, + { + "epoch": 27.30447125621008, + "grad_norm": 10.71204948425293, + "learning_rate": 7.270688431511711e-05, + "loss": 0.035083499550819394, + "step": 192360 + }, + { + "epoch": 27.305890702625977, + "grad_norm": 13.9587984085083, + "learning_rate": 7.270546486870121e-05, + "loss": 0.008557512611150741, + "step": 192370 + }, + { + "epoch": 27.307310149041875, + "grad_norm": 12.791983604431152, + "learning_rate": 7.270404542228532e-05, + "loss": 0.039958652853965757, + "step": 192380 + }, + { + "epoch": 27.30872959545777, + "grad_norm": 0.03249330446124077, + "learning_rate": 7.270262597586942e-05, + "loss": 0.016773785650730132, + "step": 192390 + }, + { + "epoch": 27.310149041873668, + "grad_norm": 3.2128078937530518, + "learning_rate": 7.270120652945351e-05, + "loss": 0.004736151546239853, + "step": 192400 + }, + { + "epoch": 27.311568488289566, + "grad_norm": 0.008798387832939625, + "learning_rate": 7.269978708303762e-05, + "loss": 0.014683310687541962, + "step": 192410 + }, + { + "epoch": 27.312987934705465, + "grad_norm": 0.09853330999612808, + "learning_rate": 7.269836763662172e-05, + "loss": 0.06459051370620728, + "step": 192420 + }, + { + "epoch": 27.314407381121363, + "grad_norm": 7.659812927246094, + "learning_rate": 7.269694819020583e-05, + "loss": 0.02552286982536316, + "step": 192430 + }, + { + "epoch": 27.31582682753726, + "grad_norm": 0.6270888447761536, + "learning_rate": 7.269552874378992e-05, + "loss": 0.025262367725372315, + "step": 192440 + }, + { + "epoch": 27.31724627395316, + "grad_norm": 0.06060485914349556, + "learning_rate": 7.269410929737403e-05, + "loss": 0.019373589754104616, + "step": 192450 + }, + { + "epoch": 27.318665720369054, + "grad_norm": 4.697579383850098, + "learning_rate": 7.269268985095812e-05, + "loss": 0.011298523843288421, + "step": 192460 + }, + { + "epoch": 
27.320085166784953, + "grad_norm": 1.1738414764404297, + "learning_rate": 7.269127040454224e-05, + "loss": 0.021178680658340453, + "step": 192470 + }, + { + "epoch": 27.32150461320085, + "grad_norm": 1.1079069375991821, + "learning_rate": 7.268985095812633e-05, + "loss": 0.014105400443077088, + "step": 192480 + }, + { + "epoch": 27.32292405961675, + "grad_norm": 13.901383399963379, + "learning_rate": 7.268843151171044e-05, + "loss": 0.0918906033039093, + "step": 192490 + }, + { + "epoch": 27.324343506032648, + "grad_norm": 6.123275279998779, + "learning_rate": 7.268701206529454e-05, + "loss": 0.012577636539936066, + "step": 192500 + }, + { + "epoch": 27.324343506032648, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.05263574421405792, + "eval_runtime": 30.6074, + "eval_samples_per_second": 513.829, + "eval_steps_per_second": 16.075, + "step": 192500 + }, + { + "epoch": 27.325762952448546, + "grad_norm": 0.28537851572036743, + "learning_rate": 7.268559261887864e-05, + "loss": 0.01684686541557312, + "step": 192510 + }, + { + "epoch": 27.327182398864444, + "grad_norm": 0.6246315836906433, + "learning_rate": 7.268417317246275e-05, + "loss": 0.00737549364566803, + "step": 192520 + }, + { + "epoch": 27.32860184528034, + "grad_norm": 0.5183212161064148, + "learning_rate": 7.268275372604685e-05, + "loss": 0.034848323464393614, + "step": 192530 + }, + { + "epoch": 27.330021291696237, + "grad_norm": 0.12499444931745529, + "learning_rate": 7.268133427963096e-05, + "loss": 0.005743818357586861, + "step": 192540 + }, + { + "epoch": 27.331440738112136, + "grad_norm": 0.2816559672355652, + "learning_rate": 7.267991483321504e-05, + "loss": 0.009364154189825058, + "step": 192550 + }, + { + "epoch": 27.332860184528034, + "grad_norm": 9.78445053100586, + "learning_rate": 7.267849538679915e-05, + "loss": 0.008307234942913055, + "step": 192560 + }, + { + "epoch": 27.334279630943932, + "grad_norm": 1.0602397918701172, + "learning_rate": 7.267707594038325e-05, + "loss": 
0.005616111680865288, + "step": 192570 + }, + { + "epoch": 27.33569907735983, + "grad_norm": 0.19451405107975006, + "learning_rate": 7.267565649396736e-05, + "loss": 0.021198779344558716, + "step": 192580 + }, + { + "epoch": 27.33711852377573, + "grad_norm": 1.3735493421554565, + "learning_rate": 7.267423704755146e-05, + "loss": 0.014619407057762147, + "step": 192590 + }, + { + "epoch": 27.338537970191624, + "grad_norm": 0.2826090157032013, + "learning_rate": 7.267281760113556e-05, + "loss": 0.004455520212650299, + "step": 192600 + }, + { + "epoch": 27.339957416607522, + "grad_norm": 0.02483578585088253, + "learning_rate": 7.267139815471967e-05, + "loss": 0.012232869863510132, + "step": 192610 + }, + { + "epoch": 27.34137686302342, + "grad_norm": 0.4757656753063202, + "learning_rate": 7.266997870830376e-05, + "loss": 0.004407022520899773, + "step": 192620 + }, + { + "epoch": 27.34279630943932, + "grad_norm": 0.4853677749633789, + "learning_rate": 7.266855926188787e-05, + "loss": 0.0016824111342430114, + "step": 192630 + }, + { + "epoch": 27.344215755855217, + "grad_norm": 4.160188674926758, + "learning_rate": 7.266713981547197e-05, + "loss": 0.012746265530586243, + "step": 192640 + }, + { + "epoch": 27.345635202271115, + "grad_norm": 0.2907813489437103, + "learning_rate": 7.266572036905607e-05, + "loss": 0.003908243030309677, + "step": 192650 + }, + { + "epoch": 27.347054648687013, + "grad_norm": 0.032432492822408676, + "learning_rate": 7.266430092264017e-05, + "loss": 0.01395152062177658, + "step": 192660 + }, + { + "epoch": 27.348474095102908, + "grad_norm": 0.019541189074516296, + "learning_rate": 7.266288147622428e-05, + "loss": 0.01312016397714615, + "step": 192670 + }, + { + "epoch": 27.349893541518806, + "grad_norm": 0.25598546862602234, + "learning_rate": 7.266146202980838e-05, + "loss": 0.018972185254096986, + "step": 192680 + }, + { + "epoch": 27.351312987934705, + "grad_norm": 0.21253684163093567, + "learning_rate": 7.266004258339249e-05, + "loss": 
0.011956007778644561, + "step": 192690 + }, + { + "epoch": 27.352732434350603, + "grad_norm": 0.9690915942192078, + "learning_rate": 7.265862313697658e-05, + "loss": 0.008505237102508546, + "step": 192700 + }, + { + "epoch": 27.3541518807665, + "grad_norm": 0.19429108500480652, + "learning_rate": 7.265720369056068e-05, + "loss": 0.010507977753877639, + "step": 192710 + }, + { + "epoch": 27.3555713271824, + "grad_norm": 6.002783298492432, + "learning_rate": 7.265578424414479e-05, + "loss": 0.009403792023658753, + "step": 192720 + }, + { + "epoch": 27.356990773598298, + "grad_norm": 0.1031627282500267, + "learning_rate": 7.265436479772889e-05, + "loss": 0.00461808480322361, + "step": 192730 + }, + { + "epoch": 27.358410220014193, + "grad_norm": 0.07485984265804291, + "learning_rate": 7.2652945351313e-05, + "loss": 0.014574369788169861, + "step": 192740 + }, + { + "epoch": 27.35982966643009, + "grad_norm": 2.4415688514709473, + "learning_rate": 7.265152590489708e-05, + "loss": 0.005652159824967385, + "step": 192750 + }, + { + "epoch": 27.36124911284599, + "grad_norm": 6.611206531524658, + "learning_rate": 7.26501064584812e-05, + "loss": 0.05050676465034485, + "step": 192760 + }, + { + "epoch": 27.362668559261888, + "grad_norm": 0.8058321475982666, + "learning_rate": 7.264868701206529e-05, + "loss": 0.02901065945625305, + "step": 192770 + }, + { + "epoch": 27.364088005677786, + "grad_norm": 0.6232451796531677, + "learning_rate": 7.26472675656494e-05, + "loss": 0.02761862874031067, + "step": 192780 + }, + { + "epoch": 27.365507452093684, + "grad_norm": 6.8258466720581055, + "learning_rate": 7.26458481192335e-05, + "loss": 0.02263970971107483, + "step": 192790 + }, + { + "epoch": 27.366926898509583, + "grad_norm": 0.2924034893512726, + "learning_rate": 7.26444286728176e-05, + "loss": 0.07118003964424133, + "step": 192800 + }, + { + "epoch": 27.368346344925477, + "grad_norm": 0.036500655114650726, + "learning_rate": 7.264300922640171e-05, + "loss": 0.016094301640987397, + 
"step": 192810 + }, + { + "epoch": 27.369765791341376, + "grad_norm": 0.10804445296525955, + "learning_rate": 7.26415897799858e-05, + "loss": 0.0072418123483657835, + "step": 192820 + }, + { + "epoch": 27.371185237757274, + "grad_norm": 0.04723421111702919, + "learning_rate": 7.264017033356992e-05, + "loss": 0.006356345117092132, + "step": 192830 + }, + { + "epoch": 27.372604684173172, + "grad_norm": 0.0451689213514328, + "learning_rate": 7.263875088715401e-05, + "loss": 0.006957966089248657, + "step": 192840 + }, + { + "epoch": 27.37402413058907, + "grad_norm": 0.009951244108378887, + "learning_rate": 7.263733144073813e-05, + "loss": 0.0016916975378990172, + "step": 192850 + }, + { + "epoch": 27.37544357700497, + "grad_norm": 0.5159999132156372, + "learning_rate": 7.263591199432221e-05, + "loss": 0.001369594782590866, + "step": 192860 + }, + { + "epoch": 27.376863023420867, + "grad_norm": 0.031101446598768234, + "learning_rate": 7.263449254790632e-05, + "loss": 0.006752415746450424, + "step": 192870 + }, + { + "epoch": 27.378282469836762, + "grad_norm": 1.2147598266601562, + "learning_rate": 7.263307310149042e-05, + "loss": 0.0026787161827087403, + "step": 192880 + }, + { + "epoch": 27.37970191625266, + "grad_norm": 0.3453077971935272, + "learning_rate": 7.263165365507453e-05, + "loss": 0.001881706714630127, + "step": 192890 + }, + { + "epoch": 27.38112136266856, + "grad_norm": 10.431466102600098, + "learning_rate": 7.263023420865863e-05, + "loss": 0.009825704991817475, + "step": 192900 + }, + { + "epoch": 27.382540809084457, + "grad_norm": 2.1506569385528564, + "learning_rate": 7.262881476224272e-05, + "loss": 0.015144245326519012, + "step": 192910 + }, + { + "epoch": 27.383960255500355, + "grad_norm": 0.19058628380298615, + "learning_rate": 7.262739531582683e-05, + "loss": 0.036941546201705935, + "step": 192920 + }, + { + "epoch": 27.385379701916253, + "grad_norm": 0.010055413469672203, + "learning_rate": 7.262597586941093e-05, + "loss": 0.004380850121378899, + 
"step": 192930 + }, + { + "epoch": 27.386799148332152, + "grad_norm": 0.05052359029650688, + "learning_rate": 7.262455642299504e-05, + "loss": 0.019149002432823182, + "step": 192940 + }, + { + "epoch": 27.388218594748047, + "grad_norm": 0.02068670280277729, + "learning_rate": 7.262313697657914e-05, + "loss": 0.007971327006816863, + "step": 192950 + }, + { + "epoch": 27.389638041163945, + "grad_norm": 0.15018948912620544, + "learning_rate": 7.262171753016324e-05, + "loss": 0.0241265133023262, + "step": 192960 + }, + { + "epoch": 27.391057487579843, + "grad_norm": 0.11812274158000946, + "learning_rate": 7.262029808374733e-05, + "loss": 0.007207539677619934, + "step": 192970 + }, + { + "epoch": 27.39247693399574, + "grad_norm": 0.5834751129150391, + "learning_rate": 7.261887863733145e-05, + "loss": 0.028163719177246093, + "step": 192980 + }, + { + "epoch": 27.39389638041164, + "grad_norm": 0.007938174530863762, + "learning_rate": 7.261745919091554e-05, + "loss": 0.029203468561172487, + "step": 192990 + }, + { + "epoch": 27.395315826827538, + "grad_norm": 0.06520994752645493, + "learning_rate": 7.261603974449965e-05, + "loss": 0.005591839551925659, + "step": 193000 + }, + { + "epoch": 27.395315826827538, + "eval_accuracy": 0.9881732053156991, + "eval_loss": 0.05294051021337509, + "eval_runtime": 31.5585, + "eval_samples_per_second": 498.344, + "eval_steps_per_second": 15.59, + "step": 193000 + }, + { + "epoch": 27.396735273243436, + "grad_norm": 0.905492901802063, + "learning_rate": 7.261462029808375e-05, + "loss": 0.012733143568038941, + "step": 193010 + }, + { + "epoch": 27.39815471965933, + "grad_norm": 0.020528074353933334, + "learning_rate": 7.261320085166785e-05, + "loss": 0.017116999626159667, + "step": 193020 + }, + { + "epoch": 27.39957416607523, + "grad_norm": 0.1567072868347168, + "learning_rate": 7.261178140525196e-05, + "loss": 0.00597529523074627, + "step": 193030 + }, + { + "epoch": 27.400993612491128, + "grad_norm": 3.7888786792755127, + 
"learning_rate": 7.261036195883606e-05, + "loss": 0.021745382249355315, + "step": 193040 + }, + { + "epoch": 27.402413058907026, + "grad_norm": 0.05655008181929588, + "learning_rate": 7.260894251242017e-05, + "loss": 0.0047262925654649734, + "step": 193050 + }, + { + "epoch": 27.403832505322924, + "grad_norm": 0.012928323820233345, + "learning_rate": 7.260752306600425e-05, + "loss": 0.005807017534971237, + "step": 193060 + }, + { + "epoch": 27.405251951738823, + "grad_norm": 0.504296600818634, + "learning_rate": 7.260610361958836e-05, + "loss": 0.03884035050868988, + "step": 193070 + }, + { + "epoch": 27.40667139815472, + "grad_norm": 0.18279112875461578, + "learning_rate": 7.260468417317246e-05, + "loss": 0.01447649598121643, + "step": 193080 + }, + { + "epoch": 27.408090844570616, + "grad_norm": 1.2084641456604004, + "learning_rate": 7.260326472675657e-05, + "loss": 0.03102928400039673, + "step": 193090 + }, + { + "epoch": 27.409510290986514, + "grad_norm": 2.858085870742798, + "learning_rate": 7.260184528034067e-05, + "loss": 0.003287140280008316, + "step": 193100 + }, + { + "epoch": 27.410929737402412, + "grad_norm": 7.392579078674316, + "learning_rate": 7.260042583392477e-05, + "loss": 0.08295182585716247, + "step": 193110 + }, + { + "epoch": 27.41234918381831, + "grad_norm": 2.484957456588745, + "learning_rate": 7.259900638750888e-05, + "loss": 0.03063863515853882, + "step": 193120 + }, + { + "epoch": 27.41376863023421, + "grad_norm": 1.6492180824279785, + "learning_rate": 7.259758694109297e-05, + "loss": 0.004266216978430748, + "step": 193130 + }, + { + "epoch": 27.415188076650107, + "grad_norm": 0.037830788642168045, + "learning_rate": 7.259616749467708e-05, + "loss": 0.008016214519739152, + "step": 193140 + }, + { + "epoch": 27.416607523066006, + "grad_norm": 0.2279769778251648, + "learning_rate": 7.259474804826118e-05, + "loss": 0.008768919855356216, + "step": 193150 + }, + { + "epoch": 27.4180269694819, + "grad_norm": 8.310822486877441, + 
"learning_rate": 7.259332860184528e-05, + "loss": 0.03413282632827759, + "step": 193160 + }, + { + "epoch": 27.4194464158978, + "grad_norm": 0.02976670302450657, + "learning_rate": 7.259190915542938e-05, + "loss": 0.01064833700656891, + "step": 193170 + }, + { + "epoch": 27.420865862313697, + "grad_norm": 2.6944215297698975, + "learning_rate": 7.259048970901349e-05, + "loss": 0.012797209620475768, + "step": 193180 + }, + { + "epoch": 27.422285308729595, + "grad_norm": 0.02363651618361473, + "learning_rate": 7.258907026259759e-05, + "loss": 0.016674244403839113, + "step": 193190 + }, + { + "epoch": 27.423704755145494, + "grad_norm": 0.04916385933756828, + "learning_rate": 7.25876508161817e-05, + "loss": 0.004124181345105171, + "step": 193200 + }, + { + "epoch": 27.425124201561392, + "grad_norm": 2.7386929988861084, + "learning_rate": 7.258623136976581e-05, + "loss": 0.024037326872348785, + "step": 193210 + }, + { + "epoch": 27.42654364797729, + "grad_norm": 0.1105380579829216, + "learning_rate": 7.258481192334989e-05, + "loss": 0.01785142868757248, + "step": 193220 + }, + { + "epoch": 27.427963094393185, + "grad_norm": 0.3431069850921631, + "learning_rate": 7.2583392476934e-05, + "loss": 0.007657737284898758, + "step": 193230 + }, + { + "epoch": 27.429382540809083, + "grad_norm": 0.24978198111057281, + "learning_rate": 7.25819730305181e-05, + "loss": 0.033359667658805846, + "step": 193240 + }, + { + "epoch": 27.43080198722498, + "grad_norm": 0.2176293432712555, + "learning_rate": 7.258055358410221e-05, + "loss": 0.004822316020727158, + "step": 193250 + }, + { + "epoch": 27.43222143364088, + "grad_norm": 0.007598851807415485, + "learning_rate": 7.257913413768631e-05, + "loss": 0.016088399291038512, + "step": 193260 + }, + { + "epoch": 27.433640880056778, + "grad_norm": 0.11797560751438141, + "learning_rate": 7.25777146912704e-05, + "loss": 0.008596062660217285, + "step": 193270 + }, + { + "epoch": 27.435060326472676, + "grad_norm": 0.19256022572517395, + 
"learning_rate": 7.25762952448545e-05, + "loss": 0.0031682148575782774, + "step": 193280 + }, + { + "epoch": 27.436479772888575, + "grad_norm": 5.535938739776611, + "learning_rate": 7.257487579843861e-05, + "loss": 0.004862877726554871, + "step": 193290 + }, + { + "epoch": 27.43789921930447, + "grad_norm": 5.54543399810791, + "learning_rate": 7.257345635202272e-05, + "loss": 0.046193286776542664, + "step": 193300 + }, + { + "epoch": 27.439318665720368, + "grad_norm": 0.44455868005752563, + "learning_rate": 7.257203690560682e-05, + "loss": 0.027576729655265808, + "step": 193310 + }, + { + "epoch": 27.440738112136266, + "grad_norm": 0.33059024810791016, + "learning_rate": 7.257061745919092e-05, + "loss": 0.004559338092803955, + "step": 193320 + }, + { + "epoch": 27.442157558552164, + "grad_norm": 0.8335413932800293, + "learning_rate": 7.256919801277502e-05, + "loss": 0.005940182134509087, + "step": 193330 + }, + { + "epoch": 27.443577004968063, + "grad_norm": 0.10758711397647858, + "learning_rate": 7.256777856635913e-05, + "loss": 0.013065491616725922, + "step": 193340 + }, + { + "epoch": 27.44499645138396, + "grad_norm": 2.6789984703063965, + "learning_rate": 7.256635911994322e-05, + "loss": 0.027133998274803162, + "step": 193350 + }, + { + "epoch": 27.44641589779986, + "grad_norm": 0.029700249433517456, + "learning_rate": 7.256493967352734e-05, + "loss": 0.018335330486297607, + "step": 193360 + }, + { + "epoch": 27.447835344215754, + "grad_norm": 1.7988362312316895, + "learning_rate": 7.256352022711142e-05, + "loss": 0.05142894983291626, + "step": 193370 + }, + { + "epoch": 27.449254790631652, + "grad_norm": 2.069445848464966, + "learning_rate": 7.256210078069553e-05, + "loss": 0.013165530562400819, + "step": 193380 + }, + { + "epoch": 27.45067423704755, + "grad_norm": 0.06755340844392776, + "learning_rate": 7.256068133427963e-05, + "loss": 0.0100299634039402, + "step": 193390 + }, + { + "epoch": 27.45209368346345, + "grad_norm": 0.12241218239068985, + 
"learning_rate": 7.255926188786374e-05, + "loss": 0.004058561846613884, + "step": 193400 + }, + { + "epoch": 27.453513129879347, + "grad_norm": 2.394653797149658, + "learning_rate": 7.255784244144785e-05, + "loss": 0.01110411286354065, + "step": 193410 + }, + { + "epoch": 27.454932576295246, + "grad_norm": 0.11674730479717255, + "learning_rate": 7.255642299503193e-05, + "loss": 0.004887010902166367, + "step": 193420 + }, + { + "epoch": 27.456352022711144, + "grad_norm": 0.09779095649719238, + "learning_rate": 7.255500354861604e-05, + "loss": 0.010445123910903931, + "step": 193430 + }, + { + "epoch": 27.45777146912704, + "grad_norm": 0.1291721910238266, + "learning_rate": 7.255358410220014e-05, + "loss": 0.0046006724238395694, + "step": 193440 + }, + { + "epoch": 27.459190915542937, + "grad_norm": 0.0853378176689148, + "learning_rate": 7.255216465578425e-05, + "loss": 0.02443172484636307, + "step": 193450 + }, + { + "epoch": 27.460610361958835, + "grad_norm": 9.036330223083496, + "learning_rate": 7.255074520936835e-05, + "loss": 0.033898597955703734, + "step": 193460 + }, + { + "epoch": 27.462029808374734, + "grad_norm": 0.11886949092149734, + "learning_rate": 7.254932576295245e-05, + "loss": 0.009515132755041122, + "step": 193470 + }, + { + "epoch": 27.463449254790632, + "grad_norm": 0.17238454520702362, + "learning_rate": 7.254790631653654e-05, + "loss": 0.03461544811725616, + "step": 193480 + }, + { + "epoch": 27.46486870120653, + "grad_norm": 0.10988962650299072, + "learning_rate": 7.254648687012066e-05, + "loss": 0.009623764455318451, + "step": 193490 + }, + { + "epoch": 27.46628814762243, + "grad_norm": 13.527241706848145, + "learning_rate": 7.254506742370477e-05, + "loss": 0.01566622853279114, + "step": 193500 + }, + { + "epoch": 27.46628814762243, + "eval_accuracy": 0.9896356584218223, + "eval_loss": 0.038744013756513596, + "eval_runtime": 31.4642, + "eval_samples_per_second": 499.838, + "eval_steps_per_second": 15.637, + "step": 193500 + }, + { + "epoch": 
27.467707594038323, + "grad_norm": 10.137734413146973, + "learning_rate": 7.254364797728886e-05, + "loss": 0.06885168552398682, + "step": 193510 + }, + { + "epoch": 27.46912704045422, + "grad_norm": 1.6174707412719727, + "learning_rate": 7.254222853087296e-05, + "loss": 0.015177835524082185, + "step": 193520 + }, + { + "epoch": 27.47054648687012, + "grad_norm": 2.245093584060669, + "learning_rate": 7.254080908445706e-05, + "loss": 0.005404622852802276, + "step": 193530 + }, + { + "epoch": 27.471965933286018, + "grad_norm": 3.3633434772491455, + "learning_rate": 7.253938963804117e-05, + "loss": 0.01857292354106903, + "step": 193540 + }, + { + "epoch": 27.473385379701917, + "grad_norm": 0.059933871030807495, + "learning_rate": 7.253797019162527e-05, + "loss": 0.02881430685520172, + "step": 193550 + }, + { + "epoch": 27.474804826117815, + "grad_norm": 0.05531551316380501, + "learning_rate": 7.253655074520938e-05, + "loss": 0.021489013731479645, + "step": 193560 + }, + { + "epoch": 27.476224272533713, + "grad_norm": 0.8936261534690857, + "learning_rate": 7.253513129879348e-05, + "loss": 0.038233527541160585, + "step": 193570 + }, + { + "epoch": 27.477643718949608, + "grad_norm": 1.2794357538223267, + "learning_rate": 7.253371185237757e-05, + "loss": 0.021727338433265686, + "step": 193580 + }, + { + "epoch": 27.479063165365506, + "grad_norm": 0.18674349784851074, + "learning_rate": 7.253229240596168e-05, + "loss": 0.027194038033485413, + "step": 193590 + }, + { + "epoch": 27.480482611781405, + "grad_norm": 5.173076152801514, + "learning_rate": 7.253087295954578e-05, + "loss": 0.008458210527896881, + "step": 193600 + }, + { + "epoch": 27.481902058197303, + "grad_norm": 0.38695406913757324, + "learning_rate": 7.252945351312989e-05, + "loss": 0.031680533289909364, + "step": 193610 + }, + { + "epoch": 27.4833215046132, + "grad_norm": 7.293417453765869, + "learning_rate": 7.252803406671399e-05, + "loss": 0.012097161263227463, + "step": 193620 + }, + { + "epoch": 
27.4847409510291, + "grad_norm": 2.9026196002960205, + "learning_rate": 7.252661462029809e-05, + "loss": 0.055040210485458374, + "step": 193630 + }, + { + "epoch": 27.486160397444998, + "grad_norm": 0.6467548608779907, + "learning_rate": 7.252519517388218e-05, + "loss": 0.007366280257701874, + "step": 193640 + }, + { + "epoch": 27.487579843860892, + "grad_norm": 0.14186537265777588, + "learning_rate": 7.25237757274663e-05, + "loss": 0.019501134753227234, + "step": 193650 + }, + { + "epoch": 27.48899929027679, + "grad_norm": 0.027431383728981018, + "learning_rate": 7.252235628105039e-05, + "loss": 0.001647978276014328, + "step": 193660 + }, + { + "epoch": 27.49041873669269, + "grad_norm": 0.24745650589466095, + "learning_rate": 7.25209368346345e-05, + "loss": 0.005335744097828865, + "step": 193670 + }, + { + "epoch": 27.491838183108587, + "grad_norm": 0.023845994845032692, + "learning_rate": 7.25195173882186e-05, + "loss": 0.02104470133781433, + "step": 193680 + }, + { + "epoch": 27.493257629524486, + "grad_norm": 0.13914403319358826, + "learning_rate": 7.25180979418027e-05, + "loss": 0.048237618803977964, + "step": 193690 + }, + { + "epoch": 27.494677075940384, + "grad_norm": 0.1331053376197815, + "learning_rate": 7.251667849538681e-05, + "loss": 0.011075158417224885, + "step": 193700 + }, + { + "epoch": 27.496096522356282, + "grad_norm": 1.1499992609024048, + "learning_rate": 7.25152590489709e-05, + "loss": 0.0041279088705778125, + "step": 193710 + }, + { + "epoch": 27.497515968772177, + "grad_norm": 4.042318344116211, + "learning_rate": 7.251383960255502e-05, + "loss": 0.015175622701644898, + "step": 193720 + }, + { + "epoch": 27.498935415188075, + "grad_norm": 1.68840754032135, + "learning_rate": 7.25124201561391e-05, + "loss": 0.02406703233718872, + "step": 193730 + }, + { + "epoch": 27.500354861603974, + "grad_norm": 0.016986539587378502, + "learning_rate": 7.251100070972321e-05, + "loss": 0.03199329078197479, + "step": 193740 + }, + { + "epoch": 
27.501774308019872, + "grad_norm": 0.05986368656158447, + "learning_rate": 7.250958126330731e-05, + "loss": 0.02616932988166809, + "step": 193750 + }, + { + "epoch": 27.50319375443577, + "grad_norm": 0.4170517921447754, + "learning_rate": 7.250816181689142e-05, + "loss": 0.00861719697713852, + "step": 193760 + }, + { + "epoch": 27.50461320085167, + "grad_norm": 0.5403091311454773, + "learning_rate": 7.250674237047552e-05, + "loss": 0.0015028126537799836, + "step": 193770 + }, + { + "epoch": 27.506032647267567, + "grad_norm": 0.25221922993659973, + "learning_rate": 7.250532292405961e-05, + "loss": 0.02630663812160492, + "step": 193780 + }, + { + "epoch": 27.50745209368346, + "grad_norm": 6.1998796463012695, + "learning_rate": 7.250390347764373e-05, + "loss": 0.009553057700395584, + "step": 193790 + }, + { + "epoch": 27.50887154009936, + "grad_norm": 0.013890617527067661, + "learning_rate": 7.250248403122782e-05, + "loss": 0.0032037150114774702, + "step": 193800 + }, + { + "epoch": 27.51029098651526, + "grad_norm": 8.757580757141113, + "learning_rate": 7.250106458481193e-05, + "loss": 0.007738690078258515, + "step": 193810 + }, + { + "epoch": 27.511710432931157, + "grad_norm": 0.013722538016736507, + "learning_rate": 7.249964513839603e-05, + "loss": 0.01079782247543335, + "step": 193820 + }, + { + "epoch": 27.513129879347055, + "grad_norm": 0.06550579518079758, + "learning_rate": 7.249822569198013e-05, + "loss": 0.0040745589882135395, + "step": 193830 + }, + { + "epoch": 27.514549325762953, + "grad_norm": 9.426186561584473, + "learning_rate": 7.249680624556423e-05, + "loss": 0.047920146584510805, + "step": 193840 + }, + { + "epoch": 27.51596877217885, + "grad_norm": 0.13324980437755585, + "learning_rate": 7.249538679914834e-05, + "loss": 0.001703895628452301, + "step": 193850 + }, + { + "epoch": 27.517388218594746, + "grad_norm": 0.058988623321056366, + "learning_rate": 7.249396735273243e-05, + "loss": 0.025460436940193176, + "step": 193860 + }, + { + "epoch": 
27.518807665010645, + "grad_norm": 0.7023128867149353, + "learning_rate": 7.249254790631655e-05, + "loss": 0.009439042955636977, + "step": 193870 + }, + { + "epoch": 27.520227111426543, + "grad_norm": 0.19634778797626495, + "learning_rate": 7.249112845990064e-05, + "loss": 0.012955144047737122, + "step": 193880 + }, + { + "epoch": 27.52164655784244, + "grad_norm": 0.08518175780773163, + "learning_rate": 7.248970901348474e-05, + "loss": 0.0012576386332511901, + "step": 193890 + }, + { + "epoch": 27.52306600425834, + "grad_norm": 0.031583890318870544, + "learning_rate": 7.248828956706885e-05, + "loss": 0.0020586978644132614, + "step": 193900 + }, + { + "epoch": 27.524485450674238, + "grad_norm": 0.1708974987268448, + "learning_rate": 7.248687012065295e-05, + "loss": 0.004543206095695496, + "step": 193910 + }, + { + "epoch": 27.525904897090136, + "grad_norm": 1.978804349899292, + "learning_rate": 7.248545067423706e-05, + "loss": 0.009185792505741119, + "step": 193920 + }, + { + "epoch": 27.52732434350603, + "grad_norm": 0.02831723354756832, + "learning_rate": 7.248403122782116e-05, + "loss": 0.006884568184614181, + "step": 193930 + }, + { + "epoch": 27.52874378992193, + "grad_norm": 0.010911201126873493, + "learning_rate": 7.248261178140525e-05, + "loss": 0.013623172044754028, + "step": 193940 + }, + { + "epoch": 27.530163236337827, + "grad_norm": 0.005236917175352573, + "learning_rate": 7.248119233498935e-05, + "loss": 0.0023550845682621, + "step": 193950 + }, + { + "epoch": 27.531582682753726, + "grad_norm": 0.16717736423015594, + "learning_rate": 7.247977288857346e-05, + "loss": 0.0028717577457427978, + "step": 193960 + }, + { + "epoch": 27.533002129169624, + "grad_norm": 0.3024803102016449, + "learning_rate": 7.247835344215756e-05, + "loss": 0.008090399205684662, + "step": 193970 + }, + { + "epoch": 27.534421575585522, + "grad_norm": 0.0537935309112072, + "learning_rate": 7.247693399574167e-05, + "loss": 0.0198883056640625, + "step": 193980 + }, + { + "epoch": 
27.53584102200142, + "grad_norm": 0.00943696592003107, + "learning_rate": 7.247551454932577e-05, + "loss": 0.0019733503460884093, + "step": 193990 + }, + { + "epoch": 27.537260468417315, + "grad_norm": 0.1997118592262268, + "learning_rate": 7.247409510290987e-05, + "loss": 0.0013665162026882172, + "step": 194000 + }, + { + "epoch": 27.537260468417315, + "eval_accuracy": 0.9873466013861512, + "eval_loss": 0.05062446743249893, + "eval_runtime": 31.6991, + "eval_samples_per_second": 496.133, + "eval_steps_per_second": 15.521, + "step": 194000 + }, + { + "epoch": 27.538679914833214, + "grad_norm": 0.5731847882270813, + "learning_rate": 7.247267565649398e-05, + "loss": 0.006389033794403076, + "step": 194010 + }, + { + "epoch": 27.540099361249112, + "grad_norm": 4.471863269805908, + "learning_rate": 7.247125621007807e-05, + "loss": 0.005113248899579048, + "step": 194020 + }, + { + "epoch": 27.54151880766501, + "grad_norm": 0.053153689950704575, + "learning_rate": 7.246983676366218e-05, + "loss": 0.0365321010351181, + "step": 194030 + }, + { + "epoch": 27.54293825408091, + "grad_norm": 0.33966121077537537, + "learning_rate": 7.246841731724627e-05, + "loss": 0.01822395622730255, + "step": 194040 + }, + { + "epoch": 27.544357700496807, + "grad_norm": 0.006621926557272673, + "learning_rate": 7.246699787083038e-05, + "loss": 0.0013311482965946198, + "step": 194050 + }, + { + "epoch": 27.545777146912705, + "grad_norm": 7.236966133117676, + "learning_rate": 7.246557842441448e-05, + "loss": 0.02517053484916687, + "step": 194060 + }, + { + "epoch": 27.5471965933286, + "grad_norm": 1.345295786857605, + "learning_rate": 7.246415897799859e-05, + "loss": 0.007672365009784699, + "step": 194070 + }, + { + "epoch": 27.5486160397445, + "grad_norm": 9.879610061645508, + "learning_rate": 7.246273953158269e-05, + "loss": 0.0156234011054039, + "step": 194080 + }, + { + "epoch": 27.550035486160397, + "grad_norm": 0.0868101567029953, + "learning_rate": 7.246132008516678e-05, + "loss": 
0.00645247995853424, + "step": 194090 + }, + { + "epoch": 27.551454932576295, + "grad_norm": 0.1853124499320984, + "learning_rate": 7.24599006387509e-05, + "loss": 0.008346389234066009, + "step": 194100 + }, + { + "epoch": 27.552874378992193, + "grad_norm": 0.1416526883840561, + "learning_rate": 7.245848119233499e-05, + "loss": 0.013795724511146546, + "step": 194110 + }, + { + "epoch": 27.55429382540809, + "grad_norm": 0.7813186049461365, + "learning_rate": 7.24570617459191e-05, + "loss": 0.0014112185686826705, + "step": 194120 + }, + { + "epoch": 27.55571327182399, + "grad_norm": 0.32569387555122375, + "learning_rate": 7.24556422995032e-05, + "loss": 0.04689969420433045, + "step": 194130 + }, + { + "epoch": 27.557132718239885, + "grad_norm": 0.3619830310344696, + "learning_rate": 7.24542228530873e-05, + "loss": 0.016082713007926942, + "step": 194140 + }, + { + "epoch": 27.558552164655783, + "grad_norm": 0.02658393234014511, + "learning_rate": 7.24528034066714e-05, + "loss": 0.01008908748626709, + "step": 194150 + }, + { + "epoch": 27.55997161107168, + "grad_norm": 0.07847211509943008, + "learning_rate": 7.24513839602555e-05, + "loss": 0.004049653932452202, + "step": 194160 + }, + { + "epoch": 27.56139105748758, + "grad_norm": 0.0009643009398132563, + "learning_rate": 7.24499645138396e-05, + "loss": 0.006664827466011047, + "step": 194170 + }, + { + "epoch": 27.562810503903478, + "grad_norm": 0.006437450647354126, + "learning_rate": 7.244854506742371e-05, + "loss": 0.0073145791888237, + "step": 194180 + }, + { + "epoch": 27.564229950319376, + "grad_norm": 0.07782561331987381, + "learning_rate": 7.244712562100781e-05, + "loss": 0.01605016887187958, + "step": 194190 + }, + { + "epoch": 27.565649396735274, + "grad_norm": 0.013188580051064491, + "learning_rate": 7.244570617459191e-05, + "loss": 0.024689550697803497, + "step": 194200 + }, + { + "epoch": 27.56706884315117, + "grad_norm": 0.0019886635709553957, + "learning_rate": 7.244428672817602e-05, + "loss": 
0.025366997718811034, + "step": 194210 + }, + { + "epoch": 27.568488289567068, + "grad_norm": 0.015872769057750702, + "learning_rate": 7.244286728176012e-05, + "loss": 0.03976929783821106, + "step": 194220 + }, + { + "epoch": 27.569907735982966, + "grad_norm": 0.025905465707182884, + "learning_rate": 7.244144783534423e-05, + "loss": 0.06920395493507385, + "step": 194230 + }, + { + "epoch": 27.571327182398864, + "grad_norm": 0.9639049768447876, + "learning_rate": 7.244002838892831e-05, + "loss": 0.062835431098938, + "step": 194240 + }, + { + "epoch": 27.572746628814762, + "grad_norm": 0.12305587530136108, + "learning_rate": 7.243860894251242e-05, + "loss": 0.017047525942325593, + "step": 194250 + }, + { + "epoch": 27.57416607523066, + "grad_norm": 0.06698401272296906, + "learning_rate": 7.243718949609652e-05, + "loss": 0.010906749963760376, + "step": 194260 + }, + { + "epoch": 27.57558552164656, + "grad_norm": 3.9658596515655518, + "learning_rate": 7.243577004968063e-05, + "loss": 0.022066891193389893, + "step": 194270 + }, + { + "epoch": 27.577004968062454, + "grad_norm": 0.5028477311134338, + "learning_rate": 7.243435060326473e-05, + "loss": 0.009159748256206513, + "step": 194280 + }, + { + "epoch": 27.578424414478352, + "grad_norm": 0.01767244189977646, + "learning_rate": 7.243293115684884e-05, + "loss": 0.01359551101922989, + "step": 194290 + }, + { + "epoch": 27.57984386089425, + "grad_norm": 0.2161344438791275, + "learning_rate": 7.243151171043294e-05, + "loss": 0.01429903507232666, + "step": 194300 + }, + { + "epoch": 27.58126330731015, + "grad_norm": 0.049348168075084686, + "learning_rate": 7.243009226401703e-05, + "loss": 0.005317571386694908, + "step": 194310 + }, + { + "epoch": 27.582682753726047, + "grad_norm": 8.98844051361084, + "learning_rate": 7.242867281760114e-05, + "loss": 0.031242066621780397, + "step": 194320 + }, + { + "epoch": 27.584102200141945, + "grad_norm": 1.4541960954666138, + "learning_rate": 7.242725337118524e-05, + "loss": 
0.015732346475124358, + "step": 194330 + }, + { + "epoch": 27.585521646557844, + "grad_norm": 0.10602814704179764, + "learning_rate": 7.242583392476935e-05, + "loss": 0.001212640479207039, + "step": 194340 + }, + { + "epoch": 27.58694109297374, + "grad_norm": 0.2605985105037689, + "learning_rate": 7.242441447835344e-05, + "loss": 0.008151350915431977, + "step": 194350 + }, + { + "epoch": 27.588360539389637, + "grad_norm": 0.7105262279510498, + "learning_rate": 7.242299503193755e-05, + "loss": 0.009935232251882553, + "step": 194360 + }, + { + "epoch": 27.589779985805535, + "grad_norm": 10.89035701751709, + "learning_rate": 7.242157558552164e-05, + "loss": 0.040141144394874574, + "step": 194370 + }, + { + "epoch": 27.591199432221433, + "grad_norm": 1.2218159437179565, + "learning_rate": 7.242015613910576e-05, + "loss": 0.008334851264953614, + "step": 194380 + }, + { + "epoch": 27.59261887863733, + "grad_norm": 14.952829360961914, + "learning_rate": 7.241873669268985e-05, + "loss": 0.029950487613677978, + "step": 194390 + }, + { + "epoch": 27.59403832505323, + "grad_norm": 1.1687366962432861, + "learning_rate": 7.241731724627395e-05, + "loss": 0.0038801409304142, + "step": 194400 + }, + { + "epoch": 27.59545777146913, + "grad_norm": 0.8116788268089294, + "learning_rate": 7.241589779985806e-05, + "loss": 0.009042491018772126, + "step": 194410 + }, + { + "epoch": 27.596877217885023, + "grad_norm": 0.19249936938285828, + "learning_rate": 7.241447835344216e-05, + "loss": 0.010548249632120133, + "step": 194420 + }, + { + "epoch": 27.59829666430092, + "grad_norm": 3.982158899307251, + "learning_rate": 7.241305890702627e-05, + "loss": 0.02258785218000412, + "step": 194430 + }, + { + "epoch": 27.59971611071682, + "grad_norm": 0.10827697068452835, + "learning_rate": 7.241163946061037e-05, + "loss": 0.0007278017699718475, + "step": 194440 + }, + { + "epoch": 27.601135557132718, + "grad_norm": 4.50486946105957, + "learning_rate": 7.241022001419446e-05, + "loss": 
0.024019385874271392, + "step": 194450 + }, + { + "epoch": 27.602555003548616, + "grad_norm": 0.437339186668396, + "learning_rate": 7.240880056777856e-05, + "loss": 0.004879604652523994, + "step": 194460 + }, + { + "epoch": 27.603974449964515, + "grad_norm": 0.028868554159998894, + "learning_rate": 7.240738112136267e-05, + "loss": 0.004581199586391449, + "step": 194470 + }, + { + "epoch": 27.605393896380413, + "grad_norm": 0.3829028904438019, + "learning_rate": 7.240596167494677e-05, + "loss": 0.0065251275897026065, + "step": 194480 + }, + { + "epoch": 27.606813342796308, + "grad_norm": 11.95362377166748, + "learning_rate": 7.240454222853088e-05, + "loss": 0.030523031949996948, + "step": 194490 + }, + { + "epoch": 27.608232789212206, + "grad_norm": 0.03454846516251564, + "learning_rate": 7.240312278211498e-05, + "loss": 0.0033931177109479903, + "step": 194500 + }, + { + "epoch": 27.608232789212206, + "eval_accuracy": 0.9876645259744389, + "eval_loss": 0.049023568630218506, + "eval_runtime": 31.7455, + "eval_samples_per_second": 495.408, + "eval_steps_per_second": 15.498, + "step": 194500 + }, + { + "epoch": 27.609652235628104, + "grad_norm": 8.11600399017334, + "learning_rate": 7.240170333569908e-05, + "loss": 0.02420666366815567, + "step": 194510 + }, + { + "epoch": 27.611071682044003, + "grad_norm": 1.2308505773544312, + "learning_rate": 7.240028388928319e-05, + "loss": 0.004038641974329948, + "step": 194520 + }, + { + "epoch": 27.6124911284599, + "grad_norm": 0.13889044523239136, + "learning_rate": 7.239886444286728e-05, + "loss": 0.028676152229309082, + "step": 194530 + }, + { + "epoch": 27.6139105748758, + "grad_norm": 0.11969706416130066, + "learning_rate": 7.23974449964514e-05, + "loss": 0.012166638672351838, + "step": 194540 + }, + { + "epoch": 27.615330021291697, + "grad_norm": 0.05028977990150452, + "learning_rate": 7.239602555003548e-05, + "loss": 0.0026525136083364485, + "step": 194550 + }, + { + "epoch": 27.616749467707596, + "grad_norm": 
0.04151633754372597, + "learning_rate": 7.239460610361959e-05, + "loss": 0.01660589575767517, + "step": 194560 + }, + { + "epoch": 27.61816891412349, + "grad_norm": 2.1057205200195312, + "learning_rate": 7.239318665720369e-05, + "loss": 0.01290602833032608, + "step": 194570 + }, + { + "epoch": 27.61958836053939, + "grad_norm": 0.6332845687866211, + "learning_rate": 7.23917672107878e-05, + "loss": 0.013334937393665314, + "step": 194580 + }, + { + "epoch": 27.621007806955287, + "grad_norm": 0.0036874795332551003, + "learning_rate": 7.23903477643719e-05, + "loss": 0.016587881743907927, + "step": 194590 + }, + { + "epoch": 27.622427253371185, + "grad_norm": 0.10944578796625137, + "learning_rate": 7.238892831795599e-05, + "loss": 0.006412695348262787, + "step": 194600 + }, + { + "epoch": 27.623846699787084, + "grad_norm": 0.02158203348517418, + "learning_rate": 7.23875088715401e-05, + "loss": 0.0077064275741577145, + "step": 194610 + }, + { + "epoch": 27.625266146202982, + "grad_norm": 8.446231842041016, + "learning_rate": 7.23860894251242e-05, + "loss": 0.02053317427635193, + "step": 194620 + }, + { + "epoch": 27.62668559261888, + "grad_norm": 0.7073497772216797, + "learning_rate": 7.238466997870831e-05, + "loss": 0.008832070231437682, + "step": 194630 + }, + { + "epoch": 27.628105039034775, + "grad_norm": 0.3995440602302551, + "learning_rate": 7.238325053229241e-05, + "loss": 0.030134502053260803, + "step": 194640 + }, + { + "epoch": 27.629524485450673, + "grad_norm": 0.5356709957122803, + "learning_rate": 7.238183108587652e-05, + "loss": 0.0025517381727695465, + "step": 194650 + }, + { + "epoch": 27.63094393186657, + "grad_norm": 0.508738100528717, + "learning_rate": 7.23804116394606e-05, + "loss": 0.0584236204624176, + "step": 194660 + }, + { + "epoch": 27.63236337828247, + "grad_norm": 3.735403299331665, + "learning_rate": 7.237899219304472e-05, + "loss": 0.018692411482334137, + "step": 194670 + }, + { + "epoch": 27.63378282469837, + "grad_norm": 
0.044718239456415176, + "learning_rate": 7.237757274662881e-05, + "loss": 0.00918361395597458, + "step": 194680 + }, + { + "epoch": 27.635202271114267, + "grad_norm": 0.02615862712264061, + "learning_rate": 7.237615330021292e-05, + "loss": 0.009574076533317566, + "step": 194690 + }, + { + "epoch": 27.636621717530165, + "grad_norm": 7.62746000289917, + "learning_rate": 7.237473385379703e-05, + "loss": 0.008371596038341523, + "step": 194700 + }, + { + "epoch": 27.63804116394606, + "grad_norm": 11.403969764709473, + "learning_rate": 7.237331440738112e-05, + "loss": 0.013580551743507386, + "step": 194710 + }, + { + "epoch": 27.639460610361958, + "grad_norm": 0.030922509729862213, + "learning_rate": 7.237189496096523e-05, + "loss": 0.01377268135547638, + "step": 194720 + }, + { + "epoch": 27.640880056777856, + "grad_norm": 0.020522521808743477, + "learning_rate": 7.237047551454933e-05, + "loss": 0.020868125557899474, + "step": 194730 + }, + { + "epoch": 27.642299503193755, + "grad_norm": 0.06438703089952469, + "learning_rate": 7.236905606813344e-05, + "loss": 0.0026008371263742448, + "step": 194740 + }, + { + "epoch": 27.643718949609653, + "grad_norm": 0.007966134697198868, + "learning_rate": 7.236763662171753e-05, + "loss": 0.024573230743408205, + "step": 194750 + }, + { + "epoch": 27.64513839602555, + "grad_norm": 19.586912155151367, + "learning_rate": 7.236621717530163e-05, + "loss": 0.05641996264457703, + "step": 194760 + }, + { + "epoch": 27.64655784244145, + "grad_norm": 2.76981520652771, + "learning_rate": 7.236479772888573e-05, + "loss": 0.023810994625091553, + "step": 194770 + }, + { + "epoch": 27.647977288857344, + "grad_norm": 1.1098496913909912, + "learning_rate": 7.236337828246984e-05, + "loss": 0.011935867369174957, + "step": 194780 + }, + { + "epoch": 27.649396735273243, + "grad_norm": 2.1107776165008545, + "learning_rate": 7.236195883605395e-05, + "loss": 0.028429260849952696, + "step": 194790 + }, + { + "epoch": 27.65081618168914, + "grad_norm": 
1.1085683107376099, + "learning_rate": 7.236053938963805e-05, + "loss": 0.024118885397911072, + "step": 194800 + }, + { + "epoch": 27.65223562810504, + "grad_norm": 20.18557357788086, + "learning_rate": 7.235911994322215e-05, + "loss": 0.03005107045173645, + "step": 194810 + }, + { + "epoch": 27.653655074520938, + "grad_norm": 0.11447689682245255, + "learning_rate": 7.235770049680624e-05, + "loss": 0.006506098806858063, + "step": 194820 + }, + { + "epoch": 27.655074520936836, + "grad_norm": 2.9150421619415283, + "learning_rate": 7.235628105039035e-05, + "loss": 0.012411996722221375, + "step": 194830 + }, + { + "epoch": 27.656493967352734, + "grad_norm": 0.018338732421398163, + "learning_rate": 7.235486160397445e-05, + "loss": 0.02610718607902527, + "step": 194840 + }, + { + "epoch": 27.65791341376863, + "grad_norm": 0.060660816729068756, + "learning_rate": 7.235344215755856e-05, + "loss": 0.0013103295117616654, + "step": 194850 + }, + { + "epoch": 27.659332860184527, + "grad_norm": 0.026376813650131226, + "learning_rate": 7.235202271114265e-05, + "loss": 0.013385216891765594, + "step": 194860 + }, + { + "epoch": 27.660752306600425, + "grad_norm": 1.1619806289672852, + "learning_rate": 7.235060326472676e-05, + "loss": 0.00556679368019104, + "step": 194870 + }, + { + "epoch": 27.662171753016324, + "grad_norm": 0.03577132895588875, + "learning_rate": 7.234918381831087e-05, + "loss": 0.004527937248349189, + "step": 194880 + }, + { + "epoch": 27.663591199432222, + "grad_norm": 0.5969024300575256, + "learning_rate": 7.234776437189497e-05, + "loss": 0.0006962567567825317, + "step": 194890 + }, + { + "epoch": 27.66501064584812, + "grad_norm": 0.016016175970435143, + "learning_rate": 7.234634492547908e-05, + "loss": 0.003934904560446739, + "step": 194900 + }, + { + "epoch": 27.66643009226402, + "grad_norm": 1.1932965517044067, + "learning_rate": 7.234492547906316e-05, + "loss": 0.032938557863235476, + "step": 194910 + }, + { + "epoch": 27.667849538679913, + "grad_norm": 
0.03399991616606712, + "learning_rate": 7.234350603264727e-05, + "loss": 0.0012613482773303985, + "step": 194920 + }, + { + "epoch": 27.669268985095812, + "grad_norm": 0.02242336980998516, + "learning_rate": 7.234208658623137e-05, + "loss": 0.03049791157245636, + "step": 194930 + }, + { + "epoch": 27.67068843151171, + "grad_norm": 16.49684715270996, + "learning_rate": 7.234066713981548e-05, + "loss": 0.021389029920101166, + "step": 194940 + }, + { + "epoch": 27.67210787792761, + "grad_norm": 0.03332288935780525, + "learning_rate": 7.233924769339958e-05, + "loss": 0.02236344665288925, + "step": 194950 + }, + { + "epoch": 27.673527324343507, + "grad_norm": 0.012899374589323997, + "learning_rate": 7.233782824698367e-05, + "loss": 0.03733210563659668, + "step": 194960 + }, + { + "epoch": 27.674946770759405, + "grad_norm": 0.3432348966598511, + "learning_rate": 7.233640880056779e-05, + "loss": 0.00944051593542099, + "step": 194970 + }, + { + "epoch": 27.676366217175303, + "grad_norm": 20.03549575805664, + "learning_rate": 7.233498935415188e-05, + "loss": 0.04665187299251557, + "step": 194980 + }, + { + "epoch": 27.677785663591198, + "grad_norm": 0.3963428735733032, + "learning_rate": 7.2333569907736e-05, + "loss": 0.023091964423656464, + "step": 194990 + }, + { + "epoch": 27.679205110007096, + "grad_norm": 0.17352242767810822, + "learning_rate": 7.233229240596168e-05, + "loss": 0.010494294762611388, + "step": 195000 + }, + { + "epoch": 27.679205110007096, + "eval_accuracy": 0.9858205633623705, + "eval_loss": 0.05128824710845947, + "eval_runtime": 31.8858, + "eval_samples_per_second": 493.228, + "eval_steps_per_second": 15.43, + "step": 195000 + }, + { + "epoch": 27.680624556422995, + "grad_norm": 0.6445503830909729, + "learning_rate": 7.233087295954578e-05, + "loss": 0.00569445826113224, + "step": 195010 + }, + { + "epoch": 27.682044002838893, + "grad_norm": 19.58580207824707, + "learning_rate": 7.232945351312989e-05, + "loss": 0.058859062194824216, + "step": 195020 + 
}, + { + "epoch": 27.68346344925479, + "grad_norm": 0.005852479953318834, + "learning_rate": 7.232803406671398e-05, + "loss": 0.007181126624345779, + "step": 195030 + }, + { + "epoch": 27.68488289567069, + "grad_norm": 0.7816558480262756, + "learning_rate": 7.232661462029808e-05, + "loss": 0.010111193358898162, + "step": 195040 + }, + { + "epoch": 27.686302342086588, + "grad_norm": 1.1121755838394165, + "learning_rate": 7.232519517388219e-05, + "loss": 0.025676465034484862, + "step": 195050 + }, + { + "epoch": 27.687721788502483, + "grad_norm": 9.076603889465332, + "learning_rate": 7.232377572746629e-05, + "loss": 0.02847360968589783, + "step": 195060 + }, + { + "epoch": 27.68914123491838, + "grad_norm": 2.218515396118164, + "learning_rate": 7.23223562810504e-05, + "loss": 0.01635710895061493, + "step": 195070 + }, + { + "epoch": 27.69056068133428, + "grad_norm": 0.38722386956214905, + "learning_rate": 7.23209368346345e-05, + "loss": 0.05265829563140869, + "step": 195080 + }, + { + "epoch": 27.691980127750178, + "grad_norm": 0.1485675573348999, + "learning_rate": 7.23195173882186e-05, + "loss": 0.025959739089012147, + "step": 195090 + }, + { + "epoch": 27.693399574166076, + "grad_norm": 0.7034738659858704, + "learning_rate": 7.231809794180269e-05, + "loss": 0.06505147218704224, + "step": 195100 + }, + { + "epoch": 27.694819020581974, + "grad_norm": 0.12425894290208817, + "learning_rate": 7.23166784953868e-05, + "loss": 0.024385082721710204, + "step": 195110 + }, + { + "epoch": 27.696238466997873, + "grad_norm": 0.01121490728110075, + "learning_rate": 7.23152590489709e-05, + "loss": 0.009697312116622924, + "step": 195120 + }, + { + "epoch": 27.697657913413767, + "grad_norm": 4.961036205291748, + "learning_rate": 7.231383960255501e-05, + "loss": 0.005331876501441002, + "step": 195130 + }, + { + "epoch": 27.699077359829666, + "grad_norm": 0.1718050092458725, + "learning_rate": 7.231242015613911e-05, + "loss": 0.015516605973243714, + "step": 195140 + }, + { + "epoch": 
27.700496806245564, + "grad_norm": 0.32257142663002014, + "learning_rate": 7.231100070972321e-05, + "loss": 0.008072951436042785, + "step": 195150 + }, + { + "epoch": 27.701916252661462, + "grad_norm": 0.03477536886930466, + "learning_rate": 7.230958126330732e-05, + "loss": 0.006568338721990585, + "step": 195160 + }, + { + "epoch": 27.70333569907736, + "grad_norm": 0.06493669003248215, + "learning_rate": 7.230816181689142e-05, + "loss": 0.0229690283536911, + "step": 195170 + }, + { + "epoch": 27.70475514549326, + "grad_norm": 0.04142182320356369, + "learning_rate": 7.230674237047553e-05, + "loss": 0.06965481638908386, + "step": 195180 + }, + { + "epoch": 27.706174591909157, + "grad_norm": 0.21643191576004028, + "learning_rate": 7.230532292405961e-05, + "loss": 0.001161164790391922, + "step": 195190 + }, + { + "epoch": 27.707594038325052, + "grad_norm": 4.001688480377197, + "learning_rate": 7.230390347764372e-05, + "loss": 0.015329080820083617, + "step": 195200 + }, + { + "epoch": 27.70901348474095, + "grad_norm": 0.04379692301154137, + "learning_rate": 7.230248403122782e-05, + "loss": 0.03216356039047241, + "step": 195210 + }, + { + "epoch": 27.71043293115685, + "grad_norm": 0.6879659295082092, + "learning_rate": 7.230106458481193e-05, + "loss": 0.007211415469646454, + "step": 195220 + }, + { + "epoch": 27.711852377572747, + "grad_norm": 1.5124330520629883, + "learning_rate": 7.229964513839603e-05, + "loss": 0.010853489488363266, + "step": 195230 + }, + { + "epoch": 27.713271823988645, + "grad_norm": 0.8192140460014343, + "learning_rate": 7.229822569198012e-05, + "loss": 0.02339167147874832, + "step": 195240 + }, + { + "epoch": 27.714691270404543, + "grad_norm": 1.0280641317367554, + "learning_rate": 7.229680624556423e-05, + "loss": 0.049205487966537474, + "step": 195250 + }, + { + "epoch": 27.71611071682044, + "grad_norm": 0.8649991750717163, + "learning_rate": 7.229538679914833e-05, + "loss": 0.021330724656581878, + "step": 195260 + }, + { + "epoch": 
27.717530163236336, + "grad_norm": 1.1615266799926758, + "learning_rate": 7.229396735273244e-05, + "loss": 0.011471347510814666, + "step": 195270 + }, + { + "epoch": 27.718949609652235, + "grad_norm": 21.656644821166992, + "learning_rate": 7.229254790631654e-05, + "loss": 0.03397437930107117, + "step": 195280 + }, + { + "epoch": 27.720369056068133, + "grad_norm": 0.10183900594711304, + "learning_rate": 7.229112845990065e-05, + "loss": 0.019610747694969177, + "step": 195290 + }, + { + "epoch": 27.72178850248403, + "grad_norm": 14.611618995666504, + "learning_rate": 7.228970901348474e-05, + "loss": 0.023715467751026155, + "step": 195300 + }, + { + "epoch": 27.72320794889993, + "grad_norm": 0.7317876219749451, + "learning_rate": 7.228828956706885e-05, + "loss": 0.01131685972213745, + "step": 195310 + }, + { + "epoch": 27.724627395315828, + "grad_norm": 0.16173005104064941, + "learning_rate": 7.228687012065294e-05, + "loss": 0.01416686773300171, + "step": 195320 + }, + { + "epoch": 27.726046841731726, + "grad_norm": 0.060016240924596786, + "learning_rate": 7.228545067423705e-05, + "loss": 0.029252329468727113, + "step": 195330 + }, + { + "epoch": 27.72746628814762, + "grad_norm": 6.6732306480407715, + "learning_rate": 7.228403122782115e-05, + "loss": 0.017310263216495515, + "step": 195340 + }, + { + "epoch": 27.72888573456352, + "grad_norm": 0.8807457685470581, + "learning_rate": 7.228261178140525e-05, + "loss": 0.018696826696395875, + "step": 195350 + }, + { + "epoch": 27.730305180979418, + "grad_norm": 3.393733024597168, + "learning_rate": 7.228119233498936e-05, + "loss": 0.011850952357053756, + "step": 195360 + }, + { + "epoch": 27.731724627395316, + "grad_norm": 10.120708465576172, + "learning_rate": 7.227977288857346e-05, + "loss": 0.019175444543361665, + "step": 195370 + }, + { + "epoch": 27.733144073811214, + "grad_norm": 0.020537642762064934, + "learning_rate": 7.227835344215757e-05, + "loss": 0.010271267592906952, + "step": 195380 + }, + { + "epoch": 
27.734563520227113, + "grad_norm": 0.7466514110565186, + "learning_rate": 7.227693399574167e-05, + "loss": 0.011257757246494294, + "step": 195390 + }, + { + "epoch": 27.73598296664301, + "grad_norm": 5.105926513671875, + "learning_rate": 7.227551454932576e-05, + "loss": 0.005511688068509102, + "step": 195400 + }, + { + "epoch": 27.737402413058906, + "grad_norm": 0.6490784287452698, + "learning_rate": 7.227409510290986e-05, + "loss": 0.0471150815486908, + "step": 195410 + }, + { + "epoch": 27.738821859474804, + "grad_norm": 25.379005432128906, + "learning_rate": 7.227267565649397e-05, + "loss": 0.0894334852695465, + "step": 195420 + }, + { + "epoch": 27.740241305890702, + "grad_norm": 0.5977299213409424, + "learning_rate": 7.227125621007807e-05, + "loss": 0.04154510200023651, + "step": 195430 + }, + { + "epoch": 27.7416607523066, + "grad_norm": 0.025555068626999855, + "learning_rate": 7.226983676366218e-05, + "loss": 0.003989645466208458, + "step": 195440 + }, + { + "epoch": 27.7430801987225, + "grad_norm": 1.8891655206680298, + "learning_rate": 7.226841731724628e-05, + "loss": 0.018021613359451294, + "step": 195450 + }, + { + "epoch": 27.744499645138397, + "grad_norm": 0.08590295910835266, + "learning_rate": 7.226699787083037e-05, + "loss": 0.006217148527503014, + "step": 195460 + }, + { + "epoch": 27.745919091554295, + "grad_norm": 0.04482267051935196, + "learning_rate": 7.226557842441449e-05, + "loss": 0.005164816230535507, + "step": 195470 + }, + { + "epoch": 27.74733853797019, + "grad_norm": 0.5699446797370911, + "learning_rate": 7.226415897799858e-05, + "loss": 0.05494263172149658, + "step": 195480 + }, + { + "epoch": 27.74875798438609, + "grad_norm": 0.1351570338010788, + "learning_rate": 7.22627395315827e-05, + "loss": 0.03165498077869415, + "step": 195490 + }, + { + "epoch": 27.750177430801987, + "grad_norm": 19.405200958251953, + "learning_rate": 7.226132008516678e-05, + "loss": 0.03313029408454895, + "step": 195500 + }, + { + "epoch": 27.750177430801987, 
+ "eval_accuracy": 0.9804158453614803, + "eval_loss": 0.0852934792637825, + "eval_runtime": 33.0079, + "eval_samples_per_second": 476.462, + "eval_steps_per_second": 14.906, + "step": 195500 + }, + { + "epoch": 27.751596877217885, + "grad_norm": 3.2079951763153076, + "learning_rate": 7.225990063875089e-05, + "loss": 0.01673412322998047, + "step": 195510 + }, + { + "epoch": 27.753016323633783, + "grad_norm": 0.08611336350440979, + "learning_rate": 7.225848119233499e-05, + "loss": 0.005447463691234588, + "step": 195520 + }, + { + "epoch": 27.75443577004968, + "grad_norm": 10.965370178222656, + "learning_rate": 7.22570617459191e-05, + "loss": 0.024515870213508605, + "step": 195530 + }, + { + "epoch": 27.75585521646558, + "grad_norm": 0.07020526379346848, + "learning_rate": 7.225564229950321e-05, + "loss": 0.010080046206712722, + "step": 195540 + }, + { + "epoch": 27.757274662881475, + "grad_norm": 7.355412006378174, + "learning_rate": 7.225422285308729e-05, + "loss": 0.022935329377651213, + "step": 195550 + }, + { + "epoch": 27.758694109297373, + "grad_norm": 0.009539064951241016, + "learning_rate": 7.22528034066714e-05, + "loss": 0.004200587794184685, + "step": 195560 + }, + { + "epoch": 27.76011355571327, + "grad_norm": 0.14298197627067566, + "learning_rate": 7.22513839602555e-05, + "loss": 0.0022917695343494414, + "step": 195570 + }, + { + "epoch": 27.76153300212917, + "grad_norm": 4.363316059112549, + "learning_rate": 7.224996451383961e-05, + "loss": 0.007618290930986404, + "step": 195580 + }, + { + "epoch": 27.762952448545068, + "grad_norm": 7.496452808380127, + "learning_rate": 7.224854506742371e-05, + "loss": 0.03720631897449493, + "step": 195590 + }, + { + "epoch": 27.764371894960966, + "grad_norm": 0.6040679216384888, + "learning_rate": 7.22471256210078e-05, + "loss": 0.04841427803039551, + "step": 195600 + }, + { + "epoch": 27.765791341376865, + "grad_norm": 0.27305111289024353, + "learning_rate": 7.22457061745919e-05, + "loss": 0.0325771689414978, + "step": 
195610 + }, + { + "epoch": 27.76721078779276, + "grad_norm": 7.946808815002441, + "learning_rate": 7.224428672817601e-05, + "loss": 0.03217897415161133, + "step": 195620 + }, + { + "epoch": 27.768630234208658, + "grad_norm": 14.612802505493164, + "learning_rate": 7.224286728176011e-05, + "loss": 0.019825957715511322, + "step": 195630 + }, + { + "epoch": 27.770049680624556, + "grad_norm": 1.671360969543457, + "learning_rate": 7.224144783534422e-05, + "loss": 0.018645979464054108, + "step": 195640 + }, + { + "epoch": 27.771469127040454, + "grad_norm": 0.46237823367118835, + "learning_rate": 7.224002838892833e-05, + "loss": 0.024891313910484315, + "step": 195650 + }, + { + "epoch": 27.772888573456353, + "grad_norm": 0.004809950012713671, + "learning_rate": 7.223860894251242e-05, + "loss": 0.014558212459087371, + "step": 195660 + }, + { + "epoch": 27.77430801987225, + "grad_norm": 1.6154356002807617, + "learning_rate": 7.223718949609653e-05, + "loss": 0.051657140254974365, + "step": 195670 + }, + { + "epoch": 27.77572746628815, + "grad_norm": 0.23611977696418762, + "learning_rate": 7.223577004968063e-05, + "loss": 0.01677328646183014, + "step": 195680 + }, + { + "epoch": 27.777146912704044, + "grad_norm": 0.013814530335366726, + "learning_rate": 7.223435060326474e-05, + "loss": 0.0012115325778722763, + "step": 195690 + }, + { + "epoch": 27.778566359119942, + "grad_norm": 2.200650215148926, + "learning_rate": 7.223293115684883e-05, + "loss": 0.02278186082839966, + "step": 195700 + }, + { + "epoch": 27.77998580553584, + "grad_norm": 0.7044551968574524, + "learning_rate": 7.223151171043293e-05, + "loss": 0.008098477125167846, + "step": 195710 + }, + { + "epoch": 27.78140525195174, + "grad_norm": 2.3344523906707764, + "learning_rate": 7.223009226401703e-05, + "loss": 0.0151710644364357, + "step": 195720 + }, + { + "epoch": 27.782824698367637, + "grad_norm": 0.19724419713020325, + "learning_rate": 7.222867281760114e-05, + "loss": 0.005675962567329407, + "step": 195730 + }, 
+ { + "epoch": 27.784244144783536, + "grad_norm": 0.021958615630865097, + "learning_rate": 7.222725337118525e-05, + "loss": 0.02097931653261185, + "step": 195740 + }, + { + "epoch": 27.785663591199434, + "grad_norm": 0.14099405705928802, + "learning_rate": 7.222583392476935e-05, + "loss": 0.0014429066330194473, + "step": 195750 + }, + { + "epoch": 27.78708303761533, + "grad_norm": 0.6696363091468811, + "learning_rate": 7.222441447835345e-05, + "loss": 0.012246986478567123, + "step": 195760 + }, + { + "epoch": 27.788502484031227, + "grad_norm": 0.009496288374066353, + "learning_rate": 7.222313697657914e-05, + "loss": 0.02800978422164917, + "step": 195770 + }, + { + "epoch": 27.789921930447125, + "grad_norm": 2.439403533935547, + "learning_rate": 7.222171753016324e-05, + "loss": 0.012010474503040314, + "step": 195780 + }, + { + "epoch": 27.791341376863024, + "grad_norm": 0.2963767945766449, + "learning_rate": 7.222029808374734e-05, + "loss": 0.0031566999852657316, + "step": 195790 + }, + { + "epoch": 27.792760823278922, + "grad_norm": 4.495771408081055, + "learning_rate": 7.221887863733145e-05, + "loss": 0.02125660628080368, + "step": 195800 + }, + { + "epoch": 27.79418026969482, + "grad_norm": 0.0068011959083378315, + "learning_rate": 7.221745919091555e-05, + "loss": 0.008391124755144119, + "step": 195810 + }, + { + "epoch": 27.79559971611072, + "grad_norm": 0.09259974956512451, + "learning_rate": 7.221603974449966e-05, + "loss": 0.0044161248952150345, + "step": 195820 + }, + { + "epoch": 27.797019162526613, + "grad_norm": 0.527831494808197, + "learning_rate": 7.221462029808374e-05, + "loss": 0.002505551651120186, + "step": 195830 + }, + { + "epoch": 27.79843860894251, + "grad_norm": 0.03873242437839508, + "learning_rate": 7.221320085166785e-05, + "loss": 0.014333748817443847, + "step": 195840 + }, + { + "epoch": 27.79985805535841, + "grad_norm": 0.016269400715827942, + "learning_rate": 7.221178140525195e-05, + "loss": 0.0010628513991832733, + "step": 195850 + }, + 
{ + "epoch": 27.801277501774308, + "grad_norm": 0.05642767995595932, + "learning_rate": 7.221036195883606e-05, + "loss": 0.014198535680770874, + "step": 195860 + }, + { + "epoch": 27.802696948190206, + "grad_norm": 0.009565449319779873, + "learning_rate": 7.220894251242016e-05, + "loss": 0.008198869973421096, + "step": 195870 + }, + { + "epoch": 27.804116394606105, + "grad_norm": 0.4807228744029999, + "learning_rate": 7.220752306600426e-05, + "loss": 0.0018737975507974625, + "step": 195880 + }, + { + "epoch": 27.805535841022003, + "grad_norm": 0.03412025421857834, + "learning_rate": 7.220610361958837e-05, + "loss": 0.028191691637039183, + "step": 195890 + }, + { + "epoch": 27.806955287437898, + "grad_norm": 1.9286863803863525, + "learning_rate": 7.220468417317246e-05, + "loss": 0.010618482530117036, + "step": 195900 + }, + { + "epoch": 27.808374733853796, + "grad_norm": 0.0553281269967556, + "learning_rate": 7.220326472675657e-05, + "loss": 0.005354196205735207, + "step": 195910 + }, + { + "epoch": 27.809794180269694, + "grad_norm": 0.5310750603675842, + "learning_rate": 7.220184528034067e-05, + "loss": 0.04126610159873963, + "step": 195920 + }, + { + "epoch": 27.811213626685593, + "grad_norm": 0.028735937550663948, + "learning_rate": 7.220042583392477e-05, + "loss": 0.04494394063949585, + "step": 195930 + }, + { + "epoch": 27.81263307310149, + "grad_norm": 0.6776896119117737, + "learning_rate": 7.219900638750887e-05, + "loss": 0.01818709373474121, + "step": 195940 + }, + { + "epoch": 27.81405251951739, + "grad_norm": 0.150242418050766, + "learning_rate": 7.219758694109298e-05, + "loss": 0.003761991485953331, + "step": 195950 + }, + { + "epoch": 27.815471965933288, + "grad_norm": 0.3891546428203583, + "learning_rate": 7.219616749467708e-05, + "loss": 0.0037734467536211015, + "step": 195960 + }, + { + "epoch": 27.816891412349182, + "grad_norm": 0.013967500068247318, + "learning_rate": 7.219474804826119e-05, + "loss": 0.019118155539035796, + "step": 195970 + }, + { + 
"epoch": 27.81831085876508, + "grad_norm": 10.336224555969238, + "learning_rate": 7.219332860184528e-05, + "loss": 0.02341310977935791, + "step": 195980 + }, + { + "epoch": 27.81973030518098, + "grad_norm": 0.7042390704154968, + "learning_rate": 7.219190915542938e-05, + "loss": 0.0029676958918571474, + "step": 195990 + }, + { + "epoch": 27.821149751596877, + "grad_norm": 0.01748873107135296, + "learning_rate": 7.219048970901349e-05, + "loss": 0.014799433946609496, + "step": 196000 + }, + { + "epoch": 27.821149751596877, + "eval_accuracy": 0.9862020728683156, + "eval_loss": 0.05816978961229324, + "eval_runtime": 31.4541, + "eval_samples_per_second": 499.999, + "eval_steps_per_second": 15.642, + "step": 196000 + }, + { + "epoch": 27.822569198012776, + "grad_norm": 3.202516555786133, + "learning_rate": 7.218907026259759e-05, + "loss": 0.00773472934961319, + "step": 196010 + }, + { + "epoch": 27.823988644428674, + "grad_norm": 4.440473556518555, + "learning_rate": 7.21876508161817e-05, + "loss": 0.026421040296554565, + "step": 196020 + }, + { + "epoch": 27.825408090844572, + "grad_norm": 18.325536727905273, + "learning_rate": 7.21862313697658e-05, + "loss": 0.045433855056762694, + "step": 196030 + }, + { + "epoch": 27.826827537260467, + "grad_norm": 0.31022679805755615, + "learning_rate": 7.21848119233499e-05, + "loss": 0.004244120419025421, + "step": 196040 + }, + { + "epoch": 27.828246983676365, + "grad_norm": 0.058783888816833496, + "learning_rate": 7.218339247693399e-05, + "loss": 0.020369292795658113, + "step": 196050 + }, + { + "epoch": 27.829666430092264, + "grad_norm": 1.1233352422714233, + "learning_rate": 7.21819730305181e-05, + "loss": 0.008905446529388428, + "step": 196060 + }, + { + "epoch": 27.831085876508162, + "grad_norm": 0.1584385335445404, + "learning_rate": 7.21805535841022e-05, + "loss": 0.031159216165542604, + "step": 196070 + }, + { + "epoch": 27.83250532292406, + "grad_norm": 8.977508544921875, + "learning_rate": 7.217913413768631e-05, + "loss": 
0.02352686822414398, + "step": 196080 + }, + { + "epoch": 27.83392476933996, + "grad_norm": 0.36895960569381714, + "learning_rate": 7.217771469127041e-05, + "loss": 0.0008228413760662078, + "step": 196090 + }, + { + "epoch": 27.835344215755857, + "grad_norm": 1.4826003313064575, + "learning_rate": 7.21762952448545e-05, + "loss": 0.03235381245613098, + "step": 196100 + }, + { + "epoch": 27.83676366217175, + "grad_norm": 0.7131540179252625, + "learning_rate": 7.217487579843862e-05, + "loss": 0.017043325304985046, + "step": 196110 + }, + { + "epoch": 27.83818310858765, + "grad_norm": 0.017890824005007744, + "learning_rate": 7.217345635202271e-05, + "loss": 0.03177411258220673, + "step": 196120 + }, + { + "epoch": 27.839602555003548, + "grad_norm": 11.785734176635742, + "learning_rate": 7.217203690560683e-05, + "loss": 0.10161924362182617, + "step": 196130 + }, + { + "epoch": 27.841022001419446, + "grad_norm": 0.17480364441871643, + "learning_rate": 7.217061745919091e-05, + "loss": 0.005538706853985787, + "step": 196140 + }, + { + "epoch": 27.842441447835345, + "grad_norm": 1.9322599172592163, + "learning_rate": 7.216919801277502e-05, + "loss": 0.02024470716714859, + "step": 196150 + }, + { + "epoch": 27.843860894251243, + "grad_norm": 0.20100684463977814, + "learning_rate": 7.216777856635912e-05, + "loss": 0.004709966108202934, + "step": 196160 + }, + { + "epoch": 27.84528034066714, + "grad_norm": 4.35283088684082, + "learning_rate": 7.216635911994323e-05, + "loss": 0.012483496963977814, + "step": 196170 + }, + { + "epoch": 27.846699787083036, + "grad_norm": 0.17801815271377563, + "learning_rate": 7.216493967352733e-05, + "loss": 0.01861228346824646, + "step": 196180 + }, + { + "epoch": 27.848119233498934, + "grad_norm": 0.18230082094669342, + "learning_rate": 7.216352022711142e-05, + "loss": 0.0198050856590271, + "step": 196190 + }, + { + "epoch": 27.849538679914833, + "grad_norm": 0.43672215938568115, + "learning_rate": 7.216210078069553e-05, + "loss": 
0.0030043676495552065, + "step": 196200 + }, + { + "epoch": 27.85095812633073, + "grad_norm": 0.5704371929168701, + "learning_rate": 7.216068133427963e-05, + "loss": 0.008674360811710358, + "step": 196210 + }, + { + "epoch": 27.85237757274663, + "grad_norm": 0.23682750761508942, + "learning_rate": 7.215926188786374e-05, + "loss": 0.004060474038124084, + "step": 196220 + }, + { + "epoch": 27.853797019162528, + "grad_norm": 15.669677734375, + "learning_rate": 7.215784244144784e-05, + "loss": 0.042201900482177736, + "step": 196230 + }, + { + "epoch": 27.855216465578426, + "grad_norm": 0.7276280522346497, + "learning_rate": 7.215642299503194e-05, + "loss": 0.028686460852622987, + "step": 196240 + }, + { + "epoch": 27.85663591199432, + "grad_norm": 0.02831590175628662, + "learning_rate": 7.215500354861603e-05, + "loss": 0.014327746629714967, + "step": 196250 + }, + { + "epoch": 27.85805535841022, + "grad_norm": 0.13088825345039368, + "learning_rate": 7.215358410220015e-05, + "loss": 0.005600549653172493, + "step": 196260 + }, + { + "epoch": 27.859474804826117, + "grad_norm": 0.8352158069610596, + "learning_rate": 7.215216465578424e-05, + "loss": 0.002998282387852669, + "step": 196270 + }, + { + "epoch": 27.860894251242016, + "grad_norm": 0.0882214903831482, + "learning_rate": 7.215074520936835e-05, + "loss": 0.0012226562947034835, + "step": 196280 + }, + { + "epoch": 27.862313697657914, + "grad_norm": 0.026040390133857727, + "learning_rate": 7.214932576295245e-05, + "loss": 0.002722466364502907, + "step": 196290 + }, + { + "epoch": 27.863733144073812, + "grad_norm": 0.4221002459526062, + "learning_rate": 7.214790631653655e-05, + "loss": 0.006022071838378907, + "step": 196300 + }, + { + "epoch": 27.86515259048971, + "grad_norm": 0.28177472949028015, + "learning_rate": 7.214648687012066e-05, + "loss": 0.0278471440076828, + "step": 196310 + }, + { + "epoch": 27.866572036905605, + "grad_norm": 2.722510576248169, + "learning_rate": 7.214506742370476e-05, + "loss": 
0.027727881073951723, + "step": 196320 + }, + { + "epoch": 27.867991483321504, + "grad_norm": 0.1798604130744934, + "learning_rate": 7.214364797728887e-05, + "loss": 0.01567129194736481, + "step": 196330 + }, + { + "epoch": 27.869410929737402, + "grad_norm": 0.238104447722435, + "learning_rate": 7.214222853087297e-05, + "loss": 0.0038753625005483626, + "step": 196340 + }, + { + "epoch": 27.8708303761533, + "grad_norm": 0.790936291217804, + "learning_rate": 7.214080908445706e-05, + "loss": 0.015289196372032165, + "step": 196350 + }, + { + "epoch": 27.8722498225692, + "grad_norm": 9.410913467407227, + "learning_rate": 7.213938963804116e-05, + "loss": 0.05014730095863342, + "step": 196360 + }, + { + "epoch": 27.873669268985097, + "grad_norm": 0.7661629319190979, + "learning_rate": 7.213797019162527e-05, + "loss": 0.01914660930633545, + "step": 196370 + }, + { + "epoch": 27.875088715400995, + "grad_norm": 0.0902065709233284, + "learning_rate": 7.213655074520937e-05, + "loss": 0.013210347294807434, + "step": 196380 + }, + { + "epoch": 27.87650816181689, + "grad_norm": 0.24666078388690948, + "learning_rate": 7.213513129879348e-05, + "loss": 0.020490935444831847, + "step": 196390 + }, + { + "epoch": 27.87792760823279, + "grad_norm": 0.14868269860744476, + "learning_rate": 7.213371185237758e-05, + "loss": 0.0036780834197998047, + "step": 196400 + }, + { + "epoch": 27.879347054648687, + "grad_norm": 0.3767184317111969, + "learning_rate": 7.213229240596167e-05, + "loss": 0.019197601079940795, + "step": 196410 + }, + { + "epoch": 27.880766501064585, + "grad_norm": 0.015511476434767246, + "learning_rate": 7.213087295954578e-05, + "loss": 0.012267293035984039, + "step": 196420 + }, + { + "epoch": 27.882185947480483, + "grad_norm": 0.10580259561538696, + "learning_rate": 7.212945351312988e-05, + "loss": 0.023472702503204344, + "step": 196430 + }, + { + "epoch": 27.88360539389638, + "grad_norm": 0.035181425511837006, + "learning_rate": 7.212803406671399e-05, + "loss": 
0.002192778140306473, + "step": 196440 + }, + { + "epoch": 27.88502484031228, + "grad_norm": 0.0754452794790268, + "learning_rate": 7.212661462029808e-05, + "loss": 0.0009433422237634659, + "step": 196450 + }, + { + "epoch": 27.886444286728175, + "grad_norm": 0.2751689553260803, + "learning_rate": 7.212519517388219e-05, + "loss": 0.021418365836143493, + "step": 196460 + }, + { + "epoch": 27.887863733144073, + "grad_norm": 0.022266777232289314, + "learning_rate": 7.212377572746629e-05, + "loss": 0.0046696729958057405, + "step": 196470 + }, + { + "epoch": 27.88928317955997, + "grad_norm": 12.688689231872559, + "learning_rate": 7.21223562810504e-05, + "loss": 0.015338225662708283, + "step": 196480 + }, + { + "epoch": 27.89070262597587, + "grad_norm": 15.233657836914062, + "learning_rate": 7.212093683463451e-05, + "loss": 0.02160208225250244, + "step": 196490 + }, + { + "epoch": 27.892122072391768, + "grad_norm": 0.1920890063047409, + "learning_rate": 7.211951738821859e-05, + "loss": 0.012282843887805938, + "step": 196500 + }, + { + "epoch": 27.892122072391768, + "eval_accuracy": 0.9910981115279456, + "eval_loss": 0.03396998718380928, + "eval_runtime": 31.6261, + "eval_samples_per_second": 497.279, + "eval_steps_per_second": 15.557, + "step": 196500 + }, + { + "epoch": 27.893541518807666, + "grad_norm": 0.015977807343006134, + "learning_rate": 7.21180979418027e-05, + "loss": 0.017745059728622437, + "step": 196510 + }, + { + "epoch": 27.894960965223564, + "grad_norm": 0.07754994928836823, + "learning_rate": 7.21166784953868e-05, + "loss": 0.010474695265293122, + "step": 196520 + }, + { + "epoch": 27.89638041163946, + "grad_norm": 14.751699447631836, + "learning_rate": 7.211525904897091e-05, + "loss": 0.021279922127723692, + "step": 196530 + }, + { + "epoch": 27.897799858055357, + "grad_norm": 1.9557873010635376, + "learning_rate": 7.211383960255501e-05, + "loss": 0.010947492718696595, + "step": 196540 + }, + { + "epoch": 27.899219304471256, + "grad_norm": 
0.894644558429718, + "learning_rate": 7.21124201561391e-05, + "loss": 0.05013212561607361, + "step": 196550 + }, + { + "epoch": 27.900638750887154, + "grad_norm": 2.4753148555755615, + "learning_rate": 7.21110007097232e-05, + "loss": 0.040475794672966005, + "step": 196560 + }, + { + "epoch": 27.902058197303052, + "grad_norm": 0.039384014904499054, + "learning_rate": 7.210958126330731e-05, + "loss": 0.003228927031159401, + "step": 196570 + }, + { + "epoch": 27.90347764371895, + "grad_norm": 1.6691234111785889, + "learning_rate": 7.210816181689142e-05, + "loss": 0.006879208236932754, + "step": 196580 + }, + { + "epoch": 27.90489709013485, + "grad_norm": 1.0567352771759033, + "learning_rate": 7.210674237047552e-05, + "loss": 0.008606959879398347, + "step": 196590 + }, + { + "epoch": 27.906316536550744, + "grad_norm": 7.474575519561768, + "learning_rate": 7.210532292405962e-05, + "loss": 0.003155270218849182, + "step": 196600 + }, + { + "epoch": 27.907735982966642, + "grad_norm": 0.05868200212717056, + "learning_rate": 7.210390347764372e-05, + "loss": 0.001469600573182106, + "step": 196610 + }, + { + "epoch": 27.90915542938254, + "grad_norm": 0.930448591709137, + "learning_rate": 7.210248403122783e-05, + "loss": 0.008321698009967803, + "step": 196620 + }, + { + "epoch": 27.91057487579844, + "grad_norm": 0.0486099012196064, + "learning_rate": 7.210106458481192e-05, + "loss": 0.011988931894302368, + "step": 196630 + }, + { + "epoch": 27.911994322214337, + "grad_norm": 0.021828671917319298, + "learning_rate": 7.209964513839604e-05, + "loss": 0.0030845098197460176, + "step": 196640 + }, + { + "epoch": 27.913413768630235, + "grad_norm": 0.09635590016841888, + "learning_rate": 7.209822569198012e-05, + "loss": 0.018718485534191132, + "step": 196650 + }, + { + "epoch": 27.914833215046134, + "grad_norm": 0.19309654831886292, + "learning_rate": 7.209680624556423e-05, + "loss": 0.001201820746064186, + "step": 196660 + }, + { + "epoch": 27.91625266146203, + "grad_norm": 
1.6386436223983765, + "learning_rate": 7.209538679914834e-05, + "loss": 0.0465065598487854, + "step": 196670 + }, + { + "epoch": 27.917672107877927, + "grad_norm": 0.7625183463096619, + "learning_rate": 7.209396735273244e-05, + "loss": 0.00553046315908432, + "step": 196680 + }, + { + "epoch": 27.919091554293825, + "grad_norm": 0.11850006133317947, + "learning_rate": 7.209254790631655e-05, + "loss": 0.004502750560641288, + "step": 196690 + }, + { + "epoch": 27.920511000709723, + "grad_norm": 12.267292022705078, + "learning_rate": 7.209112845990065e-05, + "loss": 0.015410494804382325, + "step": 196700 + }, + { + "epoch": 27.92193044712562, + "grad_norm": 0.4487343728542328, + "learning_rate": 7.208970901348474e-05, + "loss": 0.012404867261648179, + "step": 196710 + }, + { + "epoch": 27.92334989354152, + "grad_norm": 5.475649356842041, + "learning_rate": 7.208828956706884e-05, + "loss": 0.005877850949764252, + "step": 196720 + }, + { + "epoch": 27.924769339957418, + "grad_norm": 0.15953722596168518, + "learning_rate": 7.208687012065295e-05, + "loss": 0.003426235169172287, + "step": 196730 + }, + { + "epoch": 27.926188786373313, + "grad_norm": 4.611114025115967, + "learning_rate": 7.208545067423705e-05, + "loss": 0.012696824967861176, + "step": 196740 + }, + { + "epoch": 27.92760823278921, + "grad_norm": 1.58597993850708, + "learning_rate": 7.208403122782116e-05, + "loss": 0.026545101404190065, + "step": 196750 + }, + { + "epoch": 27.92902767920511, + "grad_norm": 1.151830792427063, + "learning_rate": 7.208261178140526e-05, + "loss": 0.02718064486980438, + "step": 196760 + }, + { + "epoch": 27.930447125621008, + "grad_norm": 5.487363338470459, + "learning_rate": 7.208119233498936e-05, + "loss": 0.011050444096326828, + "step": 196770 + }, + { + "epoch": 27.931866572036906, + "grad_norm": 5.407829761505127, + "learning_rate": 7.207977288857347e-05, + "loss": 0.004123302176594734, + "step": 196780 + }, + { + "epoch": 27.933286018452804, + "grad_norm": 0.9770786166191101, 
+ "learning_rate": 7.207835344215756e-05, + "loss": 0.028893208503723143, + "step": 196790 + }, + { + "epoch": 27.934705464868703, + "grad_norm": 0.047465790063142776, + "learning_rate": 7.207693399574167e-05, + "loss": 0.002733064442873001, + "step": 196800 + }, + { + "epoch": 27.936124911284598, + "grad_norm": 7.1878509521484375, + "learning_rate": 7.207551454932576e-05, + "loss": 0.015134266018867493, + "step": 196810 + }, + { + "epoch": 27.937544357700496, + "grad_norm": 4.875556468963623, + "learning_rate": 7.207409510290987e-05, + "loss": 0.020148199796676636, + "step": 196820 + }, + { + "epoch": 27.938963804116394, + "grad_norm": 0.1687821000814438, + "learning_rate": 7.207267565649397e-05, + "loss": 0.032853943109512326, + "step": 196830 + }, + { + "epoch": 27.940383250532292, + "grad_norm": 0.3626244366168976, + "learning_rate": 7.207125621007808e-05, + "loss": 0.02275071442127228, + "step": 196840 + }, + { + "epoch": 27.94180269694819, + "grad_norm": 3.954660415649414, + "learning_rate": 7.206983676366218e-05, + "loss": 0.02497764527797699, + "step": 196850 + }, + { + "epoch": 27.94322214336409, + "grad_norm": 0.011945686303079128, + "learning_rate": 7.206841731724627e-05, + "loss": 0.004671375080943107, + "step": 196860 + }, + { + "epoch": 27.944641589779987, + "grad_norm": 0.00846192054450512, + "learning_rate": 7.206699787083038e-05, + "loss": 0.01798683702945709, + "step": 196870 + }, + { + "epoch": 27.946061036195882, + "grad_norm": 0.016244517639279366, + "learning_rate": 7.206557842441448e-05, + "loss": 0.01294199824333191, + "step": 196880 + }, + { + "epoch": 27.94748048261178, + "grad_norm": 0.004536167718470097, + "learning_rate": 7.206415897799859e-05, + "loss": 0.018833526968955995, + "step": 196890 + }, + { + "epoch": 27.94889992902768, + "grad_norm": 0.054087527096271515, + "learning_rate": 7.206273953158269e-05, + "loss": 0.005039751902222634, + "step": 196900 + }, + { + "epoch": 27.950319375443577, + "grad_norm": 7.645169258117676, + 
"learning_rate": 7.206132008516679e-05, + "loss": 0.033840930461883544, + "step": 196910 + }, + { + "epoch": 27.951738821859475, + "grad_norm": 0.2119237780570984, + "learning_rate": 7.205990063875088e-05, + "loss": 0.002596522495150566, + "step": 196920 + }, + { + "epoch": 27.953158268275374, + "grad_norm": 2.505303382873535, + "learning_rate": 7.2058481192335e-05, + "loss": 0.0023514777421951294, + "step": 196930 + }, + { + "epoch": 27.954577714691272, + "grad_norm": 1.7436269521713257, + "learning_rate": 7.205706174591909e-05, + "loss": 0.020000995695590974, + "step": 196940 + }, + { + "epoch": 27.955997161107167, + "grad_norm": 2.9171667098999023, + "learning_rate": 7.20556422995032e-05, + "loss": 0.003373851627111435, + "step": 196950 + }, + { + "epoch": 27.957416607523065, + "grad_norm": 6.403369903564453, + "learning_rate": 7.20542228530873e-05, + "loss": 0.03035634756088257, + "step": 196960 + }, + { + "epoch": 27.958836053938963, + "grad_norm": 0.04917192831635475, + "learning_rate": 7.20528034066714e-05, + "loss": 0.01263948380947113, + "step": 196970 + }, + { + "epoch": 27.96025550035486, + "grad_norm": 0.040008582174777985, + "learning_rate": 7.205138396025551e-05, + "loss": 0.013172194361686707, + "step": 196980 + }, + { + "epoch": 27.96167494677076, + "grad_norm": 4.007813453674316, + "learning_rate": 7.20499645138396e-05, + "loss": 0.02734639644622803, + "step": 196990 + }, + { + "epoch": 27.96309439318666, + "grad_norm": 0.05434262752532959, + "learning_rate": 7.204854506742372e-05, + "loss": 0.004586571455001831, + "step": 197000 + }, + { + "epoch": 27.96309439318666, + "eval_accuracy": 0.9862020728683156, + "eval_loss": 0.05427782982587814, + "eval_runtime": 33.1277, + "eval_samples_per_second": 474.739, + "eval_steps_per_second": 14.852, + "step": 197000 + }, + { + "epoch": 27.964513839602557, + "grad_norm": 0.02058180421590805, + "learning_rate": 7.20471256210078e-05, + "loss": 0.031101173162460326, + "step": 197010 + }, + { + "epoch": 
27.96593328601845, + "grad_norm": 0.17209230363368988, + "learning_rate": 7.204570617459191e-05, + "loss": 0.015036331117153167, + "step": 197020 + }, + { + "epoch": 27.96735273243435, + "grad_norm": 4.290534973144531, + "learning_rate": 7.204428672817601e-05, + "loss": 0.00812450349330902, + "step": 197030 + }, + { + "epoch": 27.968772178850248, + "grad_norm": 0.004593354649841785, + "learning_rate": 7.204286728176012e-05, + "loss": 0.010486821085214615, + "step": 197040 + }, + { + "epoch": 27.970191625266146, + "grad_norm": 1.8192740678787231, + "learning_rate": 7.204144783534422e-05, + "loss": 0.0023676156997680664, + "step": 197050 + }, + { + "epoch": 27.971611071682045, + "grad_norm": 0.0873919352889061, + "learning_rate": 7.204002838892833e-05, + "loss": 0.00465797670185566, + "step": 197060 + }, + { + "epoch": 27.973030518097943, + "grad_norm": 0.20137883722782135, + "learning_rate": 7.203860894251243e-05, + "loss": 0.004657364636659622, + "step": 197070 + }, + { + "epoch": 27.97444996451384, + "grad_norm": 0.5232149362564087, + "learning_rate": 7.203718949609652e-05, + "loss": 0.006436198949813843, + "step": 197080 + }, + { + "epoch": 27.975869410929736, + "grad_norm": 0.058417655527591705, + "learning_rate": 7.203577004968063e-05, + "loss": 0.004280613735318184, + "step": 197090 + }, + { + "epoch": 27.977288857345634, + "grad_norm": 16.746103286743164, + "learning_rate": 7.203435060326473e-05, + "loss": 0.03008431792259216, + "step": 197100 + }, + { + "epoch": 27.978708303761533, + "grad_norm": 0.6859573125839233, + "learning_rate": 7.203293115684884e-05, + "loss": 0.013034147024154664, + "step": 197110 + }, + { + "epoch": 27.98012775017743, + "grad_norm": 0.9392169713973999, + "learning_rate": 7.203151171043293e-05, + "loss": 0.026000013947486876, + "step": 197120 + }, + { + "epoch": 27.98154719659333, + "grad_norm": 0.34899571537971497, + "learning_rate": 7.203009226401704e-05, + "loss": 0.004338917508721352, + "step": 197130 + }, + { + "epoch": 
27.982966643009227, + "grad_norm": 0.02204129472374916, + "learning_rate": 7.202867281760113e-05, + "loss": 0.008269874751567841, + "step": 197140 + }, + { + "epoch": 27.984386089425126, + "grad_norm": 0.24300560355186462, + "learning_rate": 7.202725337118525e-05, + "loss": 0.005595936253666878, + "step": 197150 + }, + { + "epoch": 27.98580553584102, + "grad_norm": 0.09030269831418991, + "learning_rate": 7.202583392476934e-05, + "loss": 0.0013947762548923493, + "step": 197160 + }, + { + "epoch": 27.98722498225692, + "grad_norm": 0.008267288096249104, + "learning_rate": 7.202441447835344e-05, + "loss": 0.004983485490083694, + "step": 197170 + }, + { + "epoch": 27.988644428672817, + "grad_norm": 0.015466787852346897, + "learning_rate": 7.202299503193755e-05, + "loss": 0.004759443551301956, + "step": 197180 + }, + { + "epoch": 27.990063875088715, + "grad_norm": 0.12740759551525116, + "learning_rate": 7.202157558552165e-05, + "loss": 0.04835011661052704, + "step": 197190 + }, + { + "epoch": 27.991483321504614, + "grad_norm": 0.36005616188049316, + "learning_rate": 7.202015613910576e-05, + "loss": 0.019107127189636232, + "step": 197200 + }, + { + "epoch": 27.992902767920512, + "grad_norm": 20.398941040039062, + "learning_rate": 7.201873669268986e-05, + "loss": 0.013062147796154023, + "step": 197210 + }, + { + "epoch": 27.99432221433641, + "grad_norm": 7.655109405517578, + "learning_rate": 7.201731724627395e-05, + "loss": 0.007917039096355438, + "step": 197220 + }, + { + "epoch": 27.995741660752305, + "grad_norm": 0.31312838196754456, + "learning_rate": 7.201589779985805e-05, + "loss": 0.009162022173404694, + "step": 197230 + }, + { + "epoch": 27.997161107168203, + "grad_norm": 1.7821460962295532, + "learning_rate": 7.201447835344216e-05, + "loss": 0.012963850796222687, + "step": 197240 + }, + { + "epoch": 27.9985805535841, + "grad_norm": 0.1480364203453064, + "learning_rate": 7.201305890702626e-05, + "loss": 0.026187264919281007, + "step": 197250 + }, + { + "epoch": 
28.0, + "grad_norm": 0.1937294900417328, + "learning_rate": 7.201163946061037e-05, + "loss": 0.0207354798913002, + "step": 197260 + }, + { + "epoch": 28.0014194464159, + "grad_norm": 0.007205414120107889, + "learning_rate": 7.201022001419447e-05, + "loss": 0.009515534341335296, + "step": 197270 + }, + { + "epoch": 28.002838892831797, + "grad_norm": 0.41986364126205444, + "learning_rate": 7.200880056777857e-05, + "loss": 0.004502810165286064, + "step": 197280 + }, + { + "epoch": 28.004258339247695, + "grad_norm": 0.05151788517832756, + "learning_rate": 7.200738112136268e-05, + "loss": 0.005695433169603348, + "step": 197290 + }, + { + "epoch": 28.00567778566359, + "grad_norm": 0.062323302030563354, + "learning_rate": 7.200596167494677e-05, + "loss": 0.020927271246910094, + "step": 197300 + }, + { + "epoch": 28.007097232079488, + "grad_norm": 1.9327094554901123, + "learning_rate": 7.200454222853088e-05, + "loss": 0.007038970291614532, + "step": 197310 + }, + { + "epoch": 28.008516678495386, + "grad_norm": 7.736158847808838, + "learning_rate": 7.200312278211497e-05, + "loss": 0.014287589490413666, + "step": 197320 + }, + { + "epoch": 28.009936124911285, + "grad_norm": 2.971510648727417, + "learning_rate": 7.200170333569908e-05, + "loss": 0.01183139830827713, + "step": 197330 + }, + { + "epoch": 28.011355571327183, + "grad_norm": 1.8957841396331787, + "learning_rate": 7.200028388928318e-05, + "loss": 0.002474386617541313, + "step": 197340 + }, + { + "epoch": 28.01277501774308, + "grad_norm": 0.035539571195840836, + "learning_rate": 7.199886444286729e-05, + "loss": 0.002197153866291046, + "step": 197350 + }, + { + "epoch": 28.01419446415898, + "grad_norm": 7.421849727630615, + "learning_rate": 7.199744499645139e-05, + "loss": 0.006468897312879562, + "step": 197360 + }, + { + "epoch": 28.015613910574874, + "grad_norm": 0.27351483702659607, + "learning_rate": 7.199602555003548e-05, + "loss": 0.0027830816805362703, + "step": 197370 + }, + { + "epoch": 28.017033356990773, + 
"grad_norm": 17.80516815185547, + "learning_rate": 7.19946061036196e-05, + "loss": 0.02043018937110901, + "step": 197380 + }, + { + "epoch": 28.01845280340667, + "grad_norm": 5.399798393249512, + "learning_rate": 7.199318665720369e-05, + "loss": 0.006493811309337616, + "step": 197390 + }, + { + "epoch": 28.01987224982257, + "grad_norm": 7.734511375427246, + "learning_rate": 7.19917672107878e-05, + "loss": 0.0033462245017290117, + "step": 197400 + }, + { + "epoch": 28.021291696238467, + "grad_norm": 0.004023308400064707, + "learning_rate": 7.19903477643719e-05, + "loss": 0.002702037617564201, + "step": 197410 + }, + { + "epoch": 28.022711142654366, + "grad_norm": 3.4379525184631348, + "learning_rate": 7.198892831795601e-05, + "loss": 0.023367282748222352, + "step": 197420 + }, + { + "epoch": 28.024130589070264, + "grad_norm": 0.5034056901931763, + "learning_rate": 7.19875088715401e-05, + "loss": 0.029448109865188598, + "step": 197430 + }, + { + "epoch": 28.02555003548616, + "grad_norm": 16.412874221801758, + "learning_rate": 7.19860894251242e-05, + "loss": 0.02157161235809326, + "step": 197440 + }, + { + "epoch": 28.026969481902057, + "grad_norm": 0.007392359431833029, + "learning_rate": 7.19846699787083e-05, + "loss": 0.040921729803085324, + "step": 197450 + }, + { + "epoch": 28.028388928317955, + "grad_norm": 0.8295202851295471, + "learning_rate": 7.198325053229241e-05, + "loss": 0.015540862083435058, + "step": 197460 + }, + { + "epoch": 28.029808374733854, + "grad_norm": 0.018157152459025383, + "learning_rate": 7.198183108587651e-05, + "loss": 0.0032026030123233793, + "step": 197470 + }, + { + "epoch": 28.031227821149752, + "grad_norm": 0.017876973375678062, + "learning_rate": 7.198041163946061e-05, + "loss": 0.010091834515333176, + "step": 197480 + }, + { + "epoch": 28.03264726756565, + "grad_norm": 0.03589218854904175, + "learning_rate": 7.197899219304472e-05, + "loss": 0.008807872235774995, + "step": 197490 + }, + { + "epoch": 28.03406671398155, + "grad_norm": 
2.148751735687256, + "learning_rate": 7.197757274662882e-05, + "loss": 0.018628576397895814, + "step": 197500 + }, + { + "epoch": 28.03406671398155, + "eval_accuracy": 0.9848667895975075, + "eval_loss": 0.06490203738212585, + "eval_runtime": 30.5763, + "eval_samples_per_second": 514.353, + "eval_steps_per_second": 16.091, + "step": 197500 + }, + { + "epoch": 28.035486160397443, + "grad_norm": 0.2341729998588562, + "learning_rate": 7.197615330021293e-05, + "loss": 0.013950210809707642, + "step": 197510 + }, + { + "epoch": 28.03690560681334, + "grad_norm": 0.8227326273918152, + "learning_rate": 7.197473385379702e-05, + "loss": 0.00590089000761509, + "step": 197520 + }, + { + "epoch": 28.03832505322924, + "grad_norm": 6.782509803771973, + "learning_rate": 7.197331440738112e-05, + "loss": 0.03757445514202118, + "step": 197530 + }, + { + "epoch": 28.03974449964514, + "grad_norm": 0.04665340855717659, + "learning_rate": 7.197189496096522e-05, + "loss": 0.0008298262953758239, + "step": 197540 + }, + { + "epoch": 28.041163946061037, + "grad_norm": 0.038444507867097855, + "learning_rate": 7.197047551454933e-05, + "loss": 0.04781551361083984, + "step": 197550 + }, + { + "epoch": 28.042583392476935, + "grad_norm": 12.246047019958496, + "learning_rate": 7.196905606813343e-05, + "loss": 0.03273476362228393, + "step": 197560 + }, + { + "epoch": 28.044002838892833, + "grad_norm": 2.431135892868042, + "learning_rate": 7.196763662171754e-05, + "loss": 0.021167969703674315, + "step": 197570 + }, + { + "epoch": 28.045422285308728, + "grad_norm": 0.1523563712835312, + "learning_rate": 7.196621717530164e-05, + "loss": 0.022345447540283205, + "step": 197580 + }, + { + "epoch": 28.046841731724626, + "grad_norm": 0.21549147367477417, + "learning_rate": 7.196479772888573e-05, + "loss": 0.005263157933950424, + "step": 197590 + }, + { + "epoch": 28.048261178140525, + "grad_norm": 0.23399338126182556, + "learning_rate": 7.196337828246984e-05, + "loss": 0.01917850971221924, + "step": 197600 + 
}, + { + "epoch": 28.049680624556423, + "grad_norm": 1.4820919036865234, + "learning_rate": 7.196195883605394e-05, + "loss": 0.011961136013269424, + "step": 197610 + }, + { + "epoch": 28.05110007097232, + "grad_norm": 1.8727582693099976, + "learning_rate": 7.196053938963805e-05, + "loss": 0.018534672260284425, + "step": 197620 + }, + { + "epoch": 28.05251951738822, + "grad_norm": 13.281856536865234, + "learning_rate": 7.195911994322214e-05, + "loss": 0.01692683845758438, + "step": 197630 + }, + { + "epoch": 28.053938963804118, + "grad_norm": 0.4172228276729584, + "learning_rate": 7.195770049680625e-05, + "loss": 0.055352813005447386, + "step": 197640 + }, + { + "epoch": 28.055358410220013, + "grad_norm": 0.17731967568397522, + "learning_rate": 7.195628105039034e-05, + "loss": 0.007279900461435318, + "step": 197650 + }, + { + "epoch": 28.05677785663591, + "grad_norm": 10.687019348144531, + "learning_rate": 7.195486160397446e-05, + "loss": 0.008152425289154053, + "step": 197660 + }, + { + "epoch": 28.05819730305181, + "grad_norm": 0.022167515009641647, + "learning_rate": 7.195344215755855e-05, + "loss": 0.008441905677318572, + "step": 197670 + }, + { + "epoch": 28.059616749467708, + "grad_norm": 18.648780822753906, + "learning_rate": 7.195202271114265e-05, + "loss": 0.005746489018201828, + "step": 197680 + }, + { + "epoch": 28.061036195883606, + "grad_norm": 1.4361165761947632, + "learning_rate": 7.195060326472676e-05, + "loss": 0.006795208156108856, + "step": 197690 + }, + { + "epoch": 28.062455642299504, + "grad_norm": 0.33552148938179016, + "learning_rate": 7.194918381831086e-05, + "loss": 0.009232573211193085, + "step": 197700 + }, + { + "epoch": 28.063875088715402, + "grad_norm": 1.3302404880523682, + "learning_rate": 7.194776437189497e-05, + "loss": 0.0030688103288412094, + "step": 197710 + }, + { + "epoch": 28.065294535131297, + "grad_norm": 3.4767777919769287, + "learning_rate": 7.194634492547907e-05, + "loss": 0.012692062556743622, + "step": 197720 + }, + { 
+ "epoch": 28.066713981547196, + "grad_norm": 0.2721578776836395, + "learning_rate": 7.194492547906316e-05, + "loss": 0.0011021491140127183, + "step": 197730 + }, + { + "epoch": 28.068133427963094, + "grad_norm": 10.803467750549316, + "learning_rate": 7.194350603264726e-05, + "loss": 0.011439882963895798, + "step": 197740 + }, + { + "epoch": 28.069552874378992, + "grad_norm": 0.010600858367979527, + "learning_rate": 7.194208658623137e-05, + "loss": 0.02299305498600006, + "step": 197750 + }, + { + "epoch": 28.07097232079489, + "grad_norm": 0.03232559189200401, + "learning_rate": 7.194066713981547e-05, + "loss": 0.08097992539405822, + "step": 197760 + }, + { + "epoch": 28.07239176721079, + "grad_norm": 12.43992805480957, + "learning_rate": 7.193924769339958e-05, + "loss": 0.02055570185184479, + "step": 197770 + }, + { + "epoch": 28.073811213626687, + "grad_norm": 0.0013353305403143167, + "learning_rate": 7.193782824698368e-05, + "loss": 0.014677926898002625, + "step": 197780 + }, + { + "epoch": 28.075230660042582, + "grad_norm": 1.2344592809677124, + "learning_rate": 7.193640880056778e-05, + "loss": 0.024467730522155763, + "step": 197790 + }, + { + "epoch": 28.07665010645848, + "grad_norm": 0.0438481941819191, + "learning_rate": 7.193498935415189e-05, + "loss": 0.003815813362598419, + "step": 197800 + }, + { + "epoch": 28.07806955287438, + "grad_norm": 10.437585830688477, + "learning_rate": 7.193356990773598e-05, + "loss": 0.008255415409803391, + "step": 197810 + }, + { + "epoch": 28.079488999290277, + "grad_norm": 0.06908150017261505, + "learning_rate": 7.19321504613201e-05, + "loss": 0.0023287318646907805, + "step": 197820 + }, + { + "epoch": 28.080908445706175, + "grad_norm": 2.350159168243408, + "learning_rate": 7.193073101490419e-05, + "loss": 0.00908368155360222, + "step": 197830 + }, + { + "epoch": 28.082327892122073, + "grad_norm": 0.4862111210823059, + "learning_rate": 7.192931156848829e-05, + "loss": 0.035428833961486814, + "step": 197840 + }, + { + 
"epoch": 28.08374733853797, + "grad_norm": 0.004656804259866476, + "learning_rate": 7.192789212207239e-05, + "loss": 0.009353318065404893, + "step": 197850 + }, + { + "epoch": 28.085166784953866, + "grad_norm": 0.4168454110622406, + "learning_rate": 7.19264726756565e-05, + "loss": 0.004357517138123513, + "step": 197860 + }, + { + "epoch": 28.086586231369765, + "grad_norm": 10.851113319396973, + "learning_rate": 7.19250532292406e-05, + "loss": 0.03369134068489075, + "step": 197870 + }, + { + "epoch": 28.088005677785663, + "grad_norm": 6.733954906463623, + "learning_rate": 7.19236337828247e-05, + "loss": 0.0316934734582901, + "step": 197880 + }, + { + "epoch": 28.08942512420156, + "grad_norm": 0.3141199052333832, + "learning_rate": 7.19222143364088e-05, + "loss": 0.001387074589729309, + "step": 197890 + }, + { + "epoch": 28.09084457061746, + "grad_norm": 3.2625467777252197, + "learning_rate": 7.19207948899929e-05, + "loss": 0.012239275127649307, + "step": 197900 + }, + { + "epoch": 28.092264017033358, + "grad_norm": 0.019242562353610992, + "learning_rate": 7.191937544357701e-05, + "loss": 0.005796531587839127, + "step": 197910 + }, + { + "epoch": 28.093683463449256, + "grad_norm": 0.03959595039486885, + "learning_rate": 7.191795599716111e-05, + "loss": 0.04893697798252106, + "step": 197920 + }, + { + "epoch": 28.09510290986515, + "grad_norm": 9.220747947692871, + "learning_rate": 7.191653655074522e-05, + "loss": 0.013614200055599213, + "step": 197930 + }, + { + "epoch": 28.09652235628105, + "grad_norm": 0.04213254153728485, + "learning_rate": 7.19151171043293e-05, + "loss": 0.01168680116534233, + "step": 197940 + }, + { + "epoch": 28.097941802696948, + "grad_norm": 1.4768685102462769, + "learning_rate": 7.191369765791341e-05, + "loss": 0.005559194087982178, + "step": 197950 + }, + { + "epoch": 28.099361249112846, + "grad_norm": 0.17015276849269867, + "learning_rate": 7.191227821149751e-05, + "loss": 0.01774754822254181, + "step": 197960 + }, + { + "epoch": 
28.100780695528744, + "grad_norm": 8.010993957519531, + "learning_rate": 7.191085876508162e-05, + "loss": 0.024683889746665955, + "step": 197970 + }, + { + "epoch": 28.102200141944643, + "grad_norm": 0.19568544626235962, + "learning_rate": 7.190943931866573e-05, + "loss": 0.015067589282989503, + "step": 197980 + }, + { + "epoch": 28.10361958836054, + "grad_norm": 0.02012958563864231, + "learning_rate": 7.190801987224982e-05, + "loss": 0.0009410724043846131, + "step": 197990 + }, + { + "epoch": 28.105039034776436, + "grad_norm": 7.969406604766846, + "learning_rate": 7.190660042583393e-05, + "loss": 0.021896132826805116, + "step": 198000 + }, + { + "epoch": 28.105039034776436, + "eval_accuracy": 0.9876009410567813, + "eval_loss": 0.048556551337242126, + "eval_runtime": 31.8672, + "eval_samples_per_second": 493.516, + "eval_steps_per_second": 15.439, + "step": 198000 + }, + { + "epoch": 28.106458481192334, + "grad_norm": 0.01310727745294571, + "learning_rate": 7.190518097941803e-05, + "loss": 0.0008733481168746948, + "step": 198010 + }, + { + "epoch": 28.107877927608232, + "grad_norm": 0.27461862564086914, + "learning_rate": 7.190376153300214e-05, + "loss": 0.024348944425582886, + "step": 198020 + }, + { + "epoch": 28.10929737402413, + "grad_norm": 0.10911380499601364, + "learning_rate": 7.190234208658623e-05, + "loss": 0.004792207479476928, + "step": 198030 + }, + { + "epoch": 28.11071682044003, + "grad_norm": 0.3167087733745575, + "learning_rate": 7.190092264017033e-05, + "loss": 0.016757354140281677, + "step": 198040 + }, + { + "epoch": 28.112136266855927, + "grad_norm": 0.7130719423294067, + "learning_rate": 7.189950319375443e-05, + "loss": 0.006646868586540222, + "step": 198050 + }, + { + "epoch": 28.113555713271825, + "grad_norm": 0.7811514139175415, + "learning_rate": 7.189808374733854e-05, + "loss": 0.009232310950756073, + "step": 198060 + }, + { + "epoch": 28.11497515968772, + "grad_norm": 19.997610092163086, + "learning_rate": 7.189666430092265e-05, + 
"loss": 0.05856020450592041, + "step": 198070 + }, + { + "epoch": 28.11639460610362, + "grad_norm": 0.08689016848802567, + "learning_rate": 7.189524485450675e-05, + "loss": 0.01347748190164566, + "step": 198080 + }, + { + "epoch": 28.117814052519517, + "grad_norm": 5.306333065032959, + "learning_rate": 7.189382540809086e-05, + "loss": 0.05943218469619751, + "step": 198090 + }, + { + "epoch": 28.119233498935415, + "grad_norm": 0.013262351974844933, + "learning_rate": 7.189240596167494e-05, + "loss": 0.006382211297750473, + "step": 198100 + }, + { + "epoch": 28.120652945351313, + "grad_norm": 3.477543830871582, + "learning_rate": 7.189098651525905e-05, + "loss": 0.004603825137019157, + "step": 198110 + }, + { + "epoch": 28.12207239176721, + "grad_norm": 0.11147823929786682, + "learning_rate": 7.188956706884315e-05, + "loss": 0.004256194829940796, + "step": 198120 + }, + { + "epoch": 28.12349183818311, + "grad_norm": 0.0762336403131485, + "learning_rate": 7.188814762242726e-05, + "loss": 0.00526341050863266, + "step": 198130 + }, + { + "epoch": 28.124911284599005, + "grad_norm": 0.025875143706798553, + "learning_rate": 7.188672817601136e-05, + "loss": 0.014555910229682922, + "step": 198140 + }, + { + "epoch": 28.126330731014903, + "grad_norm": 0.6375128030776978, + "learning_rate": 7.188530872959546e-05, + "loss": 0.026440003514289857, + "step": 198150 + }, + { + "epoch": 28.1277501774308, + "grad_norm": 0.5141092538833618, + "learning_rate": 7.188388928317957e-05, + "loss": 0.03513275980949402, + "step": 198160 + }, + { + "epoch": 28.1291696238467, + "grad_norm": 0.011617250740528107, + "learning_rate": 7.188246983676367e-05, + "loss": 0.0028238240629434586, + "step": 198170 + }, + { + "epoch": 28.130589070262598, + "grad_norm": 0.029884925112128258, + "learning_rate": 7.188105039034778e-05, + "loss": 0.02199074923992157, + "step": 198180 + }, + { + "epoch": 28.132008516678496, + "grad_norm": 12.836271286010742, + "learning_rate": 7.187963094393187e-05, + "loss": 
0.010666737705469132, + "step": 198190 + }, + { + "epoch": 28.133427963094395, + "grad_norm": 0.010467709973454475, + "learning_rate": 7.187821149751597e-05, + "loss": 0.029071936011314393, + "step": 198200 + }, + { + "epoch": 28.13484740951029, + "grad_norm": 2.7529191970825195, + "learning_rate": 7.187679205110007e-05, + "loss": 0.005079149454832077, + "step": 198210 + }, + { + "epoch": 28.136266855926188, + "grad_norm": 0.028197595849633217, + "learning_rate": 7.187537260468418e-05, + "loss": 0.01154220923781395, + "step": 198220 + }, + { + "epoch": 28.137686302342086, + "grad_norm": 0.03586457297205925, + "learning_rate": 7.187395315826828e-05, + "loss": 0.003774514049291611, + "step": 198230 + }, + { + "epoch": 28.139105748757984, + "grad_norm": 1.96076238155365, + "learning_rate": 7.187253371185239e-05, + "loss": 0.02948676347732544, + "step": 198240 + }, + { + "epoch": 28.140525195173883, + "grad_norm": 8.81346607208252, + "learning_rate": 7.187111426543649e-05, + "loss": 0.010899047553539275, + "step": 198250 + }, + { + "epoch": 28.14194464158978, + "grad_norm": 7.6110968589782715, + "learning_rate": 7.186969481902058e-05, + "loss": 0.008594915270805359, + "step": 198260 + }, + { + "epoch": 28.14336408800568, + "grad_norm": 9.378966331481934, + "learning_rate": 7.18682753726047e-05, + "loss": 0.02859974205493927, + "step": 198270 + }, + { + "epoch": 28.144783534421574, + "grad_norm": 0.8050486445426941, + "learning_rate": 7.186685592618879e-05, + "loss": 0.06831099987030029, + "step": 198280 + }, + { + "epoch": 28.146202980837472, + "grad_norm": 3.2453904151916504, + "learning_rate": 7.18654364797729e-05, + "loss": 0.03622580766677856, + "step": 198290 + }, + { + "epoch": 28.14762242725337, + "grad_norm": 0.07101834565401077, + "learning_rate": 7.186401703335699e-05, + "loss": 0.0017596103250980377, + "step": 198300 + }, + { + "epoch": 28.14904187366927, + "grad_norm": 0.004266354721039534, + "learning_rate": 7.18625975869411e-05, + "loss": 
0.022190752625465392, + "step": 198310 + }, + { + "epoch": 28.150461320085167, + "grad_norm": 1.182930588722229, + "learning_rate": 7.18611781405252e-05, + "loss": 0.013122211396694183, + "step": 198320 + }, + { + "epoch": 28.151880766501066, + "grad_norm": 0.09809020161628723, + "learning_rate": 7.18597586941093e-05, + "loss": 0.0005894452333450317, + "step": 198330 + }, + { + "epoch": 28.153300212916964, + "grad_norm": 0.15074580907821655, + "learning_rate": 7.18583392476934e-05, + "loss": 0.018449503183364867, + "step": 198340 + }, + { + "epoch": 28.15471965933286, + "grad_norm": 13.118376731872559, + "learning_rate": 7.18569198012775e-05, + "loss": 0.003278408944606781, + "step": 198350 + }, + { + "epoch": 28.156139105748757, + "grad_norm": 0.38612309098243713, + "learning_rate": 7.185550035486161e-05, + "loss": 0.0036501802504062654, + "step": 198360 + }, + { + "epoch": 28.157558552164655, + "grad_norm": 3.771529197692871, + "learning_rate": 7.185408090844571e-05, + "loss": 0.005601323768496513, + "step": 198370 + }, + { + "epoch": 28.158977998580554, + "grad_norm": 0.008784791454672813, + "learning_rate": 7.185266146202982e-05, + "loss": 0.013539333641529084, + "step": 198380 + }, + { + "epoch": 28.160397444996452, + "grad_norm": 0.01171286404132843, + "learning_rate": 7.185124201561392e-05, + "loss": 0.003564298152923584, + "step": 198390 + }, + { + "epoch": 28.16181689141235, + "grad_norm": 23.086275100708008, + "learning_rate": 7.184982256919801e-05, + "loss": 0.022434885799884795, + "step": 198400 + }, + { + "epoch": 28.16323633782825, + "grad_norm": 0.027869975194334984, + "learning_rate": 7.184840312278211e-05, + "loss": 0.0021821025758981703, + "step": 198410 + }, + { + "epoch": 28.164655784244143, + "grad_norm": 0.07125645130872726, + "learning_rate": 7.184698367636622e-05, + "loss": 0.055779337882995605, + "step": 198420 + }, + { + "epoch": 28.16607523066004, + "grad_norm": 0.021911224350333214, + "learning_rate": 7.184556422995032e-05, + "loss": 
0.012522101402282715, + "step": 198430 + }, + { + "epoch": 28.16749467707594, + "grad_norm": 7.393925666809082, + "learning_rate": 7.184414478353443e-05, + "loss": 0.018500661849975585, + "step": 198440 + }, + { + "epoch": 28.168914123491838, + "grad_norm": 31.21116828918457, + "learning_rate": 7.184272533711853e-05, + "loss": 0.015891030430793762, + "step": 198450 + }, + { + "epoch": 28.170333569907736, + "grad_norm": 0.11650710552930832, + "learning_rate": 7.184130589070263e-05, + "loss": 0.006344839930534363, + "step": 198460 + }, + { + "epoch": 28.171753016323635, + "grad_norm": 8.760160446166992, + "learning_rate": 7.183988644428674e-05, + "loss": 0.028544661402702332, + "step": 198470 + }, + { + "epoch": 28.173172462739533, + "grad_norm": 1.0291469097137451, + "learning_rate": 7.183846699787083e-05, + "loss": 0.02699621915817261, + "step": 198480 + }, + { + "epoch": 28.174591909155428, + "grad_norm": 0.005658768583089113, + "learning_rate": 7.183704755145494e-05, + "loss": 0.020065045356750487, + "step": 198490 + }, + { + "epoch": 28.176011355571326, + "grad_norm": 12.156325340270996, + "learning_rate": 7.183562810503904e-05, + "loss": 0.015373395383358001, + "step": 198500 + }, + { + "epoch": 28.176011355571326, + "eval_accuracy": 0.9895084885865073, + "eval_loss": 0.04113214462995529, + "eval_runtime": 32.3437, + "eval_samples_per_second": 486.247, + "eval_steps_per_second": 15.212, + "step": 198500 + }, + { + "epoch": 28.177430801987224, + "grad_norm": 2.3689663410186768, + "learning_rate": 7.183420865862314e-05, + "loss": 0.008307093381881714, + "step": 198510 + }, + { + "epoch": 28.178850248403123, + "grad_norm": 9.846131324768066, + "learning_rate": 7.183278921220724e-05, + "loss": 0.009163714200258254, + "step": 198520 + }, + { + "epoch": 28.18026969481902, + "grad_norm": 0.2540414333343506, + "learning_rate": 7.183136976579135e-05, + "loss": 0.0050778228789567946, + "step": 198530 + }, + { + "epoch": 28.18168914123492, + "grad_norm": 
0.9883116483688354, + "learning_rate": 7.182995031937544e-05, + "loss": 0.008128891885280608, + "step": 198540 + }, + { + "epoch": 28.183108587650818, + "grad_norm": 0.1541709154844284, + "learning_rate": 7.182853087295956e-05, + "loss": 0.02032747268676758, + "step": 198550 + }, + { + "epoch": 28.184528034066712, + "grad_norm": 0.07162367552518845, + "learning_rate": 7.182711142654365e-05, + "loss": 0.0007797852158546448, + "step": 198560 + }, + { + "epoch": 28.18594748048261, + "grad_norm": 0.028829459100961685, + "learning_rate": 7.182569198012775e-05, + "loss": 0.019412782788276673, + "step": 198570 + }, + { + "epoch": 28.18736692689851, + "grad_norm": 0.2605767846107483, + "learning_rate": 7.182427253371186e-05, + "loss": 0.013767091929912567, + "step": 198580 + }, + { + "epoch": 28.188786373314407, + "grad_norm": 1.3772989511489868, + "learning_rate": 7.182285308729596e-05, + "loss": 0.005471619591116905, + "step": 198590 + }, + { + "epoch": 28.190205819730306, + "grad_norm": 0.1791374385356903, + "learning_rate": 7.182143364088007e-05, + "loss": 0.006050721183419228, + "step": 198600 + }, + { + "epoch": 28.191625266146204, + "grad_norm": 0.015200753696262836, + "learning_rate": 7.182001419446415e-05, + "loss": 0.022774499654769898, + "step": 198610 + }, + { + "epoch": 28.193044712562102, + "grad_norm": 1.3517003059387207, + "learning_rate": 7.181859474804826e-05, + "loss": 0.009259931743144989, + "step": 198620 + }, + { + "epoch": 28.194464158977997, + "grad_norm": 0.04107215255498886, + "learning_rate": 7.181717530163236e-05, + "loss": 0.021351344883441925, + "step": 198630 + }, + { + "epoch": 28.195883605393895, + "grad_norm": 0.4483104944229126, + "learning_rate": 7.181575585521647e-05, + "loss": 0.02177102267742157, + "step": 198640 + }, + { + "epoch": 28.197303051809794, + "grad_norm": 0.024949485436081886, + "learning_rate": 7.181433640880057e-05, + "loss": 0.007724618911743164, + "step": 198650 + }, + { + "epoch": 28.198722498225692, + "grad_norm": 
0.01198497973382473, + "learning_rate": 7.181291696238467e-05, + "loss": 0.02693368196487427, + "step": 198660 + }, + { + "epoch": 28.20014194464159, + "grad_norm": 0.06220359355211258, + "learning_rate": 7.181149751596878e-05, + "loss": 0.002040211111307144, + "step": 198670 + }, + { + "epoch": 28.20156139105749, + "grad_norm": 2.3693745136260986, + "learning_rate": 7.181007806955288e-05, + "loss": 0.009099221974611282, + "step": 198680 + }, + { + "epoch": 28.202980837473387, + "grad_norm": 0.5160547494888306, + "learning_rate": 7.180865862313699e-05, + "loss": 0.0016773134469985963, + "step": 198690 + }, + { + "epoch": 28.20440028388928, + "grad_norm": 0.4052281379699707, + "learning_rate": 7.180723917672108e-05, + "loss": 0.006355802714824677, + "step": 198700 + }, + { + "epoch": 28.20581973030518, + "grad_norm": 0.03703802451491356, + "learning_rate": 7.180581973030518e-05, + "loss": 0.0018402468413114548, + "step": 198710 + }, + { + "epoch": 28.207239176721078, + "grad_norm": 0.028685545548796654, + "learning_rate": 7.180440028388928e-05, + "loss": 0.0025254156440496444, + "step": 198720 + }, + { + "epoch": 28.208658623136976, + "grad_norm": 0.2931233048439026, + "learning_rate": 7.180298083747339e-05, + "loss": 0.010500217974185943, + "step": 198730 + }, + { + "epoch": 28.210078069552875, + "grad_norm": 6.687626838684082, + "learning_rate": 7.180156139105749e-05, + "loss": 0.025516217947006224, + "step": 198740 + }, + { + "epoch": 28.211497515968773, + "grad_norm": 0.008754665963351727, + "learning_rate": 7.18001419446416e-05, + "loss": 0.003577231988310814, + "step": 198750 + }, + { + "epoch": 28.21291696238467, + "grad_norm": 0.2194419503211975, + "learning_rate": 7.17987224982257e-05, + "loss": 0.016340966522693633, + "step": 198760 + }, + { + "epoch": 28.214336408800566, + "grad_norm": 0.8157954216003418, + "learning_rate": 7.179730305180979e-05, + "loss": 0.03520582914352417, + "step": 198770 + }, + { + "epoch": 28.215755855216464, + "grad_norm": 
11.297907829284668, + "learning_rate": 7.17958836053939e-05, + "loss": 0.010773224383592605, + "step": 198780 + }, + { + "epoch": 28.217175301632363, + "grad_norm": 0.15266498923301697, + "learning_rate": 7.1794464158978e-05, + "loss": 0.02926681041717529, + "step": 198790 + }, + { + "epoch": 28.21859474804826, + "grad_norm": 0.022505726665258408, + "learning_rate": 7.179304471256211e-05, + "loss": 0.00801514759659767, + "step": 198800 + }, + { + "epoch": 28.22001419446416, + "grad_norm": 0.2645977735519409, + "learning_rate": 7.179162526614621e-05, + "loss": 0.012408627569675446, + "step": 198810 + }, + { + "epoch": 28.221433640880058, + "grad_norm": 0.29010510444641113, + "learning_rate": 7.179020581973031e-05, + "loss": 0.007039777934551239, + "step": 198820 + }, + { + "epoch": 28.222853087295956, + "grad_norm": 5.933087348937988, + "learning_rate": 7.17887863733144e-05, + "loss": 0.007259468734264374, + "step": 198830 + }, + { + "epoch": 28.22427253371185, + "grad_norm": 1.4356882572174072, + "learning_rate": 7.178736692689852e-05, + "loss": 0.003610849380493164, + "step": 198840 + }, + { + "epoch": 28.22569198012775, + "grad_norm": 0.3860517144203186, + "learning_rate": 7.178594748048261e-05, + "loss": 0.0037702079862356186, + "step": 198850 + }, + { + "epoch": 28.227111426543647, + "grad_norm": 8.35910701751709, + "learning_rate": 7.178452803406672e-05, + "loss": 0.0038511686027050017, + "step": 198860 + }, + { + "epoch": 28.228530872959546, + "grad_norm": 0.20437362790107727, + "learning_rate": 7.178310858765082e-05, + "loss": 0.00913584902882576, + "step": 198870 + }, + { + "epoch": 28.229950319375444, + "grad_norm": 0.5130347013473511, + "learning_rate": 7.178168914123492e-05, + "loss": 0.028618553280830385, + "step": 198880 + }, + { + "epoch": 28.231369765791342, + "grad_norm": 13.002286911010742, + "learning_rate": 7.178026969481903e-05, + "loss": 0.017091858386993408, + "step": 198890 + }, + { + "epoch": 28.23278921220724, + "grad_norm": 
7.101986885070801, + "learning_rate": 7.177885024840313e-05, + "loss": 0.01884608268737793, + "step": 198900 + }, + { + "epoch": 28.234208658623135, + "grad_norm": 0.0462958961725235, + "learning_rate": 7.177743080198724e-05, + "loss": 0.006444811820983887, + "step": 198910 + }, + { + "epoch": 28.235628105039034, + "grad_norm": 0.23477627336978912, + "learning_rate": 7.177601135557132e-05, + "loss": 0.0007540851831436158, + "step": 198920 + }, + { + "epoch": 28.237047551454932, + "grad_norm": 0.026311807334423065, + "learning_rate": 7.177459190915543e-05, + "loss": 0.023505859076976776, + "step": 198930 + }, + { + "epoch": 28.23846699787083, + "grad_norm": 3.452911138534546, + "learning_rate": 7.177317246273953e-05, + "loss": 0.004402830451726914, + "step": 198940 + }, + { + "epoch": 28.23988644428673, + "grad_norm": 2.5646767616271973, + "learning_rate": 7.177175301632364e-05, + "loss": 0.005493636429309845, + "step": 198950 + }, + { + "epoch": 28.241305890702627, + "grad_norm": 0.09005062282085419, + "learning_rate": 7.177033356990774e-05, + "loss": 0.020497852563858034, + "step": 198960 + }, + { + "epoch": 28.242725337118525, + "grad_norm": 0.030750466510653496, + "learning_rate": 7.176891412349184e-05, + "loss": 0.0037500407546758653, + "step": 198970 + }, + { + "epoch": 28.24414478353442, + "grad_norm": 1.5486388206481934, + "learning_rate": 7.176749467707595e-05, + "loss": 0.008060748875141143, + "step": 198980 + }, + { + "epoch": 28.24556422995032, + "grad_norm": 1.5135867595672607, + "learning_rate": 7.176607523066004e-05, + "loss": 0.018971875309944153, + "step": 198990 + }, + { + "epoch": 28.246983676366217, + "grad_norm": 4.243948936462402, + "learning_rate": 7.176465578424415e-05, + "loss": 0.031149354577064515, + "step": 199000 + }, + { + "epoch": 28.246983676366217, + "eval_accuracy": 0.9869015069625485, + "eval_loss": 0.056913524866104126, + "eval_runtime": 32.0073, + "eval_samples_per_second": 491.357, + "eval_steps_per_second": 15.371, + "step": 
199000 + }, + { + "epoch": 28.248403122782115, + "grad_norm": 0.008811921812593937, + "learning_rate": 7.176323633782825e-05, + "loss": 0.0014320608228445054, + "step": 199010 + }, + { + "epoch": 28.249822569198013, + "grad_norm": 1.7880204916000366, + "learning_rate": 7.176181689141235e-05, + "loss": 0.02174696922302246, + "step": 199020 + }, + { + "epoch": 28.25124201561391, + "grad_norm": 14.044326782226562, + "learning_rate": 7.176039744499645e-05, + "loss": 0.026323074102401735, + "step": 199030 + }, + { + "epoch": 28.25266146202981, + "grad_norm": 0.6186167001724243, + "learning_rate": 7.175897799858056e-05, + "loss": 0.022926564514636993, + "step": 199040 + }, + { + "epoch": 28.254080908445705, + "grad_norm": 0.16053526103496552, + "learning_rate": 7.175755855216465e-05, + "loss": 0.01862682104110718, + "step": 199050 + }, + { + "epoch": 28.255500354861603, + "grad_norm": 0.6314814686775208, + "learning_rate": 7.175613910574877e-05, + "loss": 0.0073211416602134705, + "step": 199060 + }, + { + "epoch": 28.2569198012775, + "grad_norm": 1.032983422279358, + "learning_rate": 7.175471965933286e-05, + "loss": 0.003830196335911751, + "step": 199070 + }, + { + "epoch": 28.2583392476934, + "grad_norm": 2.4819090366363525, + "learning_rate": 7.175330021291696e-05, + "loss": 0.003858542814850807, + "step": 199080 + }, + { + "epoch": 28.259758694109298, + "grad_norm": 0.8770925402641296, + "learning_rate": 7.175188076650107e-05, + "loss": 0.008220294117927551, + "step": 199090 + }, + { + "epoch": 28.261178140525196, + "grad_norm": 14.4719877243042, + "learning_rate": 7.175046132008517e-05, + "loss": 0.07126365303993225, + "step": 199100 + }, + { + "epoch": 28.262597586941094, + "grad_norm": 0.015148174948990345, + "learning_rate": 7.174904187366928e-05, + "loss": 0.014373761415481568, + "step": 199110 + }, + { + "epoch": 28.26401703335699, + "grad_norm": 4.965343952178955, + "learning_rate": 7.174762242725336e-05, + "loss": 0.037719517946243286, + "step": 199120 + }, + 
{ + "epoch": 28.265436479772887, + "grad_norm": 2.49552321434021, + "learning_rate": 7.174620298083747e-05, + "loss": 0.00984981432557106, + "step": 199130 + }, + { + "epoch": 28.266855926188786, + "grad_norm": 0.28633618354797363, + "learning_rate": 7.174478353442157e-05, + "loss": 0.0200341135263443, + "step": 199140 + }, + { + "epoch": 28.268275372604684, + "grad_norm": 0.054994624108076096, + "learning_rate": 7.174336408800568e-05, + "loss": 0.02704460918903351, + "step": 199150 + }, + { + "epoch": 28.269694819020582, + "grad_norm": 2.9216079711914062, + "learning_rate": 7.174194464158978e-05, + "loss": 0.002173898369073868, + "step": 199160 + }, + { + "epoch": 28.27111426543648, + "grad_norm": 0.2569696307182312, + "learning_rate": 7.174052519517389e-05, + "loss": 0.0018134471029043198, + "step": 199170 + }, + { + "epoch": 28.27253371185238, + "grad_norm": 0.01953631266951561, + "learning_rate": 7.173910574875799e-05, + "loss": 0.008378922939300537, + "step": 199180 + }, + { + "epoch": 28.273953158268274, + "grad_norm": 0.6290520429611206, + "learning_rate": 7.173768630234209e-05, + "loss": 0.04822434782981873, + "step": 199190 + }, + { + "epoch": 28.275372604684172, + "grad_norm": 0.6607650518417358, + "learning_rate": 7.17362668559262e-05, + "loss": 0.0004989232867956162, + "step": 199200 + }, + { + "epoch": 28.27679205110007, + "grad_norm": 0.046599987894296646, + "learning_rate": 7.17348474095103e-05, + "loss": 0.009035972505807876, + "step": 199210 + }, + { + "epoch": 28.27821149751597, + "grad_norm": 0.4652818739414215, + "learning_rate": 7.17334279630944e-05, + "loss": 0.0015244346112012863, + "step": 199220 + }, + { + "epoch": 28.279630943931867, + "grad_norm": 1.5538313388824463, + "learning_rate": 7.173200851667849e-05, + "loss": 0.05766324400901794, + "step": 199230 + }, + { + "epoch": 28.281050390347765, + "grad_norm": 0.011028612963855267, + "learning_rate": 7.17305890702626e-05, + "loss": 0.015723395347595214, + "step": 199240 + }, + { + "epoch": 
28.282469836763664, + "grad_norm": 0.023747524246573448, + "learning_rate": 7.17291696238467e-05, + "loss": 0.005992559716105461, + "step": 199250 + }, + { + "epoch": 28.28388928317956, + "grad_norm": 0.7069027423858643, + "learning_rate": 7.172775017743081e-05, + "loss": 0.01044887602329254, + "step": 199260 + }, + { + "epoch": 28.285308729595457, + "grad_norm": 0.13326124846935272, + "learning_rate": 7.172633073101492e-05, + "loss": 0.02148534804582596, + "step": 199270 + }, + { + "epoch": 28.286728176011355, + "grad_norm": 0.029064437374472618, + "learning_rate": 7.1724911284599e-05, + "loss": 0.05683501958847046, + "step": 199280 + }, + { + "epoch": 28.288147622427253, + "grad_norm": 0.7665511965751648, + "learning_rate": 7.172349183818311e-05, + "loss": 0.009576819837093353, + "step": 199290 + }, + { + "epoch": 28.28956706884315, + "grad_norm": 6.022651672363281, + "learning_rate": 7.172207239176721e-05, + "loss": 0.006903210282325744, + "step": 199300 + }, + { + "epoch": 28.29098651525905, + "grad_norm": 10.64051628112793, + "learning_rate": 7.172065294535132e-05, + "loss": 0.03999493420124054, + "step": 199310 + }, + { + "epoch": 28.292405961674948, + "grad_norm": 0.08619394153356552, + "learning_rate": 7.171923349893542e-05, + "loss": 0.00856240838766098, + "step": 199320 + }, + { + "epoch": 28.293825408090843, + "grad_norm": 0.27907872200012207, + "learning_rate": 7.171781405251952e-05, + "loss": 0.029816418886184692, + "step": 199330 + }, + { + "epoch": 28.29524485450674, + "grad_norm": 28.75792694091797, + "learning_rate": 7.171639460610361e-05, + "loss": 0.03215681314468384, + "step": 199340 + }, + { + "epoch": 28.29666430092264, + "grad_norm": 0.5865610837936401, + "learning_rate": 7.171497515968773e-05, + "loss": 0.0431529700756073, + "step": 199350 + }, + { + "epoch": 28.298083747338538, + "grad_norm": 2.3216912746429443, + "learning_rate": 7.171355571327184e-05, + "loss": 0.00583549365401268, + "step": 199360 + }, + { + "epoch": 28.299503193754436, 
+ "grad_norm": 0.10580426454544067, + "learning_rate": 7.171213626685593e-05, + "loss": 0.020073053240776063, + "step": 199370 + }, + { + "epoch": 28.300922640170334, + "grad_norm": 12.224957466125488, + "learning_rate": 7.171071682044003e-05, + "loss": 0.032395750284194946, + "step": 199380 + }, + { + "epoch": 28.302342086586233, + "grad_norm": 0.4430438280105591, + "learning_rate": 7.170929737402413e-05, + "loss": 0.017880120873451234, + "step": 199390 + }, + { + "epoch": 28.303761533002127, + "grad_norm": 26.471162796020508, + "learning_rate": 7.170787792760824e-05, + "loss": 0.05289289951324463, + "step": 199400 + }, + { + "epoch": 28.305180979418026, + "grad_norm": 0.3009703457355499, + "learning_rate": 7.170645848119234e-05, + "loss": 0.005714060738682747, + "step": 199410 + }, + { + "epoch": 28.306600425833924, + "grad_norm": 2.3661813735961914, + "learning_rate": 7.170503903477645e-05, + "loss": 0.019633665680885315, + "step": 199420 + }, + { + "epoch": 28.308019872249822, + "grad_norm": 7.0483479499816895, + "learning_rate": 7.170361958836053e-05, + "loss": 0.04410728216171265, + "step": 199430 + }, + { + "epoch": 28.30943931866572, + "grad_norm": 0.10131903737783432, + "learning_rate": 7.170220014194464e-05, + "loss": 0.01712010204792023, + "step": 199440 + }, + { + "epoch": 28.31085876508162, + "grad_norm": 0.09238538146018982, + "learning_rate": 7.170078069552875e-05, + "loss": 0.0025883521884679794, + "step": 199450 + }, + { + "epoch": 28.312278211497517, + "grad_norm": 4.472650051116943, + "learning_rate": 7.169936124911285e-05, + "loss": 0.06176661252975464, + "step": 199460 + }, + { + "epoch": 28.313697657913412, + "grad_norm": 0.11615011841058731, + "learning_rate": 7.169794180269696e-05, + "loss": 0.013327789306640626, + "step": 199470 + }, + { + "epoch": 28.31511710432931, + "grad_norm": 0.12947621941566467, + "learning_rate": 7.169652235628105e-05, + "loss": 0.012386147677898408, + "step": 199480 + }, + { + "epoch": 28.31653655074521, + 
"grad_norm": 1.6139812469482422, + "learning_rate": 7.169510290986516e-05, + "loss": 0.01017812266945839, + "step": 199490 + }, + { + "epoch": 28.317955997161107, + "grad_norm": 0.032480016350746155, + "learning_rate": 7.169368346344925e-05, + "loss": 0.002866869792342186, + "step": 199500 + }, + { + "epoch": 28.317955997161107, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.04443558305501938, + "eval_runtime": 31.7971, + "eval_samples_per_second": 494.606, + "eval_steps_per_second": 15.473, + "step": 199500 + }, + { + "epoch": 28.319375443577005, + "grad_norm": 0.024049408733844757, + "learning_rate": 7.169226401703336e-05, + "loss": 0.005744737759232521, + "step": 199510 + }, + { + "epoch": 28.320794889992904, + "grad_norm": 1.0568923950195312, + "learning_rate": 7.169084457061746e-05, + "loss": 0.010489782691001892, + "step": 199520 + }, + { + "epoch": 28.322214336408802, + "grad_norm": 0.04778973013162613, + "learning_rate": 7.168942512420157e-05, + "loss": 0.002705581858754158, + "step": 199530 + }, + { + "epoch": 28.323633782824697, + "grad_norm": 0.04460413008928299, + "learning_rate": 7.168800567778567e-05, + "loss": 0.0045249246060848234, + "step": 199540 + }, + { + "epoch": 28.325053229240595, + "grad_norm": 0.04954056441783905, + "learning_rate": 7.168658623136977e-05, + "loss": 0.001981794461607933, + "step": 199550 + }, + { + "epoch": 28.326472675656493, + "grad_norm": 0.30288413166999817, + "learning_rate": 7.168516678495388e-05, + "loss": 0.049859821796417236, + "step": 199560 + }, + { + "epoch": 28.32789212207239, + "grad_norm": 0.09125131368637085, + "learning_rate": 7.168374733853798e-05, + "loss": 0.020841658115386963, + "step": 199570 + }, + { + "epoch": 28.32931156848829, + "grad_norm": 0.03496944531798363, + "learning_rate": 7.168232789212209e-05, + "loss": 0.005935343354940415, + "step": 199580 + }, + { + "epoch": 28.330731014904188, + "grad_norm": 0.07191208750009537, + "learning_rate": 7.168090844570617e-05, + "loss": 
0.004611283168196678, + "step": 199590 + }, + { + "epoch": 28.332150461320087, + "grad_norm": 0.10300350189208984, + "learning_rate": 7.167948899929028e-05, + "loss": 0.010794149339199066, + "step": 199600 + }, + { + "epoch": 28.33356990773598, + "grad_norm": 7.027735233306885, + "learning_rate": 7.167806955287438e-05, + "loss": 0.01220475286245346, + "step": 199610 + }, + { + "epoch": 28.33498935415188, + "grad_norm": 0.5602710843086243, + "learning_rate": 7.167665010645849e-05, + "loss": 0.0009965412318706512, + "step": 199620 + }, + { + "epoch": 28.336408800567778, + "grad_norm": 0.002534120576456189, + "learning_rate": 7.167523066004259e-05, + "loss": 0.008245989680290222, + "step": 199630 + }, + { + "epoch": 28.337828246983676, + "grad_norm": 0.05539355054497719, + "learning_rate": 7.167381121362668e-05, + "loss": 0.006839954853057861, + "step": 199640 + }, + { + "epoch": 28.339247693399575, + "grad_norm": 1.7122923135757446, + "learning_rate": 7.16723917672108e-05, + "loss": 0.030858173966407776, + "step": 199650 + }, + { + "epoch": 28.340667139815473, + "grad_norm": 0.0066008479334414005, + "learning_rate": 7.167097232079489e-05, + "loss": 0.001421067863702774, + "step": 199660 + }, + { + "epoch": 28.34208658623137, + "grad_norm": 7.4089860916137695, + "learning_rate": 7.1669552874379e-05, + "loss": 0.019107531011104583, + "step": 199670 + }, + { + "epoch": 28.343506032647266, + "grad_norm": 1.4592430591583252, + "learning_rate": 7.16681334279631e-05, + "loss": 0.03201393187046051, + "step": 199680 + }, + { + "epoch": 28.344925479063164, + "grad_norm": 0.06095936521887779, + "learning_rate": 7.16667139815472e-05, + "loss": 0.005807359889149666, + "step": 199690 + }, + { + "epoch": 28.346344925479062, + "grad_norm": 0.014647936448454857, + "learning_rate": 7.16652945351313e-05, + "loss": 0.022246617078781127, + "step": 199700 + }, + { + "epoch": 28.34776437189496, + "grad_norm": 2.3722426891326904, + "learning_rate": 7.166387508871541e-05, + "loss": 
0.008600006997585296, + "step": 199710 + }, + { + "epoch": 28.34918381831086, + "grad_norm": 7.695586204528809, + "learning_rate": 7.16624556422995e-05, + "loss": 0.03615226447582245, + "step": 199720 + }, + { + "epoch": 28.350603264726757, + "grad_norm": 0.03161124885082245, + "learning_rate": 7.166103619588362e-05, + "loss": 0.0029363609850406648, + "step": 199730 + }, + { + "epoch": 28.352022711142656, + "grad_norm": 0.04630604013800621, + "learning_rate": 7.165961674946771e-05, + "loss": 0.009469537436962128, + "step": 199740 + }, + { + "epoch": 28.35344215755855, + "grad_norm": 0.009826400317251682, + "learning_rate": 7.165819730305181e-05, + "loss": 0.02344583123922348, + "step": 199750 + }, + { + "epoch": 28.35486160397445, + "grad_norm": 9.853654861450195, + "learning_rate": 7.165677785663592e-05, + "loss": 0.006203539296984672, + "step": 199760 + }, + { + "epoch": 28.356281050390347, + "grad_norm": 0.3137839138507843, + "learning_rate": 7.165535841022002e-05, + "loss": 0.0012408483773469925, + "step": 199770 + }, + { + "epoch": 28.357700496806245, + "grad_norm": 11.4270601272583, + "learning_rate": 7.165393896380413e-05, + "loss": 0.032888668775558474, + "step": 199780 + }, + { + "epoch": 28.359119943222144, + "grad_norm": 3.1693248748779297, + "learning_rate": 7.165251951738821e-05, + "loss": 0.013329139351844788, + "step": 199790 + }, + { + "epoch": 28.360539389638042, + "grad_norm": 0.006694951094686985, + "learning_rate": 7.165110007097232e-05, + "loss": 0.0017468467354774474, + "step": 199800 + }, + { + "epoch": 28.36195883605394, + "grad_norm": 0.27150556445121765, + "learning_rate": 7.164968062455642e-05, + "loss": 0.005452800914645195, + "step": 199810 + }, + { + "epoch": 28.363378282469835, + "grad_norm": 17.533571243286133, + "learning_rate": 7.164826117814053e-05, + "loss": 0.008142508566379547, + "step": 199820 + }, + { + "epoch": 28.364797728885733, + "grad_norm": 0.28385552763938904, + "learning_rate": 7.164684173172463e-05, + "loss": 
0.004101857170462608, + "step": 199830 + }, + { + "epoch": 28.36621717530163, + "grad_norm": 1.666866421699524, + "learning_rate": 7.164542228530873e-05, + "loss": 0.0014512058347463607, + "step": 199840 + }, + { + "epoch": 28.36763662171753, + "grad_norm": 0.4900670647621155, + "learning_rate": 7.164400283889284e-05, + "loss": 0.0020558807998895647, + "step": 199850 + }, + { + "epoch": 28.36905606813343, + "grad_norm": 0.7592693567276001, + "learning_rate": 7.164258339247694e-05, + "loss": 0.0011012662202119828, + "step": 199860 + }, + { + "epoch": 28.370475514549327, + "grad_norm": 0.01078292541205883, + "learning_rate": 7.164116394606105e-05, + "loss": 0.019426554441452026, + "step": 199870 + }, + { + "epoch": 28.371894960965225, + "grad_norm": 0.02211497724056244, + "learning_rate": 7.163974449964514e-05, + "loss": 0.00027601979672908783, + "step": 199880 + }, + { + "epoch": 28.37331440738112, + "grad_norm": 0.3910161256790161, + "learning_rate": 7.163832505322925e-05, + "loss": 0.03205226957798004, + "step": 199890 + }, + { + "epoch": 28.374733853797018, + "grad_norm": 0.04133027791976929, + "learning_rate": 7.163690560681334e-05, + "loss": 0.003622538596391678, + "step": 199900 + }, + { + "epoch": 28.376153300212916, + "grad_norm": 0.2644760310649872, + "learning_rate": 7.163548616039745e-05, + "loss": 0.00599166676402092, + "step": 199910 + }, + { + "epoch": 28.377572746628815, + "grad_norm": 0.2640079855918884, + "learning_rate": 7.163406671398155e-05, + "loss": 0.0070194870233535765, + "step": 199920 + }, + { + "epoch": 28.378992193044713, + "grad_norm": 0.23896099627017975, + "learning_rate": 7.163264726756566e-05, + "loss": 0.011405879259109497, + "step": 199930 + }, + { + "epoch": 28.38041163946061, + "grad_norm": 0.22049710154533386, + "learning_rate": 7.163122782114975e-05, + "loss": 0.0033630184829235078, + "step": 199940 + }, + { + "epoch": 28.38183108587651, + "grad_norm": 6.791482448577881, + "learning_rate": 7.162980837473385e-05, + "loss": 
0.038791999220848083, + "step": 199950 + }, + { + "epoch": 28.383250532292404, + "grad_norm": 0.3172001540660858, + "learning_rate": 7.162838892831796e-05, + "loss": 0.008529084920883178, + "step": 199960 + }, + { + "epoch": 28.384669978708303, + "grad_norm": 5.540979862213135, + "learning_rate": 7.162696948190206e-05, + "loss": 0.014269909262657166, + "step": 199970 + }, + { + "epoch": 28.3860894251242, + "grad_norm": 0.6270046234130859, + "learning_rate": 7.162555003548617e-05, + "loss": 0.018737608194351198, + "step": 199980 + }, + { + "epoch": 28.3875088715401, + "grad_norm": 0.029504798352718353, + "learning_rate": 7.162413058907027e-05, + "loss": 0.00437907911837101, + "step": 199990 + }, + { + "epoch": 28.388928317955997, + "grad_norm": 0.3192567825317383, + "learning_rate": 7.162271114265437e-05, + "loss": 0.005898748710751534, + "step": 200000 + }, + { + "epoch": 28.388928317955997, + "eval_accuracy": 0.9867107522095759, + "eval_loss": 0.05083899572491646, + "eval_runtime": 31.888, + "eval_samples_per_second": 493.195, + "eval_steps_per_second": 15.429, + "step": 200000 + }, + { + "epoch": 28.390347764371896, + "grad_norm": 0.014561841264367104, + "learning_rate": 7.162129169623846e-05, + "loss": 0.004395398870110511, + "step": 200010 + }, + { + "epoch": 28.391767210787794, + "grad_norm": 0.9770172834396362, + "learning_rate": 7.161987224982257e-05, + "loss": 0.005071106553077698, + "step": 200020 + }, + { + "epoch": 28.39318665720369, + "grad_norm": 0.4028139114379883, + "learning_rate": 7.161845280340667e-05, + "loss": 0.00954567790031433, + "step": 200030 + }, + { + "epoch": 28.394606103619587, + "grad_norm": 0.07561185956001282, + "learning_rate": 7.161703335699078e-05, + "loss": 0.0753661572933197, + "step": 200040 + }, + { + "epoch": 28.396025550035485, + "grad_norm": 0.3420023024082184, + "learning_rate": 7.161561391057488e-05, + "loss": 0.04048577547073364, + "step": 200050 + }, + { + "epoch": 28.397444996451384, + "grad_norm": 0.8845858573913574, 
+ "learning_rate": 7.161419446415898e-05, + "loss": 0.005366546660661697, + "step": 200060 + }, + { + "epoch": 28.398864442867282, + "grad_norm": 0.07881252467632294, + "learning_rate": 7.161277501774309e-05, + "loss": 0.0010100070387125016, + "step": 200070 + }, + { + "epoch": 28.40028388928318, + "grad_norm": 0.03851988911628723, + "learning_rate": 7.161135557132719e-05, + "loss": 0.020969492197036744, + "step": 200080 + }, + { + "epoch": 28.40170333569908, + "grad_norm": 0.022107118740677834, + "learning_rate": 7.16099361249113e-05, + "loss": 0.01428898423910141, + "step": 200090 + }, + { + "epoch": 28.403122782114973, + "grad_norm": 0.00697977002710104, + "learning_rate": 7.160851667849538e-05, + "loss": 0.0028931450098752974, + "step": 200100 + }, + { + "epoch": 28.40454222853087, + "grad_norm": 0.1388837993144989, + "learning_rate": 7.160709723207949e-05, + "loss": 0.0022350024431943894, + "step": 200110 + }, + { + "epoch": 28.40596167494677, + "grad_norm": 0.11818819493055344, + "learning_rate": 7.160567778566359e-05, + "loss": 0.005878336727619171, + "step": 200120 + }, + { + "epoch": 28.40738112136267, + "grad_norm": 3.0885379314422607, + "learning_rate": 7.16042583392477e-05, + "loss": 0.010032711178064346, + "step": 200130 + }, + { + "epoch": 28.408800567778567, + "grad_norm": 6.217708587646484, + "learning_rate": 7.16028388928318e-05, + "loss": 0.005864279344677925, + "step": 200140 + }, + { + "epoch": 28.410220014194465, + "grad_norm": 2.295971155166626, + "learning_rate": 7.16014194464159e-05, + "loss": 0.06425633430480956, + "step": 200150 + }, + { + "epoch": 28.411639460610363, + "grad_norm": 6.3594160079956055, + "learning_rate": 7.16e-05, + "loss": 0.013936981558799744, + "step": 200160 + }, + { + "epoch": 28.413058907026258, + "grad_norm": 25.968408584594727, + "learning_rate": 7.15985805535841e-05, + "loss": 0.04863928258419037, + "step": 200170 + }, + { + "epoch": 28.414478353442156, + "grad_norm": 8.849706649780273, + "learning_rate": 
7.159716110716821e-05, + "loss": 0.02050606906414032, + "step": 200180 + }, + { + "epoch": 28.415897799858055, + "grad_norm": 1.0743964910507202, + "learning_rate": 7.159574166075231e-05, + "loss": 0.0019052378833293916, + "step": 200190 + }, + { + "epoch": 28.417317246273953, + "grad_norm": 7.449214935302734, + "learning_rate": 7.159432221433641e-05, + "loss": 0.018017114698886873, + "step": 200200 + }, + { + "epoch": 28.41873669268985, + "grad_norm": 0.3196099102497101, + "learning_rate": 7.15929027679205e-05, + "loss": 0.013986250758171082, + "step": 200210 + }, + { + "epoch": 28.42015613910575, + "grad_norm": 0.03765508905053139, + "learning_rate": 7.159148332150462e-05, + "loss": 0.030044052004814147, + "step": 200220 + }, + { + "epoch": 28.421575585521648, + "grad_norm": 0.22359715402126312, + "learning_rate": 7.159006387508871e-05, + "loss": 0.0029845152050256727, + "step": 200230 + }, + { + "epoch": 28.422995031937543, + "grad_norm": 5.429706573486328, + "learning_rate": 7.158864442867283e-05, + "loss": 0.029593482613563538, + "step": 200240 + }, + { + "epoch": 28.42441447835344, + "grad_norm": 0.14061623811721802, + "learning_rate": 7.158736692689851e-05, + "loss": 0.03704796135425568, + "step": 200250 + }, + { + "epoch": 28.42583392476934, + "grad_norm": 0.5723813772201538, + "learning_rate": 7.158594748048262e-05, + "loss": 0.02042677104473114, + "step": 200260 + }, + { + "epoch": 28.427253371185238, + "grad_norm": 0.08260179311037064, + "learning_rate": 7.158452803406672e-05, + "loss": 0.0069740571081638334, + "step": 200270 + }, + { + "epoch": 28.428672817601136, + "grad_norm": 0.9766865968704224, + "learning_rate": 7.158310858765082e-05, + "loss": 0.047246554493904115, + "step": 200280 + }, + { + "epoch": 28.430092264017034, + "grad_norm": 0.7394446730613708, + "learning_rate": 7.158168914123491e-05, + "loss": 0.01779528856277466, + "step": 200290 + }, + { + "epoch": 28.431511710432932, + "grad_norm": 0.4314347803592682, + "learning_rate": 
7.158026969481902e-05, + "loss": 0.010377894341945647, + "step": 200300 + }, + { + "epoch": 28.432931156848827, + "grad_norm": 1.1557605266571045, + "learning_rate": 7.157885024840314e-05, + "loss": 0.004726681113243103, + "step": 200310 + }, + { + "epoch": 28.434350603264726, + "grad_norm": 3.391603469848633, + "learning_rate": 7.157743080198723e-05, + "loss": 0.008342590183019638, + "step": 200320 + }, + { + "epoch": 28.435770049680624, + "grad_norm": 0.6680954098701477, + "learning_rate": 7.157601135557133e-05, + "loss": 0.01974293291568756, + "step": 200330 + }, + { + "epoch": 28.437189496096522, + "grad_norm": 0.17281599342823029, + "learning_rate": 7.157459190915543e-05, + "loss": 0.08389627933502197, + "step": 200340 + }, + { + "epoch": 28.43860894251242, + "grad_norm": 0.04211330786347389, + "learning_rate": 7.157317246273954e-05, + "loss": 0.01716255247592926, + "step": 200350 + }, + { + "epoch": 28.44002838892832, + "grad_norm": 0.4659997820854187, + "learning_rate": 7.157175301632364e-05, + "loss": 0.04983566999435425, + "step": 200360 + }, + { + "epoch": 28.441447835344217, + "grad_norm": 14.288187026977539, + "learning_rate": 7.157033356990775e-05, + "loss": 0.029674240946769716, + "step": 200370 + }, + { + "epoch": 28.442867281760112, + "grad_norm": 0.10877423733472824, + "learning_rate": 7.156891412349183e-05, + "loss": 0.019133806228637695, + "step": 200380 + }, + { + "epoch": 28.44428672817601, + "grad_norm": 0.11252165585756302, + "learning_rate": 7.156749467707594e-05, + "loss": 0.0023531246930360793, + "step": 200390 + }, + { + "epoch": 28.44570617459191, + "grad_norm": 0.0991859957575798, + "learning_rate": 7.156607523066005e-05, + "loss": 0.012050290405750275, + "step": 200400 + }, + { + "epoch": 28.447125621007807, + "grad_norm": 0.07916679978370667, + "learning_rate": 7.156465578424415e-05, + "loss": 0.0018652509897947311, + "step": 200410 + }, + { + "epoch": 28.448545067423705, + "grad_norm": 0.030273517593741417, + "learning_rate": 
7.156323633782826e-05, + "loss": 0.004591937735676765, + "step": 200420 + }, + { + "epoch": 28.449964513839603, + "grad_norm": 14.821060180664062, + "learning_rate": 7.156181689141234e-05, + "loss": 0.02718967795372009, + "step": 200430 + }, + { + "epoch": 28.4513839602555, + "grad_norm": 0.01562479417771101, + "learning_rate": 7.156039744499646e-05, + "loss": 0.004899428784847259, + "step": 200440 + }, + { + "epoch": 28.4528034066714, + "grad_norm": 0.1828615963459015, + "learning_rate": 7.155897799858055e-05, + "loss": 0.0005543474107980728, + "step": 200450 + }, + { + "epoch": 28.454222853087295, + "grad_norm": 1.9369022846221924, + "learning_rate": 7.155755855216466e-05, + "loss": 0.014918106794357299, + "step": 200460 + }, + { + "epoch": 28.455642299503193, + "grad_norm": 1.4819496870040894, + "learning_rate": 7.155613910574876e-05, + "loss": 0.00841212272644043, + "step": 200470 + }, + { + "epoch": 28.45706174591909, + "grad_norm": 0.9587283134460449, + "learning_rate": 7.155471965933286e-05, + "loss": 0.007040657103061676, + "step": 200480 + }, + { + "epoch": 28.45848119233499, + "grad_norm": 5.117340087890625, + "learning_rate": 7.155330021291697e-05, + "loss": 0.014197072386741639, + "step": 200490 + }, + { + "epoch": 28.459900638750888, + "grad_norm": 5.314828872680664, + "learning_rate": 7.155188076650107e-05, + "loss": 0.0159568190574646, + "step": 200500 + }, + { + "epoch": 28.459900638750888, + "eval_accuracy": 0.9829592420677815, + "eval_loss": 0.06781262904405594, + "eval_runtime": 31.6076, + "eval_samples_per_second": 497.57, + "eval_steps_per_second": 15.566, + "step": 200500 + }, + { + "epoch": 28.461320085166786, + "grad_norm": 0.03883549943566322, + "learning_rate": 7.155046132008518e-05, + "loss": 0.020783630013465882, + "step": 200510 + }, + { + "epoch": 28.462739531582685, + "grad_norm": 0.0678185299038887, + "learning_rate": 7.154904187366927e-05, + "loss": 0.0058690238744020466, + "step": 200520 + }, + { + "epoch": 28.46415897799858, + 
"grad_norm": 0.04663262516260147, + "learning_rate": 7.154762242725339e-05, + "loss": 0.042443478107452394, + "step": 200530 + }, + { + "epoch": 28.465578424414478, + "grad_norm": 0.006782514974474907, + "learning_rate": 7.154620298083747e-05, + "loss": 0.00197218656539917, + "step": 200540 + }, + { + "epoch": 28.466997870830376, + "grad_norm": 8.3377103805542, + "learning_rate": 7.154478353442158e-05, + "loss": 0.013887104392051697, + "step": 200550 + }, + { + "epoch": 28.468417317246274, + "grad_norm": 0.019197314977645874, + "learning_rate": 7.154336408800568e-05, + "loss": 0.004338526353240013, + "step": 200560 + }, + { + "epoch": 28.469836763662173, + "grad_norm": 0.08462105691432953, + "learning_rate": 7.154194464158979e-05, + "loss": 0.006431084871292114, + "step": 200570 + }, + { + "epoch": 28.47125621007807, + "grad_norm": 0.007355148904025555, + "learning_rate": 7.154052519517389e-05, + "loss": 0.008576014637947082, + "step": 200580 + }, + { + "epoch": 28.47267565649397, + "grad_norm": 0.6348217725753784, + "learning_rate": 7.153910574875798e-05, + "loss": 0.0089887335896492, + "step": 200590 + }, + { + "epoch": 28.474095102909864, + "grad_norm": 1.2608503103256226, + "learning_rate": 7.15376863023421e-05, + "loss": 0.026647067070007323, + "step": 200600 + }, + { + "epoch": 28.475514549325762, + "grad_norm": 0.007455672603100538, + "learning_rate": 7.153626685592619e-05, + "loss": 0.017994345724582674, + "step": 200610 + }, + { + "epoch": 28.47693399574166, + "grad_norm": 0.0659957230091095, + "learning_rate": 7.15348474095103e-05, + "loss": 0.013207972049713135, + "step": 200620 + }, + { + "epoch": 28.47835344215756, + "grad_norm": 1.5482605695724487, + "learning_rate": 7.15334279630944e-05, + "loss": 0.008870060741901397, + "step": 200630 + }, + { + "epoch": 28.479772888573457, + "grad_norm": 0.45310452580451965, + "learning_rate": 7.15320085166785e-05, + "loss": 0.002311305329203606, + "step": 200640 + }, + { + "epoch": 28.481192334989355, + 
"grad_norm": 0.10342030972242355, + "learning_rate": 7.15305890702626e-05, + "loss": 0.007543880492448807, + "step": 200650 + }, + { + "epoch": 28.482611781405254, + "grad_norm": 0.15141022205352783, + "learning_rate": 7.15291696238467e-05, + "loss": 0.014769670367240906, + "step": 200660 + }, + { + "epoch": 28.48403122782115, + "grad_norm": 0.30544427037239075, + "learning_rate": 7.15277501774308e-05, + "loss": 0.0033348985016345978, + "step": 200670 + }, + { + "epoch": 28.485450674237047, + "grad_norm": 0.028014348819851875, + "learning_rate": 7.152633073101491e-05, + "loss": 0.005842993780970573, + "step": 200680 + }, + { + "epoch": 28.486870120652945, + "grad_norm": 2.2038397789001465, + "learning_rate": 7.152491128459901e-05, + "loss": 0.020639370381832122, + "step": 200690 + }, + { + "epoch": 28.488289567068843, + "grad_norm": 0.15369045734405518, + "learning_rate": 7.152349183818311e-05, + "loss": 0.0129535973072052, + "step": 200700 + }, + { + "epoch": 28.48970901348474, + "grad_norm": 0.13632221519947052, + "learning_rate": 7.152207239176722e-05, + "loss": 0.027351272106170655, + "step": 200710 + }, + { + "epoch": 28.49112845990064, + "grad_norm": 0.46449920535087585, + "learning_rate": 7.152065294535132e-05, + "loss": 0.04103163480758667, + "step": 200720 + }, + { + "epoch": 28.49254790631654, + "grad_norm": 2.1333017349243164, + "learning_rate": 7.151923349893543e-05, + "loss": 0.0031131260097026826, + "step": 200730 + }, + { + "epoch": 28.493967352732433, + "grad_norm": 0.13914811611175537, + "learning_rate": 7.151781405251951e-05, + "loss": 0.003475377708673477, + "step": 200740 + }, + { + "epoch": 28.49538679914833, + "grad_norm": 0.10311398655176163, + "learning_rate": 7.151639460610362e-05, + "loss": 0.022569075226783752, + "step": 200750 + }, + { + "epoch": 28.49680624556423, + "grad_norm": 0.025364236906170845, + "learning_rate": 7.151497515968772e-05, + "loss": 0.022877174615859985, + "step": 200760 + }, + { + "epoch": 28.498225691980128, + 
"grad_norm": 12.36310863494873, + "learning_rate": 7.151355571327183e-05, + "loss": 0.012478172779083252, + "step": 200770 + }, + { + "epoch": 28.499645138396026, + "grad_norm": 0.006817180663347244, + "learning_rate": 7.151213626685593e-05, + "loss": 0.03067971467971802, + "step": 200780 + }, + { + "epoch": 28.501064584811925, + "grad_norm": 0.08835793286561966, + "learning_rate": 7.151071682044003e-05, + "loss": 0.010280901938676834, + "step": 200790 + }, + { + "epoch": 28.502484031227823, + "grad_norm": 0.11883528530597687, + "learning_rate": 7.150929737402414e-05, + "loss": 0.00915006548166275, + "step": 200800 + }, + { + "epoch": 28.503903477643718, + "grad_norm": 0.6633992791175842, + "learning_rate": 7.150787792760823e-05, + "loss": 0.021029695868492126, + "step": 200810 + }, + { + "epoch": 28.505322924059616, + "grad_norm": 1.0163832902908325, + "learning_rate": 7.150645848119235e-05, + "loss": 0.004828060418367386, + "step": 200820 + }, + { + "epoch": 28.506742370475514, + "grad_norm": 0.8251711130142212, + "learning_rate": 7.150503903477644e-05, + "loss": 0.025202789902687074, + "step": 200830 + }, + { + "epoch": 28.508161816891413, + "grad_norm": 0.033074576407670975, + "learning_rate": 7.150361958836054e-05, + "loss": 0.006814651936292648, + "step": 200840 + }, + { + "epoch": 28.50958126330731, + "grad_norm": 0.03900003060698509, + "learning_rate": 7.150220014194464e-05, + "loss": 0.034520980715751645, + "step": 200850 + }, + { + "epoch": 28.51100070972321, + "grad_norm": 0.32719460129737854, + "learning_rate": 7.150078069552875e-05, + "loss": 0.0017938043922185898, + "step": 200860 + }, + { + "epoch": 28.512420156139108, + "grad_norm": 0.1817377507686615, + "learning_rate": 7.149936124911285e-05, + "loss": 0.058192473649978635, + "step": 200870 + }, + { + "epoch": 28.513839602555002, + "grad_norm": 0.022970762103796005, + "learning_rate": 7.149794180269696e-05, + "loss": 0.0035081543028354645, + "step": 200880 + }, + { + "epoch": 28.5152590489709, + 
"grad_norm": 0.10131566971540451, + "learning_rate": 7.149652235628105e-05, + "loss": 0.04994455575942993, + "step": 200890 + }, + { + "epoch": 28.5166784953868, + "grad_norm": 0.22325262427330017, + "learning_rate": 7.149510290986515e-05, + "loss": 0.0005334418267011643, + "step": 200900 + }, + { + "epoch": 28.518097941802697, + "grad_norm": 0.9875782132148743, + "learning_rate": 7.149368346344926e-05, + "loss": 0.0031042627990245817, + "step": 200910 + }, + { + "epoch": 28.519517388218595, + "grad_norm": 2.293247938156128, + "learning_rate": 7.149226401703336e-05, + "loss": 0.001432936266064644, + "step": 200920 + }, + { + "epoch": 28.520936834634494, + "grad_norm": 0.012843864969909191, + "learning_rate": 7.149084457061747e-05, + "loss": 0.016012361645698546, + "step": 200930 + }, + { + "epoch": 28.522356281050392, + "grad_norm": 0.007810906507074833, + "learning_rate": 7.148942512420157e-05, + "loss": 0.036703407764434814, + "step": 200940 + }, + { + "epoch": 28.523775727466287, + "grad_norm": 4.8597412109375, + "learning_rate": 7.148800567778567e-05, + "loss": 0.01216132789850235, + "step": 200950 + }, + { + "epoch": 28.525195173882185, + "grad_norm": 0.05600254610180855, + "learning_rate": 7.148658623136976e-05, + "loss": 0.009236736595630646, + "step": 200960 + }, + { + "epoch": 28.526614620298083, + "grad_norm": 0.008248250931501389, + "learning_rate": 7.148516678495387e-05, + "loss": 0.023302638530731203, + "step": 200970 + }, + { + "epoch": 28.528034066713982, + "grad_norm": 1.464361548423767, + "learning_rate": 7.148374733853797e-05, + "loss": 0.0209849551320076, + "step": 200980 + }, + { + "epoch": 28.52945351312988, + "grad_norm": 3.5074026584625244, + "learning_rate": 7.148232789212208e-05, + "loss": 0.002058819681406021, + "step": 200990 + }, + { + "epoch": 28.53087295954578, + "grad_norm": 0.12655140459537506, + "learning_rate": 7.148090844570618e-05, + "loss": 0.03494569361209869, + "step": 201000 + }, + { + "epoch": 28.53087295954578, + 
"eval_accuracy": 0.9807337699497679, + "eval_loss": 0.07954235374927521, + "eval_runtime": 31.9634, + "eval_samples_per_second": 492.032, + "eval_steps_per_second": 15.393, + "step": 201000 + }, + { + "epoch": 28.532292405961677, + "grad_norm": 0.14185269176959991, + "learning_rate": 7.147948899929028e-05, + "loss": 0.027273494005203246, + "step": 201010 + }, + { + "epoch": 28.53371185237757, + "grad_norm": 0.022591564804315567, + "learning_rate": 7.147806955287439e-05, + "loss": 0.07053292989730835, + "step": 201020 + }, + { + "epoch": 28.53513129879347, + "grad_norm": 0.029211899265646935, + "learning_rate": 7.147665010645849e-05, + "loss": 0.00714716911315918, + "step": 201030 + }, + { + "epoch": 28.536550745209368, + "grad_norm": 1.3933457136154175, + "learning_rate": 7.14752306600426e-05, + "loss": 0.05901137590408325, + "step": 201040 + }, + { + "epoch": 28.537970191625266, + "grad_norm": 0.8518321514129639, + "learning_rate": 7.147381121362668e-05, + "loss": 0.019621752202510834, + "step": 201050 + }, + { + "epoch": 28.539389638041165, + "grad_norm": 7.072301864624023, + "learning_rate": 7.147239176721079e-05, + "loss": 0.04047700762748718, + "step": 201060 + }, + { + "epoch": 28.540809084457063, + "grad_norm": 3.295497179031372, + "learning_rate": 7.147097232079489e-05, + "loss": 0.08376761674880981, + "step": 201070 + }, + { + "epoch": 28.54222853087296, + "grad_norm": 0.1964966505765915, + "learning_rate": 7.1469552874379e-05, + "loss": 0.021678704023361205, + "step": 201080 + }, + { + "epoch": 28.543647977288856, + "grad_norm": 0.12770269811153412, + "learning_rate": 7.14681334279631e-05, + "loss": 0.04885146915912628, + "step": 201090 + }, + { + "epoch": 28.545067423704754, + "grad_norm": 0.01389195304363966, + "learning_rate": 7.14667139815472e-05, + "loss": 0.10965036153793335, + "step": 201100 + }, + { + "epoch": 28.546486870120653, + "grad_norm": 0.023742400109767914, + "learning_rate": 7.14652945351313e-05, + "loss": 0.001628207042813301, + "step": 
201110 + }, + { + "epoch": 28.54790631653655, + "grad_norm": 0.016777511686086655, + "learning_rate": 7.14638750887154e-05, + "loss": 0.008158596605062485, + "step": 201120 + }, + { + "epoch": 28.54932576295245, + "grad_norm": 0.20625270903110504, + "learning_rate": 7.146245564229951e-05, + "loss": 0.028449133038520813, + "step": 201130 + }, + { + "epoch": 28.550745209368348, + "grad_norm": 0.3594304323196411, + "learning_rate": 7.146103619588361e-05, + "loss": 0.0013781752437353133, + "step": 201140 + }, + { + "epoch": 28.552164655784246, + "grad_norm": 14.042330741882324, + "learning_rate": 7.145961674946771e-05, + "loss": 0.08049511909484863, + "step": 201150 + }, + { + "epoch": 28.55358410220014, + "grad_norm": 0.0031500798650085926, + "learning_rate": 7.14581973030518e-05, + "loss": 0.00548042431473732, + "step": 201160 + }, + { + "epoch": 28.55500354861604, + "grad_norm": 2.3468847274780273, + "learning_rate": 7.145677785663592e-05, + "loss": 0.014012959599494935, + "step": 201170 + }, + { + "epoch": 28.556422995031937, + "grad_norm": 0.021840177476406097, + "learning_rate": 7.145535841022001e-05, + "loss": 0.0247272789478302, + "step": 201180 + }, + { + "epoch": 28.557842441447836, + "grad_norm": 3.7212748527526855, + "learning_rate": 7.145393896380412e-05, + "loss": 0.021968531608581542, + "step": 201190 + }, + { + "epoch": 28.559261887863734, + "grad_norm": 5.185697078704834, + "learning_rate": 7.145251951738822e-05, + "loss": 0.013309815526008606, + "step": 201200 + }, + { + "epoch": 28.560681334279632, + "grad_norm": 0.13574771583080292, + "learning_rate": 7.145110007097232e-05, + "loss": 0.01869424432516098, + "step": 201210 + }, + { + "epoch": 28.56210078069553, + "grad_norm": 0.39928263425827026, + "learning_rate": 7.144968062455643e-05, + "loss": 0.016870419681072234, + "step": 201220 + }, + { + "epoch": 28.563520227111425, + "grad_norm": 0.07718445360660553, + "learning_rate": 7.144826117814053e-05, + "loss": 0.0063089638948440555, + "step": 201230 
+ }, + { + "epoch": 28.564939673527324, + "grad_norm": 0.027910679578781128, + "learning_rate": 7.144684173172464e-05, + "loss": 0.0029162537306547166, + "step": 201240 + }, + { + "epoch": 28.566359119943222, + "grad_norm": 0.18650773167610168, + "learning_rate": 7.144542228530874e-05, + "loss": 0.02237192392349243, + "step": 201250 + }, + { + "epoch": 28.56777856635912, + "grad_norm": 0.044792938977479935, + "learning_rate": 7.144400283889283e-05, + "loss": 0.023907901346683504, + "step": 201260 + }, + { + "epoch": 28.56919801277502, + "grad_norm": 0.07591357082128525, + "learning_rate": 7.144258339247693e-05, + "loss": 0.008483633399009705, + "step": 201270 + }, + { + "epoch": 28.570617459190917, + "grad_norm": 1.7794528007507324, + "learning_rate": 7.144116394606104e-05, + "loss": 0.0021972041577100754, + "step": 201280 + }, + { + "epoch": 28.572036905606815, + "grad_norm": 0.01824699155986309, + "learning_rate": 7.143974449964514e-05, + "loss": 0.020380808413028716, + "step": 201290 + }, + { + "epoch": 28.57345635202271, + "grad_norm": 0.18070217967033386, + "learning_rate": 7.143832505322925e-05, + "loss": 0.004849312826991081, + "step": 201300 + }, + { + "epoch": 28.574875798438608, + "grad_norm": 0.30356737971305847, + "learning_rate": 7.143690560681335e-05, + "loss": 0.0023828066885471346, + "step": 201310 + }, + { + "epoch": 28.576295244854506, + "grad_norm": 0.10645544528961182, + "learning_rate": 7.143548616039744e-05, + "loss": 0.028287625312805174, + "step": 201320 + }, + { + "epoch": 28.577714691270405, + "grad_norm": 12.533944129943848, + "learning_rate": 7.143406671398156e-05, + "loss": 0.02543233036994934, + "step": 201330 + }, + { + "epoch": 28.579134137686303, + "grad_norm": 0.011086253449320793, + "learning_rate": 7.143264726756565e-05, + "loss": 0.01214851513504982, + "step": 201340 + }, + { + "epoch": 28.5805535841022, + "grad_norm": 2.9527010917663574, + "learning_rate": 7.143122782114976e-05, + "loss": 0.024786420166492462, + "step": 201350 
+ }, + { + "epoch": 28.5819730305181, + "grad_norm": 0.44980815052986145, + "learning_rate": 7.142980837473385e-05, + "loss": 0.004541192576289177, + "step": 201360 + }, + { + "epoch": 28.583392476933994, + "grad_norm": 1.4496196508407593, + "learning_rate": 7.142838892831796e-05, + "loss": 0.0028324633836746215, + "step": 201370 + }, + { + "epoch": 28.584811923349893, + "grad_norm": 0.051483552902936935, + "learning_rate": 7.142696948190206e-05, + "loss": 0.01790556013584137, + "step": 201380 + }, + { + "epoch": 28.58623136976579, + "grad_norm": 0.0562250055372715, + "learning_rate": 7.142555003548617e-05, + "loss": 0.001960672438144684, + "step": 201390 + }, + { + "epoch": 28.58765081618169, + "grad_norm": 0.2087841033935547, + "learning_rate": 7.142413058907026e-05, + "loss": 0.01514289528131485, + "step": 201400 + }, + { + "epoch": 28.589070262597588, + "grad_norm": 0.02181023731827736, + "learning_rate": 7.142271114265436e-05, + "loss": 0.006732319295406341, + "step": 201410 + }, + { + "epoch": 28.590489709013486, + "grad_norm": 0.33355823159217834, + "learning_rate": 7.142129169623847e-05, + "loss": 0.01378794014453888, + "step": 201420 + }, + { + "epoch": 28.591909155429384, + "grad_norm": 0.14407022297382355, + "learning_rate": 7.141987224982257e-05, + "loss": 0.03384703993797302, + "step": 201430 + }, + { + "epoch": 28.59332860184528, + "grad_norm": 0.062032874673604965, + "learning_rate": 7.141845280340668e-05, + "loss": 0.007128275185823441, + "step": 201440 + }, + { + "epoch": 28.594748048261177, + "grad_norm": 13.760283470153809, + "learning_rate": 7.141703335699078e-05, + "loss": 0.052574926614761354, + "step": 201450 + }, + { + "epoch": 28.596167494677076, + "grad_norm": 0.05171125754714012, + "learning_rate": 7.141561391057488e-05, + "loss": 0.0598525881767273, + "step": 201460 + }, + { + "epoch": 28.597586941092974, + "grad_norm": 0.20313721895217896, + "learning_rate": 7.141419446415897e-05, + "loss": 0.005070237070322036, + "step": 201470 + }, + 
{ + "epoch": 28.599006387508872, + "grad_norm": 0.021224332973361015, + "learning_rate": 7.141277501774308e-05, + "loss": 0.014394518733024598, + "step": 201480 + }, + { + "epoch": 28.60042583392477, + "grad_norm": 1.5956634283065796, + "learning_rate": 7.141135557132718e-05, + "loss": 0.0018597181886434555, + "step": 201490 + }, + { + "epoch": 28.60184528034067, + "grad_norm": 0.1200406402349472, + "learning_rate": 7.140993612491129e-05, + "loss": 0.02257717102766037, + "step": 201500 + }, + { + "epoch": 28.60184528034067, + "eval_accuracy": 0.9886182997393018, + "eval_loss": 0.0518113449215889, + "eval_runtime": 31.4303, + "eval_samples_per_second": 500.378, + "eval_steps_per_second": 15.654, + "step": 201500 + }, + { + "epoch": 28.603264726756564, + "grad_norm": 0.30429205298423767, + "learning_rate": 7.140851667849539e-05, + "loss": 0.0012912750244140625, + "step": 201510 + }, + { + "epoch": 28.604684173172462, + "grad_norm": 3.921814441680908, + "learning_rate": 7.140709723207949e-05, + "loss": 0.00397743284702301, + "step": 201520 + }, + { + "epoch": 28.60610361958836, + "grad_norm": 0.521428644657135, + "learning_rate": 7.14056777856636e-05, + "loss": 0.00518798828125, + "step": 201530 + }, + { + "epoch": 28.60752306600426, + "grad_norm": 0.2522902488708496, + "learning_rate": 7.14042583392477e-05, + "loss": 0.017829208076000212, + "step": 201540 + }, + { + "epoch": 28.608942512420157, + "grad_norm": 0.012413930147886276, + "learning_rate": 7.14028388928318e-05, + "loss": 0.003584745526313782, + "step": 201550 + }, + { + "epoch": 28.610361958836055, + "grad_norm": 0.17087195813655853, + "learning_rate": 7.140141944641589e-05, + "loss": 0.00894251987338066, + "step": 201560 + }, + { + "epoch": 28.611781405251953, + "grad_norm": 0.1453462541103363, + "learning_rate": 7.14e-05, + "loss": 0.014499421417713165, + "step": 201570 + }, + { + "epoch": 28.613200851667848, + "grad_norm": 0.07308740168809891, + "learning_rate": 7.13985805535841e-05, + "loss": 
0.009743786603212356, + "step": 201580 + }, + { + "epoch": 28.614620298083747, + "grad_norm": 0.10230106115341187, + "learning_rate": 7.139716110716821e-05, + "loss": 0.0029768593609333037, + "step": 201590 + }, + { + "epoch": 28.616039744499645, + "grad_norm": 0.1716940850019455, + "learning_rate": 7.139574166075232e-05, + "loss": 0.005616125836968422, + "step": 201600 + }, + { + "epoch": 28.617459190915543, + "grad_norm": 0.018863942474126816, + "learning_rate": 7.139432221433642e-05, + "loss": 0.0012187201529741288, + "step": 201610 + }, + { + "epoch": 28.61887863733144, + "grad_norm": 0.010432790033519268, + "learning_rate": 7.139290276792051e-05, + "loss": 0.037242719531059267, + "step": 201620 + }, + { + "epoch": 28.62029808374734, + "grad_norm": 0.7015812397003174, + "learning_rate": 7.139148332150461e-05, + "loss": 0.019269508123397828, + "step": 201630 + }, + { + "epoch": 28.621717530163238, + "grad_norm": 0.13906748592853546, + "learning_rate": 7.139006387508872e-05, + "loss": 0.043480795621871945, + "step": 201640 + }, + { + "epoch": 28.623136976579133, + "grad_norm": 0.12905274331569672, + "learning_rate": 7.138864442867282e-05, + "loss": 0.0058868277817964556, + "step": 201650 + }, + { + "epoch": 28.62455642299503, + "grad_norm": 0.3728729784488678, + "learning_rate": 7.138722498225693e-05, + "loss": 0.03309263288974762, + "step": 201660 + }, + { + "epoch": 28.62597586941093, + "grad_norm": 0.03888630494475365, + "learning_rate": 7.138580553584102e-05, + "loss": 0.008027173578739166, + "step": 201670 + }, + { + "epoch": 28.627395315826828, + "grad_norm": 0.011150514706969261, + "learning_rate": 7.138438608942513e-05, + "loss": 0.010716755688190461, + "step": 201680 + }, + { + "epoch": 28.628814762242726, + "grad_norm": 0.0036209612153470516, + "learning_rate": 7.138296664300924e-05, + "loss": 0.006768248975276947, + "step": 201690 + }, + { + "epoch": 28.630234208658624, + "grad_norm": 0.022230878472328186, + "learning_rate": 7.138154719659333e-05, + 
"loss": 0.0021772798150777816, + "step": 201700 + }, + { + "epoch": 28.631653655074523, + "grad_norm": 4.33188009262085, + "learning_rate": 7.138012775017745e-05, + "loss": 0.01680680811405182, + "step": 201710 + }, + { + "epoch": 28.633073101490417, + "grad_norm": 15.177000045776367, + "learning_rate": 7.137870830376153e-05, + "loss": 0.06424190998077392, + "step": 201720 + }, + { + "epoch": 28.634492547906316, + "grad_norm": 0.42514798045158386, + "learning_rate": 7.137728885734564e-05, + "loss": 0.014556774497032165, + "step": 201730 + }, + { + "epoch": 28.635911994322214, + "grad_norm": 0.4708273708820343, + "learning_rate": 7.137586941092974e-05, + "loss": 0.005245579406619072, + "step": 201740 + }, + { + "epoch": 28.637331440738112, + "grad_norm": 0.060344912111759186, + "learning_rate": 7.137444996451385e-05, + "loss": 0.006160608679056168, + "step": 201750 + }, + { + "epoch": 28.63875088715401, + "grad_norm": 8.428211212158203, + "learning_rate": 7.137303051809795e-05, + "loss": 0.02265956699848175, + "step": 201760 + }, + { + "epoch": 28.64017033356991, + "grad_norm": 2.2153990268707275, + "learning_rate": 7.137161107168204e-05, + "loss": 0.01425069123506546, + "step": 201770 + }, + { + "epoch": 28.641589779985807, + "grad_norm": 0.4592542052268982, + "learning_rate": 7.137019162526615e-05, + "loss": 0.008812656253576278, + "step": 201780 + }, + { + "epoch": 28.643009226401702, + "grad_norm": 9.430974960327148, + "learning_rate": 7.136877217885025e-05, + "loss": 0.01096549779176712, + "step": 201790 + }, + { + "epoch": 28.6444286728176, + "grad_norm": 0.0074507915414869785, + "learning_rate": 7.136735273243436e-05, + "loss": 0.022921228408813478, + "step": 201800 + }, + { + "epoch": 28.6458481192335, + "grad_norm": 2.052137613296509, + "learning_rate": 7.136593328601846e-05, + "loss": 0.012485851347446442, + "step": 201810 + }, + { + "epoch": 28.647267565649397, + "grad_norm": 0.07485529780387878, + "learning_rate": 7.136451383960256e-05, + "loss": 
0.01055217981338501, + "step": 201820 + }, + { + "epoch": 28.648687012065295, + "grad_norm": 0.30901139974594116, + "learning_rate": 7.136309439318665e-05, + "loss": 0.008079621940851212, + "step": 201830 + }, + { + "epoch": 28.650106458481194, + "grad_norm": 4.283102512359619, + "learning_rate": 7.136167494677077e-05, + "loss": 0.005812462046742439, + "step": 201840 + }, + { + "epoch": 28.651525904897092, + "grad_norm": 1.2103368043899536, + "learning_rate": 7.136025550035486e-05, + "loss": 0.006260716915130615, + "step": 201850 + }, + { + "epoch": 28.652945351312987, + "grad_norm": 16.560836791992188, + "learning_rate": 7.135883605393897e-05, + "loss": 0.0348685622215271, + "step": 201860 + }, + { + "epoch": 28.654364797728885, + "grad_norm": 4.2399139404296875, + "learning_rate": 7.135741660752306e-05, + "loss": 0.006447078287601471, + "step": 201870 + }, + { + "epoch": 28.655784244144783, + "grad_norm": 4.215879917144775, + "learning_rate": 7.135599716110717e-05, + "loss": 0.009485115110874177, + "step": 201880 + }, + { + "epoch": 28.65720369056068, + "grad_norm": 0.02776987850666046, + "learning_rate": 7.135457771469128e-05, + "loss": 0.061675029993057254, + "step": 201890 + }, + { + "epoch": 28.65862313697658, + "grad_norm": 0.441116601228714, + "learning_rate": 7.135315826827538e-05, + "loss": 0.01486339420080185, + "step": 201900 + }, + { + "epoch": 28.660042583392478, + "grad_norm": 0.014494401402771473, + "learning_rate": 7.135173882185949e-05, + "loss": 0.019713598489761352, + "step": 201910 + }, + { + "epoch": 28.661462029808376, + "grad_norm": 0.056995145976543427, + "learning_rate": 7.135031937544357e-05, + "loss": 0.004585661366581917, + "step": 201920 + }, + { + "epoch": 28.66288147622427, + "grad_norm": 12.703169822692871, + "learning_rate": 7.134889992902768e-05, + "loss": 0.04791524708271026, + "step": 201930 + }, + { + "epoch": 28.66430092264017, + "grad_norm": 0.3268194794654846, + "learning_rate": 7.134748048261178e-05, + "loss": 
0.025073090195655824, + "step": 201940 + }, + { + "epoch": 28.665720369056068, + "grad_norm": 1.6788805723190308, + "learning_rate": 7.134606103619589e-05, + "loss": 0.029867920279502868, + "step": 201950 + }, + { + "epoch": 28.667139815471966, + "grad_norm": 8.03248119354248, + "learning_rate": 7.134464158977999e-05, + "loss": 0.01902386248111725, + "step": 201960 + }, + { + "epoch": 28.668559261887864, + "grad_norm": 0.27529528737068176, + "learning_rate": 7.13432221433641e-05, + "loss": 0.016209705173969267, + "step": 201970 + }, + { + "epoch": 28.669978708303763, + "grad_norm": 0.014563340693712234, + "learning_rate": 7.13418026969482e-05, + "loss": 0.004392774030566216, + "step": 201980 + }, + { + "epoch": 28.67139815471966, + "grad_norm": 0.23559710383415222, + "learning_rate": 7.13403832505323e-05, + "loss": 0.0448177844285965, + "step": 201990 + }, + { + "epoch": 28.672817601135556, + "grad_norm": 11.354026794433594, + "learning_rate": 7.13389638041164e-05, + "loss": 0.011315601319074631, + "step": 202000 + }, + { + "epoch": 28.672817601135556, + "eval_accuracy": 0.9898899980924525, + "eval_loss": 0.0369279570877552, + "eval_runtime": 31.8775, + "eval_samples_per_second": 493.358, + "eval_steps_per_second": 15.434, + "step": 202000 + }, + { + "epoch": 28.674237047551454, + "grad_norm": 0.778715968132019, + "learning_rate": 7.13375443577005e-05, + "loss": 0.002616829052567482, + "step": 202010 + }, + { + "epoch": 28.675656493967352, + "grad_norm": 2.294604778289795, + "learning_rate": 7.133612491128461e-05, + "loss": 0.003241322562098503, + "step": 202020 + }, + { + "epoch": 28.67707594038325, + "grad_norm": 0.2107202261686325, + "learning_rate": 7.13347054648687e-05, + "loss": 0.004210730269551277, + "step": 202030 + }, + { + "epoch": 28.67849538679915, + "grad_norm": 0.07087211310863495, + "learning_rate": 7.133328601845281e-05, + "loss": 0.0015810679644346238, + "step": 202040 + }, + { + "epoch": 28.679914833215047, + "grad_norm": 2.5907280445098877, + 
"learning_rate": 7.13318665720369e-05, + "loss": 0.008369927108287812, + "step": 202050 + }, + { + "epoch": 28.681334279630946, + "grad_norm": 0.9335490465164185, + "learning_rate": 7.133044712562102e-05, + "loss": 0.008552919328212737, + "step": 202060 + }, + { + "epoch": 28.68275372604684, + "grad_norm": 0.008625762537121773, + "learning_rate": 7.132902767920511e-05, + "loss": 0.009244018793106079, + "step": 202070 + }, + { + "epoch": 28.68417317246274, + "grad_norm": 13.225590705871582, + "learning_rate": 7.132760823278921e-05, + "loss": 0.037289321422576904, + "step": 202080 + }, + { + "epoch": 28.685592618878637, + "grad_norm": 0.07448019832372665, + "learning_rate": 7.132618878637332e-05, + "loss": 0.03514640331268311, + "step": 202090 + }, + { + "epoch": 28.687012065294535, + "grad_norm": 7.817112922668457, + "learning_rate": 7.132476933995742e-05, + "loss": 0.017178672552108764, + "step": 202100 + }, + { + "epoch": 28.688431511710434, + "grad_norm": 18.98403549194336, + "learning_rate": 7.132334989354153e-05, + "loss": 0.03109595477581024, + "step": 202110 + }, + { + "epoch": 28.689850958126332, + "grad_norm": 18.124650955200195, + "learning_rate": 7.132193044712563e-05, + "loss": 0.01609461009502411, + "step": 202120 + }, + { + "epoch": 28.69127040454223, + "grad_norm": 10.608287811279297, + "learning_rate": 7.132051100070972e-05, + "loss": 0.022077572345733643, + "step": 202130 + }, + { + "epoch": 28.692689850958125, + "grad_norm": 0.057866573333740234, + "learning_rate": 7.131909155429382e-05, + "loss": 0.0142441526055336, + "step": 202140 + }, + { + "epoch": 28.694109297374023, + "grad_norm": 0.05763170123100281, + "learning_rate": 7.131767210787793e-05, + "loss": 0.0034998584538698196, + "step": 202150 + }, + { + "epoch": 28.69552874378992, + "grad_norm": 10.73918628692627, + "learning_rate": 7.131625266146203e-05, + "loss": 0.00424346886575222, + "step": 202160 + }, + { + "epoch": 28.69694819020582, + "grad_norm": 0.03236997872591019, + 
"learning_rate": 7.131483321504614e-05, + "loss": 0.0226372629404068, + "step": 202170 + }, + { + "epoch": 28.698367636621718, + "grad_norm": 0.020143333822488785, + "learning_rate": 7.131341376863024e-05, + "loss": 0.0021693214774131777, + "step": 202180 + }, + { + "epoch": 28.699787083037616, + "grad_norm": 0.11743684858083725, + "learning_rate": 7.131199432221434e-05, + "loss": 0.013487254083156586, + "step": 202190 + }, + { + "epoch": 28.701206529453515, + "grad_norm": 9.648536682128906, + "learning_rate": 7.131057487579845e-05, + "loss": 0.023142118752002717, + "step": 202200 + }, + { + "epoch": 28.70262597586941, + "grad_norm": 0.106868214905262, + "learning_rate": 7.130915542938254e-05, + "loss": 0.060021281242370605, + "step": 202210 + }, + { + "epoch": 28.704045422285308, + "grad_norm": 0.2116779088973999, + "learning_rate": 7.130773598296666e-05, + "loss": 0.03513970673084259, + "step": 202220 + }, + { + "epoch": 28.705464868701206, + "grad_norm": 2.68109130859375, + "learning_rate": 7.130631653655074e-05, + "loss": 0.002965981513261795, + "step": 202230 + }, + { + "epoch": 28.706884315117104, + "grad_norm": 0.9324289560317993, + "learning_rate": 7.130489709013485e-05, + "loss": 0.048745962977409366, + "step": 202240 + }, + { + "epoch": 28.708303761533003, + "grad_norm": 0.38824379444122314, + "learning_rate": 7.130347764371895e-05, + "loss": 0.017177388072013855, + "step": 202250 + }, + { + "epoch": 28.7097232079489, + "grad_norm": 0.019488513469696045, + "learning_rate": 7.130205819730306e-05, + "loss": 0.01657674163579941, + "step": 202260 + }, + { + "epoch": 28.7111426543648, + "grad_norm": 0.02164277993142605, + "learning_rate": 7.130063875088716e-05, + "loss": 0.0036540981382131577, + "step": 202270 + }, + { + "epoch": 28.712562100780694, + "grad_norm": 1.0430368185043335, + "learning_rate": 7.129921930447125e-05, + "loss": 0.04119579493999481, + "step": 202280 + }, + { + "epoch": 28.713981547196592, + "grad_norm": 0.20720230042934418, + 
"learning_rate": 7.129779985805536e-05, + "loss": 0.025897923111915588, + "step": 202290 + }, + { + "epoch": 28.71540099361249, + "grad_norm": 0.29412785172462463, + "learning_rate": 7.129638041163946e-05, + "loss": 0.010587500035762787, + "step": 202300 + }, + { + "epoch": 28.71682044002839, + "grad_norm": 0.32188525795936584, + "learning_rate": 7.129496096522357e-05, + "loss": 0.002680741250514984, + "step": 202310 + }, + { + "epoch": 28.718239886444287, + "grad_norm": 0.0546279177069664, + "learning_rate": 7.129354151880767e-05, + "loss": 0.026097610592842102, + "step": 202320 + }, + { + "epoch": 28.719659332860186, + "grad_norm": 0.15916693210601807, + "learning_rate": 7.129212207239178e-05, + "loss": 0.008837858587503434, + "step": 202330 + }, + { + "epoch": 28.721078779276084, + "grad_norm": 0.10597345978021622, + "learning_rate": 7.129070262597586e-05, + "loss": 0.008176721632480621, + "step": 202340 + }, + { + "epoch": 28.72249822569198, + "grad_norm": 0.07630429416894913, + "learning_rate": 7.128928317955998e-05, + "loss": 0.0031533293426036836, + "step": 202350 + }, + { + "epoch": 28.723917672107877, + "grad_norm": 0.0434570275247097, + "learning_rate": 7.128786373314407e-05, + "loss": 0.015704724192619323, + "step": 202360 + }, + { + "epoch": 28.725337118523775, + "grad_norm": 1.580925464630127, + "learning_rate": 7.128644428672818e-05, + "loss": 0.030556422472000123, + "step": 202370 + }, + { + "epoch": 28.726756564939674, + "grad_norm": 0.2505705952644348, + "learning_rate": 7.128502484031228e-05, + "loss": 0.002517157047986984, + "step": 202380 + }, + { + "epoch": 28.728176011355572, + "grad_norm": 0.23514892160892487, + "learning_rate": 7.128360539389638e-05, + "loss": 0.0209683895111084, + "step": 202390 + }, + { + "epoch": 28.72959545777147, + "grad_norm": 12.20423698425293, + "learning_rate": 7.128218594748049e-05, + "loss": 0.018271414935588835, + "step": 202400 + }, + { + "epoch": 28.73101490418737, + "grad_norm": 0.46912723779678345, + 
"learning_rate": 7.128076650106459e-05, + "loss": 0.004479492455720902, + "step": 202410 + }, + { + "epoch": 28.732434350603263, + "grad_norm": 12.878448486328125, + "learning_rate": 7.12793470546487e-05, + "loss": 0.016088175773620605, + "step": 202420 + }, + { + "epoch": 28.73385379701916, + "grad_norm": 0.6154642105102539, + "learning_rate": 7.12779276082328e-05, + "loss": 0.03649974465370178, + "step": 202430 + }, + { + "epoch": 28.73527324343506, + "grad_norm": 0.14902591705322266, + "learning_rate": 7.127650816181689e-05, + "loss": 0.011256423592567445, + "step": 202440 + }, + { + "epoch": 28.73669268985096, + "grad_norm": 0.18846531212329865, + "learning_rate": 7.127508871540099e-05, + "loss": 0.018817995488643647, + "step": 202450 + }, + { + "epoch": 28.738112136266857, + "grad_norm": 2.739961624145508, + "learning_rate": 7.12736692689851e-05, + "loss": 0.03426278829574585, + "step": 202460 + }, + { + "epoch": 28.739531582682755, + "grad_norm": 0.2478046715259552, + "learning_rate": 7.12722498225692e-05, + "loss": 0.02281680405139923, + "step": 202470 + }, + { + "epoch": 28.740951029098653, + "grad_norm": 0.023704340681433678, + "learning_rate": 7.127083037615331e-05, + "loss": 0.005149252712726593, + "step": 202480 + }, + { + "epoch": 28.742370475514548, + "grad_norm": 0.01790465973317623, + "learning_rate": 7.12694109297374e-05, + "loss": 0.015261209011077881, + "step": 202490 + }, + { + "epoch": 28.743789921930446, + "grad_norm": 0.8043876886367798, + "learning_rate": 7.12679914833215e-05, + "loss": 0.03828992247581482, + "step": 202500 + }, + { + "epoch": 28.743789921930446, + "eval_accuracy": 0.988999809245247, + "eval_loss": 0.04034363850951195, + "eval_runtime": 31.8383, + "eval_samples_per_second": 493.965, + "eval_steps_per_second": 15.453, + "step": 202500 + }, + { + "epoch": 28.745209368346345, + "grad_norm": 0.4067186713218689, + "learning_rate": 7.126657203690561e-05, + "loss": 0.012552118301391602, + "step": 202510 + }, + { + "epoch": 
28.746628814762243, + "grad_norm": 0.010445885360240936, + "learning_rate": 7.126515259048971e-05, + "loss": 0.001368100568652153, + "step": 202520 + }, + { + "epoch": 28.74804826117814, + "grad_norm": 0.07868316024541855, + "learning_rate": 7.126373314407382e-05, + "loss": 0.05089434385299683, + "step": 202530 + }, + { + "epoch": 28.74946770759404, + "grad_norm": 2.0863685607910156, + "learning_rate": 7.126245564229951e-05, + "loss": 0.011597948521375656, + "step": 202540 + }, + { + "epoch": 28.750887154009938, + "grad_norm": 0.16860847175121307, + "learning_rate": 7.126103619588362e-05, + "loss": 0.05231243371963501, + "step": 202550 + }, + { + "epoch": 28.752306600425833, + "grad_norm": 0.0034578396007418633, + "learning_rate": 7.12596167494677e-05, + "loss": 0.015253201127052307, + "step": 202560 + }, + { + "epoch": 28.75372604684173, + "grad_norm": 2.4222702980041504, + "learning_rate": 7.125819730305181e-05, + "loss": 0.013857361674308778, + "step": 202570 + }, + { + "epoch": 28.75514549325763, + "grad_norm": 2.776827335357666, + "learning_rate": 7.125677785663591e-05, + "loss": 0.006524787098169327, + "step": 202580 + }, + { + "epoch": 28.756564939673527, + "grad_norm": 5.432077407836914, + "learning_rate": 7.125535841022002e-05, + "loss": 0.008483321964740753, + "step": 202590 + }, + { + "epoch": 28.757984386089426, + "grad_norm": 0.18716326355934143, + "learning_rate": 7.125393896380412e-05, + "loss": 0.004110977053642273, + "step": 202600 + }, + { + "epoch": 28.759403832505324, + "grad_norm": 0.017783429473638535, + "learning_rate": 7.125251951738822e-05, + "loss": 0.031185391545295715, + "step": 202610 + }, + { + "epoch": 28.760823278921222, + "grad_norm": 0.1760314553976059, + "learning_rate": 7.125110007097231e-05, + "loss": 0.0028496194630861283, + "step": 202620 + }, + { + "epoch": 28.762242725337117, + "grad_norm": 0.31703877449035645, + "learning_rate": 7.124968062455643e-05, + "loss": 0.0017588146030902863, + "step": 202630 + }, + { + "epoch": 
28.763662171753015, + "grad_norm": 0.0892600268125534, + "learning_rate": 7.124826117814054e-05, + "loss": 0.004255275428295136, + "step": 202640 + }, + { + "epoch": 28.765081618168914, + "grad_norm": 0.030280420556664467, + "learning_rate": 7.124684173172463e-05, + "loss": 0.0009103197604417801, + "step": 202650 + }, + { + "epoch": 28.766501064584812, + "grad_norm": 1.3672741651535034, + "learning_rate": 7.124542228530874e-05, + "loss": 0.007998555898666382, + "step": 202660 + }, + { + "epoch": 28.76792051100071, + "grad_norm": 1.3897072076797485, + "learning_rate": 7.124400283889283e-05, + "loss": 0.003999896720051766, + "step": 202670 + }, + { + "epoch": 28.76933995741661, + "grad_norm": 0.16279710829257965, + "learning_rate": 7.124258339247694e-05, + "loss": 0.011800149083137512, + "step": 202680 + }, + { + "epoch": 28.770759403832507, + "grad_norm": 0.4213109314441681, + "learning_rate": 7.124116394606104e-05, + "loss": 0.006886686384677887, + "step": 202690 + }, + { + "epoch": 28.7721788502484, + "grad_norm": 0.01768977753818035, + "learning_rate": 7.123974449964515e-05, + "loss": 0.01238137185573578, + "step": 202700 + }, + { + "epoch": 28.7735982966643, + "grad_norm": 2.7302703857421875, + "learning_rate": 7.123832505322924e-05, + "loss": 0.03521769344806671, + "step": 202710 + }, + { + "epoch": 28.7750177430802, + "grad_norm": 0.28604769706726074, + "learning_rate": 7.123690560681334e-05, + "loss": 0.018197962641716005, + "step": 202720 + }, + { + "epoch": 28.776437189496097, + "grad_norm": 0.0497569777071476, + "learning_rate": 7.123548616039745e-05, + "loss": 0.0027666930109262466, + "step": 202730 + }, + { + "epoch": 28.777856635911995, + "grad_norm": 0.021718090400099754, + "learning_rate": 7.123406671398155e-05, + "loss": 0.004326367750763893, + "step": 202740 + }, + { + "epoch": 28.779276082327893, + "grad_norm": 0.01891990192234516, + "learning_rate": 7.123264726756566e-05, + "loss": 0.006567257642745972, + "step": 202750 + }, + { + "epoch": 
28.78069552874379, + "grad_norm": 0.6301097273826599, + "learning_rate": 7.123122782114976e-05, + "loss": 0.01024438589811325, + "step": 202760 + }, + { + "epoch": 28.782114975159686, + "grad_norm": 0.037447765469551086, + "learning_rate": 7.122980837473386e-05, + "loss": 0.02538682222366333, + "step": 202770 + }, + { + "epoch": 28.783534421575585, + "grad_norm": 0.0027806598227471113, + "learning_rate": 7.122838892831795e-05, + "loss": 0.01094139739871025, + "step": 202780 + }, + { + "epoch": 28.784953867991483, + "grad_norm": 0.028766172006726265, + "learning_rate": 7.122696948190206e-05, + "loss": 0.028862008452415468, + "step": 202790 + }, + { + "epoch": 28.78637331440738, + "grad_norm": 0.13067938387393951, + "learning_rate": 7.122555003548616e-05, + "loss": 0.013009685277938842, + "step": 202800 + }, + { + "epoch": 28.78779276082328, + "grad_norm": 0.26093804836273193, + "learning_rate": 7.122413058907027e-05, + "loss": 0.005754271894693375, + "step": 202810 + }, + { + "epoch": 28.789212207239178, + "grad_norm": 0.005758289713412523, + "learning_rate": 7.122271114265437e-05, + "loss": 0.00479193776845932, + "step": 202820 + }, + { + "epoch": 28.790631653655076, + "grad_norm": 0.3155323266983032, + "learning_rate": 7.122129169623847e-05, + "loss": 0.0025750327855348586, + "step": 202830 + }, + { + "epoch": 28.79205110007097, + "grad_norm": 0.3970525562763214, + "learning_rate": 7.121987224982258e-05, + "loss": 0.003557312861084938, + "step": 202840 + }, + { + "epoch": 28.79347054648687, + "grad_norm": 0.036359190940856934, + "learning_rate": 7.121845280340668e-05, + "loss": 0.01635591685771942, + "step": 202850 + }, + { + "epoch": 28.794889992902768, + "grad_norm": 5.662169456481934, + "learning_rate": 7.121703335699079e-05, + "loss": 0.011308480054140091, + "step": 202860 + }, + { + "epoch": 28.796309439318666, + "grad_norm": 0.6503276824951172, + "learning_rate": 7.121561391057487e-05, + "loss": 0.009040129184722901, + "step": 202870 + }, + { + "epoch": 
28.797728885734564, + "grad_norm": 0.01877494715154171, + "learning_rate": 7.121419446415898e-05, + "loss": 0.011672577261924744, + "step": 202880 + }, + { + "epoch": 28.799148332150462, + "grad_norm": 21.004018783569336, + "learning_rate": 7.121277501774308e-05, + "loss": 0.04437762498855591, + "step": 202890 + }, + { + "epoch": 28.80056777856636, + "grad_norm": 0.19134217500686646, + "learning_rate": 7.121135557132719e-05, + "loss": 0.012965710461139679, + "step": 202900 + }, + { + "epoch": 28.801987224982255, + "grad_norm": 0.4114138185977936, + "learning_rate": 7.120993612491129e-05, + "loss": 0.04741590321063995, + "step": 202910 + }, + { + "epoch": 28.803406671398154, + "grad_norm": 7.3049516677856445, + "learning_rate": 7.120851667849538e-05, + "loss": 0.042461469769477844, + "step": 202920 + }, + { + "epoch": 28.804826117814052, + "grad_norm": 0.003979488741606474, + "learning_rate": 7.12070972320795e-05, + "loss": 0.015451483428478241, + "step": 202930 + }, + { + "epoch": 28.80624556422995, + "grad_norm": 0.2440037727355957, + "learning_rate": 7.120567778566359e-05, + "loss": 0.03024878203868866, + "step": 202940 + }, + { + "epoch": 28.80766501064585, + "grad_norm": 0.2930488586425781, + "learning_rate": 7.12042583392477e-05, + "loss": 0.001177312433719635, + "step": 202950 + }, + { + "epoch": 28.809084457061747, + "grad_norm": 5.873632907867432, + "learning_rate": 7.12028388928318e-05, + "loss": 0.033549338579177856, + "step": 202960 + }, + { + "epoch": 28.810503903477645, + "grad_norm": 9.886422157287598, + "learning_rate": 7.12014194464159e-05, + "loss": 0.04775959849357605, + "step": 202970 + }, + { + "epoch": 28.81192334989354, + "grad_norm": 0.017815109342336655, + "learning_rate": 7.12e-05, + "loss": 0.0017503224313259124, + "step": 202980 + }, + { + "epoch": 28.81334279630944, + "grad_norm": 0.051639020442962646, + "learning_rate": 7.119858055358411e-05, + "loss": 0.0009091340005397796, + "step": 202990 + }, + { + "epoch": 28.814762242725337, + 
"grad_norm": 0.10275249183177948, + "learning_rate": 7.11971611071682e-05, + "loss": 0.005060353130102157, + "step": 203000 + }, + { + "epoch": 28.814762242725337, + "eval_accuracy": 0.9902079226807401, + "eval_loss": 0.03589695692062378, + "eval_runtime": 31.6609, + "eval_samples_per_second": 496.733, + "eval_steps_per_second": 15.54, + "step": 203000 + }, + { + "epoch": 28.816181689141235, + "grad_norm": 6.375905990600586, + "learning_rate": 7.119574166075232e-05, + "loss": 0.012361995130777358, + "step": 203010 + }, + { + "epoch": 28.817601135557133, + "grad_norm": 0.8486672639846802, + "learning_rate": 7.119432221433641e-05, + "loss": 0.008955194056034088, + "step": 203020 + }, + { + "epoch": 28.81902058197303, + "grad_norm": 1.8105239868164062, + "learning_rate": 7.119290276792051e-05, + "loss": 0.03748617470264435, + "step": 203030 + }, + { + "epoch": 28.82044002838893, + "grad_norm": 0.03347370773553848, + "learning_rate": 7.119148332150462e-05, + "loss": 0.0375461757183075, + "step": 203040 + }, + { + "epoch": 28.821859474804825, + "grad_norm": 0.2949555516242981, + "learning_rate": 7.119006387508872e-05, + "loss": 0.019121986627578736, + "step": 203050 + }, + { + "epoch": 28.823278921220723, + "grad_norm": 0.752108097076416, + "learning_rate": 7.118864442867283e-05, + "loss": 0.05761125087738037, + "step": 203060 + }, + { + "epoch": 28.82469836763662, + "grad_norm": 0.13393545150756836, + "learning_rate": 7.118722498225693e-05, + "loss": 0.0058804254978895186, + "step": 203070 + }, + { + "epoch": 28.82611781405252, + "grad_norm": 0.058786869049072266, + "learning_rate": 7.118580553584102e-05, + "loss": 0.0017414472997188569, + "step": 203080 + }, + { + "epoch": 28.827537260468418, + "grad_norm": 0.1039448231458664, + "learning_rate": 7.118438608942512e-05, + "loss": 0.011464646458625794, + "step": 203090 + }, + { + "epoch": 28.828956706884316, + "grad_norm": 0.048438090831041336, + "learning_rate": 7.118296664300923e-05, + "loss": 0.004000085592269898, + 
"step": 203100 + }, + { + "epoch": 28.830376153300215, + "grad_norm": 12.092706680297852, + "learning_rate": 7.118154719659333e-05, + "loss": 0.01029500812292099, + "step": 203110 + }, + { + "epoch": 28.83179559971611, + "grad_norm": 3.242499589920044, + "learning_rate": 7.118012775017744e-05, + "loss": 0.004942065104842186, + "step": 203120 + }, + { + "epoch": 28.833215046132008, + "grad_norm": 9.638948440551758, + "learning_rate": 7.117870830376154e-05, + "loss": 0.01819678544998169, + "step": 203130 + }, + { + "epoch": 28.834634492547906, + "grad_norm": 5.077639102935791, + "learning_rate": 7.117728885734564e-05, + "loss": 0.03603287935256958, + "step": 203140 + }, + { + "epoch": 28.836053938963804, + "grad_norm": 10.142051696777344, + "learning_rate": 7.117586941092975e-05, + "loss": 0.01690348833799362, + "step": 203150 + }, + { + "epoch": 28.837473385379703, + "grad_norm": 0.016948331147432327, + "learning_rate": 7.117444996451384e-05, + "loss": 0.05102584958076477, + "step": 203160 + }, + { + "epoch": 28.8388928317956, + "grad_norm": 1.4875142574310303, + "learning_rate": 7.117303051809795e-05, + "loss": 0.005738548561930656, + "step": 203170 + }, + { + "epoch": 28.8403122782115, + "grad_norm": 1.8814880847930908, + "learning_rate": 7.117161107168204e-05, + "loss": 0.0019484378397464752, + "step": 203180 + }, + { + "epoch": 28.841731724627394, + "grad_norm": 0.17972427606582642, + "learning_rate": 7.117019162526615e-05, + "loss": 0.04090189337730408, + "step": 203190 + }, + { + "epoch": 28.843151171043292, + "grad_norm": 0.03174563869833946, + "learning_rate": 7.116877217885025e-05, + "loss": 0.013512611389160156, + "step": 203200 + }, + { + "epoch": 28.84457061745919, + "grad_norm": 4.256505489349365, + "learning_rate": 7.116735273243436e-05, + "loss": 0.053637909889221194, + "step": 203210 + }, + { + "epoch": 28.84599006387509, + "grad_norm": 3.8101108074188232, + "learning_rate": 7.116593328601845e-05, + "loss": 0.02142321616411209, + "step": 203220 + }, 
+ { + "epoch": 28.847409510290987, + "grad_norm": 0.35324493050575256, + "learning_rate": 7.116451383960255e-05, + "loss": 0.004821119457483291, + "step": 203230 + }, + { + "epoch": 28.848828956706885, + "grad_norm": 0.02553342841565609, + "learning_rate": 7.116309439318666e-05, + "loss": 0.015905463695526124, + "step": 203240 + }, + { + "epoch": 28.850248403122784, + "grad_norm": 0.5993332266807556, + "learning_rate": 7.116167494677076e-05, + "loss": 0.0037181202322244646, + "step": 203250 + }, + { + "epoch": 28.85166784953868, + "grad_norm": 0.007250367198139429, + "learning_rate": 7.116025550035487e-05, + "loss": 0.022462160885334016, + "step": 203260 + }, + { + "epoch": 28.853087295954577, + "grad_norm": 0.0966406837105751, + "learning_rate": 7.115883605393897e-05, + "loss": 0.04971932172775269, + "step": 203270 + }, + { + "epoch": 28.854506742370475, + "grad_norm": 0.05555707588791847, + "learning_rate": 7.115741660752307e-05, + "loss": 0.0021825633943080903, + "step": 203280 + }, + { + "epoch": 28.855926188786373, + "grad_norm": 0.6990764737129211, + "learning_rate": 7.115599716110716e-05, + "loss": 0.00954635813832283, + "step": 203290 + }, + { + "epoch": 28.85734563520227, + "grad_norm": 0.056228529661893845, + "learning_rate": 7.115457771469127e-05, + "loss": 0.02022438943386078, + "step": 203300 + }, + { + "epoch": 28.85876508161817, + "grad_norm": 3.569765567779541, + "learning_rate": 7.115315826827537e-05, + "loss": 0.012749548256397247, + "step": 203310 + }, + { + "epoch": 28.86018452803407, + "grad_norm": 0.008093277923762798, + "learning_rate": 7.115173882185948e-05, + "loss": 0.013204763829708099, + "step": 203320 + }, + { + "epoch": 28.861603974449963, + "grad_norm": 10.956599235534668, + "learning_rate": 7.115031937544358e-05, + "loss": 0.03327171802520752, + "step": 203330 + }, + { + "epoch": 28.86302342086586, + "grad_norm": 7.087300777435303, + "learning_rate": 7.114889992902768e-05, + "loss": 0.004003441333770752, + "step": 203340 + }, + { + 
"epoch": 28.86444286728176, + "grad_norm": 0.6270055770874023, + "learning_rate": 7.114748048261179e-05, + "loss": 0.011266354471445084, + "step": 203350 + }, + { + "epoch": 28.865862313697658, + "grad_norm": 9.235366821289062, + "learning_rate": 7.114606103619589e-05, + "loss": 0.007889777421951294, + "step": 203360 + }, + { + "epoch": 28.867281760113556, + "grad_norm": 0.016537608578801155, + "learning_rate": 7.114464158978e-05, + "loss": 0.022802813351154326, + "step": 203370 + }, + { + "epoch": 28.868701206529455, + "grad_norm": 2.114942789077759, + "learning_rate": 7.11432221433641e-05, + "loss": 0.006257279217243195, + "step": 203380 + }, + { + "epoch": 28.870120652945353, + "grad_norm": 0.015868322923779488, + "learning_rate": 7.114180269694819e-05, + "loss": 0.030576300621032716, + "step": 203390 + }, + { + "epoch": 28.871540099361248, + "grad_norm": 0.0796002522110939, + "learning_rate": 7.114038325053229e-05, + "loss": 0.016757123172283173, + "step": 203400 + }, + { + "epoch": 28.872959545777146, + "grad_norm": 0.6198431253433228, + "learning_rate": 7.11389638041164e-05, + "loss": 0.0027645982801914213, + "step": 203410 + }, + { + "epoch": 28.874378992193044, + "grad_norm": 0.07583162188529968, + "learning_rate": 7.11375443577005e-05, + "loss": 0.029992347955703734, + "step": 203420 + }, + { + "epoch": 28.875798438608943, + "grad_norm": 12.846670150756836, + "learning_rate": 7.113612491128461e-05, + "loss": 0.015633615851402282, + "step": 203430 + }, + { + "epoch": 28.87721788502484, + "grad_norm": 18.967647552490234, + "learning_rate": 7.11347054648687e-05, + "loss": 0.024274301528930665, + "step": 203440 + }, + { + "epoch": 28.87863733144074, + "grad_norm": 0.017806854099035263, + "learning_rate": 7.11332860184528e-05, + "loss": 0.003165776655077934, + "step": 203450 + }, + { + "epoch": 28.880056777856637, + "grad_norm": 0.06402713060379028, + "learning_rate": 7.113186657203691e-05, + "loss": 0.008560384809970855, + "step": 203460 + }, + { + "epoch": 
28.881476224272532, + "grad_norm": 0.07992161065340042, + "learning_rate": 7.113044712562101e-05, + "loss": 0.017873361706733704, + "step": 203470 + }, + { + "epoch": 28.88289567068843, + "grad_norm": 6.097292900085449, + "learning_rate": 7.112902767920512e-05, + "loss": 0.007194198668003082, + "step": 203480 + }, + { + "epoch": 28.88431511710433, + "grad_norm": 0.5403692126274109, + "learning_rate": 7.11276082327892e-05, + "loss": 0.005505822598934174, + "step": 203490 + }, + { + "epoch": 28.885734563520227, + "grad_norm": 0.07727505266666412, + "learning_rate": 7.112618878637332e-05, + "loss": 0.03034776747226715, + "step": 203500 + }, + { + "epoch": 28.885734563520227, + "eval_accuracy": 0.9822598079735487, + "eval_loss": 0.07463093847036362, + "eval_runtime": 32.0524, + "eval_samples_per_second": 490.666, + "eval_steps_per_second": 15.35, + "step": 203500 + }, + { + "epoch": 28.887154009936125, + "grad_norm": 0.11138415336608887, + "learning_rate": 7.112476933995741e-05, + "loss": 0.00584210678935051, + "step": 203510 + }, + { + "epoch": 28.888573456352024, + "grad_norm": 0.12948386371135712, + "learning_rate": 7.112334989354153e-05, + "loss": 0.00881311222910881, + "step": 203520 + }, + { + "epoch": 28.889992902767922, + "grad_norm": 1.5104658603668213, + "learning_rate": 7.112193044712562e-05, + "loss": 0.004806894809007645, + "step": 203530 + }, + { + "epoch": 28.891412349183817, + "grad_norm": 0.20965075492858887, + "learning_rate": 7.112051100070972e-05, + "loss": 0.007732678949832916, + "step": 203540 + }, + { + "epoch": 28.892831795599715, + "grad_norm": 0.025547917932271957, + "learning_rate": 7.111909155429383e-05, + "loss": 0.005097228661179542, + "step": 203550 + }, + { + "epoch": 28.894251242015613, + "grad_norm": 5.040984630584717, + "learning_rate": 7.111767210787793e-05, + "loss": 0.04183858036994934, + "step": 203560 + }, + { + "epoch": 28.89567068843151, + "grad_norm": 0.22921733558177948, + "learning_rate": 7.111625266146204e-05, + "loss": 
0.01963055282831192, + "step": 203570 + }, + { + "epoch": 28.89709013484741, + "grad_norm": 0.012625638395547867, + "learning_rate": 7.111483321504614e-05, + "loss": 0.0029355812817811965, + "step": 203580 + }, + { + "epoch": 28.89850958126331, + "grad_norm": 0.9004891514778137, + "learning_rate": 7.111341376863023e-05, + "loss": 0.0098853200674057, + "step": 203590 + }, + { + "epoch": 28.899929027679207, + "grad_norm": 0.564095139503479, + "learning_rate": 7.111199432221433e-05, + "loss": 0.011549536138772964, + "step": 203600 + }, + { + "epoch": 28.9013484740951, + "grad_norm": 0.13215500116348267, + "learning_rate": 7.111057487579844e-05, + "loss": 0.009898069500923156, + "step": 203610 + }, + { + "epoch": 28.902767920511, + "grad_norm": 0.036609210073947906, + "learning_rate": 7.110915542938254e-05, + "loss": 0.0012260224670171739, + "step": 203620 + }, + { + "epoch": 28.904187366926898, + "grad_norm": 0.43108728528022766, + "learning_rate": 7.110773598296665e-05, + "loss": 0.008975581079721451, + "step": 203630 + }, + { + "epoch": 28.905606813342796, + "grad_norm": 0.013503863476216793, + "learning_rate": 7.110631653655075e-05, + "loss": 0.005596206709742546, + "step": 203640 + }, + { + "epoch": 28.907026259758695, + "grad_norm": 0.01203615590929985, + "learning_rate": 7.110489709013485e-05, + "loss": 0.009315300732851028, + "step": 203650 + }, + { + "epoch": 28.908445706174593, + "grad_norm": 0.0009439372806809843, + "learning_rate": 7.110347764371896e-05, + "loss": 0.0016260620206594468, + "step": 203660 + }, + { + "epoch": 28.90986515259049, + "grad_norm": 8.978412628173828, + "learning_rate": 7.110205819730305e-05, + "loss": 0.01562500447034836, + "step": 203670 + }, + { + "epoch": 28.911284599006386, + "grad_norm": 0.5893821120262146, + "learning_rate": 7.110063875088716e-05, + "loss": 0.00616813413798809, + "step": 203680 + }, + { + "epoch": 28.912704045422284, + "grad_norm": 0.2560266852378845, + "learning_rate": 7.109921930447126e-05, + "loss": 
0.011034181714057923, + "step": 203690 + }, + { + "epoch": 28.914123491838183, + "grad_norm": 0.1857273280620575, + "learning_rate": 7.109779985805536e-05, + "loss": 0.008844484388828278, + "step": 203700 + }, + { + "epoch": 28.91554293825408, + "grad_norm": 1.3393546342849731, + "learning_rate": 7.109638041163946e-05, + "loss": 0.0015324067324399948, + "step": 203710 + }, + { + "epoch": 28.91696238466998, + "grad_norm": 0.02755443938076496, + "learning_rate": 7.109496096522357e-05, + "loss": 0.008205199241638183, + "step": 203720 + }, + { + "epoch": 28.918381831085878, + "grad_norm": 12.777338027954102, + "learning_rate": 7.109354151880767e-05, + "loss": 0.012330913543701172, + "step": 203730 + }, + { + "epoch": 28.919801277501776, + "grad_norm": 0.015572084113955498, + "learning_rate": 7.109212207239178e-05, + "loss": 0.015875250101089478, + "step": 203740 + }, + { + "epoch": 28.92122072391767, + "grad_norm": 0.11835578829050064, + "learning_rate": 7.109070262597587e-05, + "loss": 0.03060312271118164, + "step": 203750 + }, + { + "epoch": 28.92264017033357, + "grad_norm": 0.00970064289867878, + "learning_rate": 7.108928317955997e-05, + "loss": 0.060244697332382205, + "step": 203760 + }, + { + "epoch": 28.924059616749467, + "grad_norm": 0.13579252362251282, + "learning_rate": 7.108786373314408e-05, + "loss": 0.008389712125062943, + "step": 203770 + }, + { + "epoch": 28.925479063165366, + "grad_norm": 1.954798698425293, + "learning_rate": 7.108644428672818e-05, + "loss": 0.021739019453525542, + "step": 203780 + }, + { + "epoch": 28.926898509581264, + "grad_norm": 0.0046302578411996365, + "learning_rate": 7.108502484031229e-05, + "loss": 0.0037008166313171388, + "step": 203790 + }, + { + "epoch": 28.928317955997162, + "grad_norm": 0.08097607642412186, + "learning_rate": 7.108360539389637e-05, + "loss": 0.0019368495792150497, + "step": 203800 + }, + { + "epoch": 28.92973740241306, + "grad_norm": 1.2470359802246094, + "learning_rate": 7.108218594748048e-05, + "loss": 
0.011286996304988861, + "step": 203810 + }, + { + "epoch": 28.931156848828955, + "grad_norm": 0.8522784113883972, + "learning_rate": 7.108076650106458e-05, + "loss": 0.004165459796786309, + "step": 203820 + }, + { + "epoch": 28.932576295244854, + "grad_norm": 4.433992862701416, + "learning_rate": 7.107934705464869e-05, + "loss": 0.032360440492630003, + "step": 203830 + }, + { + "epoch": 28.933995741660752, + "grad_norm": 0.08903558552265167, + "learning_rate": 7.10779276082328e-05, + "loss": 0.018746085464954376, + "step": 203840 + }, + { + "epoch": 28.93541518807665, + "grad_norm": 10.004819869995117, + "learning_rate": 7.107650816181689e-05, + "loss": 0.01810932457447052, + "step": 203850 + }, + { + "epoch": 28.93683463449255, + "grad_norm": 0.29350271821022034, + "learning_rate": 7.1075088715401e-05, + "loss": 0.020113852620124818, + "step": 203860 + }, + { + "epoch": 28.938254080908447, + "grad_norm": 5.892873287200928, + "learning_rate": 7.10736692689851e-05, + "loss": 0.012825003266334534, + "step": 203870 + }, + { + "epoch": 28.939673527324345, + "grad_norm": 0.47506770491600037, + "learning_rate": 7.107224982256921e-05, + "loss": 0.0013097725808620454, + "step": 203880 + }, + { + "epoch": 28.94109297374024, + "grad_norm": 0.1946927011013031, + "learning_rate": 7.10708303761533e-05, + "loss": 0.01863977462053299, + "step": 203890 + }, + { + "epoch": 28.942512420156138, + "grad_norm": 9.474695205688477, + "learning_rate": 7.10694109297374e-05, + "loss": 0.04268102943897247, + "step": 203900 + }, + { + "epoch": 28.943931866572036, + "grad_norm": 20.769567489624023, + "learning_rate": 7.10679914833215e-05, + "loss": 0.06048552393913269, + "step": 203910 + }, + { + "epoch": 28.945351312987935, + "grad_norm": 12.767876625061035, + "learning_rate": 7.106657203690561e-05, + "loss": 0.03310614824295044, + "step": 203920 + }, + { + "epoch": 28.946770759403833, + "grad_norm": 0.08730275928974152, + "learning_rate": 7.106515259048972e-05, + "loss": 
0.010783741623163224, + "step": 203930 + }, + { + "epoch": 28.94819020581973, + "grad_norm": 1.706111192703247, + "learning_rate": 7.106373314407382e-05, + "loss": 0.0135955810546875, + "step": 203940 + }, + { + "epoch": 28.94960965223563, + "grad_norm": 0.03335109353065491, + "learning_rate": 7.106231369765792e-05, + "loss": 0.02310059517621994, + "step": 203950 + }, + { + "epoch": 28.951029098651524, + "grad_norm": 2.077681541442871, + "learning_rate": 7.106089425124201e-05, + "loss": 0.010161271691322327, + "step": 203960 + }, + { + "epoch": 28.952448545067423, + "grad_norm": 18.724382400512695, + "learning_rate": 7.105947480482612e-05, + "loss": 0.021189820766448975, + "step": 203970 + }, + { + "epoch": 28.95386799148332, + "grad_norm": 0.0031039798632264137, + "learning_rate": 7.105805535841022e-05, + "loss": 0.003978554159402847, + "step": 203980 + }, + { + "epoch": 28.95528743789922, + "grad_norm": 7.489923000335693, + "learning_rate": 7.105663591199433e-05, + "loss": 0.04710999131202698, + "step": 203990 + }, + { + "epoch": 28.956706884315118, + "grad_norm": 0.5731682181358337, + "learning_rate": 7.105521646557842e-05, + "loss": 0.045627936720848083, + "step": 204000 + }, + { + "epoch": 28.956706884315118, + "eval_accuracy": 0.9872830164684937, + "eval_loss": 0.04365848749876022, + "eval_runtime": 33.6385, + "eval_samples_per_second": 467.53, + "eval_steps_per_second": 14.626, + "step": 204000 + }, + { + "epoch": 28.958126330731016, + "grad_norm": 1.7862430810928345, + "learning_rate": 7.105379701916253e-05, + "loss": 0.014305508136749268, + "step": 204010 + }, + { + "epoch": 28.959545777146914, + "grad_norm": 0.503933310508728, + "learning_rate": 7.105237757274662e-05, + "loss": 0.016063228249549866, + "step": 204020 + }, + { + "epoch": 28.96096522356281, + "grad_norm": 27.479801177978516, + "learning_rate": 7.105095812633074e-05, + "loss": 0.07539821863174438, + "step": 204030 + }, + { + "epoch": 28.962384669978707, + "grad_norm": 0.08304733037948608, + 
"learning_rate": 7.104953867991485e-05, + "loss": 0.005125253275036812, + "step": 204040 + }, + { + "epoch": 28.963804116394606, + "grad_norm": 0.11767375469207764, + "learning_rate": 7.104811923349894e-05, + "loss": 0.002219578996300697, + "step": 204050 + }, + { + "epoch": 28.965223562810504, + "grad_norm": 0.01600889302790165, + "learning_rate": 7.104669978708304e-05, + "loss": 0.009784726798534394, + "step": 204060 + }, + { + "epoch": 28.966643009226402, + "grad_norm": 7.057340621948242, + "learning_rate": 7.104528034066714e-05, + "loss": 0.004014408215880394, + "step": 204070 + }, + { + "epoch": 28.9680624556423, + "grad_norm": 0.18340358138084412, + "learning_rate": 7.104386089425125e-05, + "loss": 0.006884460151195526, + "step": 204080 + }, + { + "epoch": 28.9694819020582, + "grad_norm": 0.004725892096757889, + "learning_rate": 7.104244144783535e-05, + "loss": 0.0024671796709299088, + "step": 204090 + }, + { + "epoch": 28.970901348474094, + "grad_norm": 0.028138531371951103, + "learning_rate": 7.104102200141946e-05, + "loss": 0.047716015577316286, + "step": 204100 + }, + { + "epoch": 28.972320794889992, + "grad_norm": 0.04337480664253235, + "learning_rate": 7.103960255500354e-05, + "loss": 0.0027063697576522826, + "step": 204110 + }, + { + "epoch": 28.97374024130589, + "grad_norm": 0.42466068267822266, + "learning_rate": 7.103818310858765e-05, + "loss": 0.0016052652150392531, + "step": 204120 + }, + { + "epoch": 28.97515968772179, + "grad_norm": 0.04545964300632477, + "learning_rate": 7.103676366217176e-05, + "loss": 0.008114568144083022, + "step": 204130 + }, + { + "epoch": 28.976579134137687, + "grad_norm": 0.054928723722696304, + "learning_rate": 7.103534421575586e-05, + "loss": 0.028280919790267943, + "step": 204140 + }, + { + "epoch": 28.977998580553585, + "grad_norm": 0.13161909580230713, + "learning_rate": 7.103392476933997e-05, + "loss": 0.01608346253633499, + "step": 204150 + }, + { + "epoch": 28.979418026969483, + "grad_norm": 4.270598888397217, + 
"learning_rate": 7.103250532292406e-05, + "loss": 0.00953405275940895, + "step": 204160 + }, + { + "epoch": 28.980837473385378, + "grad_norm": 0.2795242965221405, + "learning_rate": 7.103108587650817e-05, + "loss": 0.046723181009292604, + "step": 204170 + }, + { + "epoch": 28.982256919801276, + "grad_norm": 0.03928562253713608, + "learning_rate": 7.102966643009226e-05, + "loss": 0.0031140569597482683, + "step": 204180 + }, + { + "epoch": 28.983676366217175, + "grad_norm": 0.0290362648665905, + "learning_rate": 7.102824698367637e-05, + "loss": 0.01596488505601883, + "step": 204190 + }, + { + "epoch": 28.985095812633073, + "grad_norm": 0.5165680050849915, + "learning_rate": 7.102682753726047e-05, + "loss": 0.010950812697410583, + "step": 204200 + }, + { + "epoch": 28.98651525904897, + "grad_norm": 10.361393928527832, + "learning_rate": 7.102540809084457e-05, + "loss": 0.0211029589176178, + "step": 204210 + }, + { + "epoch": 28.98793470546487, + "grad_norm": 0.5400388836860657, + "learning_rate": 7.102398864442868e-05, + "loss": 0.0038516178727149965, + "step": 204220 + }, + { + "epoch": 28.989354151880768, + "grad_norm": 10.46307373046875, + "learning_rate": 7.102256919801278e-05, + "loss": 0.024744459986686708, + "step": 204230 + }, + { + "epoch": 28.990773598296663, + "grad_norm": 0.031439702957868576, + "learning_rate": 7.102114975159689e-05, + "loss": 0.004509336873888969, + "step": 204240 + }, + { + "epoch": 28.99219304471256, + "grad_norm": 0.3492359519004822, + "learning_rate": 7.101973030518099e-05, + "loss": 0.00856209099292755, + "step": 204250 + }, + { + "epoch": 28.99361249112846, + "grad_norm": 0.04567471519112587, + "learning_rate": 7.101831085876508e-05, + "loss": 0.006784294545650482, + "step": 204260 + }, + { + "epoch": 28.995031937544358, + "grad_norm": 0.18650266528129578, + "learning_rate": 7.101689141234918e-05, + "loss": 0.05271605849266052, + "step": 204270 + }, + { + "epoch": 28.996451383960256, + "grad_norm": 12.27700138092041, + 
"learning_rate": 7.101547196593329e-05, + "loss": 0.01755138039588928, + "step": 204280 + }, + { + "epoch": 28.997870830376154, + "grad_norm": 0.14445938169956207, + "learning_rate": 7.101405251951739e-05, + "loss": 0.012498818337917328, + "step": 204290 + }, + { + "epoch": 28.999290276792053, + "grad_norm": 0.015494639053940773, + "learning_rate": 7.10126330731015e-05, + "loss": 0.010929460823535918, + "step": 204300 + }, + { + "epoch": 29.000709723207947, + "grad_norm": 8.57687759399414, + "learning_rate": 7.10112136266856e-05, + "loss": 0.024114866554737092, + "step": 204310 + }, + { + "epoch": 29.002129169623846, + "grad_norm": 0.18044428527355194, + "learning_rate": 7.10097941802697e-05, + "loss": 0.0454404890537262, + "step": 204320 + }, + { + "epoch": 29.003548616039744, + "grad_norm": 0.12010136991739273, + "learning_rate": 7.10083747338538e-05, + "loss": 0.0018241804093122483, + "step": 204330 + }, + { + "epoch": 29.004968062455642, + "grad_norm": 0.035653579980134964, + "learning_rate": 7.10069552874379e-05, + "loss": 0.0015988681465387345, + "step": 204340 + }, + { + "epoch": 29.00638750887154, + "grad_norm": 0.07429961860179901, + "learning_rate": 7.100553584102201e-05, + "loss": 0.02380144000053406, + "step": 204350 + }, + { + "epoch": 29.00780695528744, + "grad_norm": 0.02331356890499592, + "learning_rate": 7.10041163946061e-05, + "loss": 0.0054002396762371065, + "step": 204360 + }, + { + "epoch": 29.009226401703337, + "grad_norm": 4.585449695587158, + "learning_rate": 7.100269694819021e-05, + "loss": 0.009194451570510864, + "step": 204370 + }, + { + "epoch": 29.010645848119232, + "grad_norm": 0.03747543692588806, + "learning_rate": 7.10012775017743e-05, + "loss": 0.01880255788564682, + "step": 204380 + }, + { + "epoch": 29.01206529453513, + "grad_norm": 0.009632320143282413, + "learning_rate": 7.099985805535842e-05, + "loss": 0.0019395578652620315, + "step": 204390 + }, + { + "epoch": 29.01348474095103, + "grad_norm": 0.010415197350084782, + 
"learning_rate": 7.099843860894251e-05, + "loss": 0.0008637521415948868, + "step": 204400 + }, + { + "epoch": 29.014904187366927, + "grad_norm": 0.08873456716537476, + "learning_rate": 7.099701916252663e-05, + "loss": 0.041172707080841066, + "step": 204410 + }, + { + "epoch": 29.016323633782825, + "grad_norm": 0.9284741878509521, + "learning_rate": 7.099559971611072e-05, + "loss": 0.003127608820796013, + "step": 204420 + }, + { + "epoch": 29.017743080198724, + "grad_norm": 0.0021580765023827553, + "learning_rate": 7.099418026969482e-05, + "loss": 0.0006645757704973221, + "step": 204430 + }, + { + "epoch": 29.019162526614622, + "grad_norm": 0.0798548012971878, + "learning_rate": 7.099276082327893e-05, + "loss": 0.011808426678180694, + "step": 204440 + }, + { + "epoch": 29.020581973030517, + "grad_norm": 0.043699994683265686, + "learning_rate": 7.099134137686303e-05, + "loss": 0.003213892504572868, + "step": 204450 + }, + { + "epoch": 29.022001419446415, + "grad_norm": 0.0430687852203846, + "learning_rate": 7.098992193044714e-05, + "loss": 0.0009884454309940338, + "step": 204460 + }, + { + "epoch": 29.023420865862313, + "grad_norm": 2.0042316913604736, + "learning_rate": 7.098850248403122e-05, + "loss": 0.047663706541061404, + "step": 204470 + }, + { + "epoch": 29.02484031227821, + "grad_norm": 0.028123753145337105, + "learning_rate": 7.098708303761533e-05, + "loss": 0.0204979345202446, + "step": 204480 + }, + { + "epoch": 29.02625975869411, + "grad_norm": 12.216204643249512, + "learning_rate": 7.098566359119943e-05, + "loss": 0.022230838239192963, + "step": 204490 + }, + { + "epoch": 29.027679205110008, + "grad_norm": 18.8688907623291, + "learning_rate": 7.098424414478354e-05, + "loss": 0.02816259562969208, + "step": 204500 + }, + { + "epoch": 29.027679205110008, + "eval_accuracy": 0.9873466013861512, + "eval_loss": 0.0550016351044178, + "eval_runtime": 31.5865, + "eval_samples_per_second": 497.902, + "eval_steps_per_second": 15.576, + "step": 204500 + }, + { + 
"epoch": 29.029098651525906, + "grad_norm": 0.2398083508014679, + "learning_rate": 7.098282469836764e-05, + "loss": 0.0011149238795042037, + "step": 204510 + }, + { + "epoch": 29.0305180979418, + "grad_norm": 0.01801624521613121, + "learning_rate": 7.098140525195174e-05, + "loss": 0.016227111220359802, + "step": 204520 + }, + { + "epoch": 29.0319375443577, + "grad_norm": 0.03703325241804123, + "learning_rate": 7.097998580553585e-05, + "loss": 0.004071198403835297, + "step": 204530 + }, + { + "epoch": 29.033356990773598, + "grad_norm": 1.5501700639724731, + "learning_rate": 7.097856635911995e-05, + "loss": 0.013747961819171905, + "step": 204540 + }, + { + "epoch": 29.034776437189496, + "grad_norm": 0.017926491796970367, + "learning_rate": 7.097714691270406e-05, + "loss": 0.024194550514221192, + "step": 204550 + }, + { + "epoch": 29.036195883605394, + "grad_norm": 0.0819256454706192, + "learning_rate": 7.097572746628815e-05, + "loss": 0.02397385537624359, + "step": 204560 + }, + { + "epoch": 29.037615330021293, + "grad_norm": 0.5627143979072571, + "learning_rate": 7.097430801987225e-05, + "loss": 0.0039043005555868147, + "step": 204570 + }, + { + "epoch": 29.03903477643719, + "grad_norm": 0.043263256549835205, + "learning_rate": 7.097288857345635e-05, + "loss": 0.013953967392444611, + "step": 204580 + }, + { + "epoch": 29.040454222853086, + "grad_norm": 7.804495334625244, + "learning_rate": 7.097146912704046e-05, + "loss": 0.011853570491075516, + "step": 204590 + }, + { + "epoch": 29.041873669268984, + "grad_norm": 1.1331719160079956, + "learning_rate": 7.097004968062456e-05, + "loss": 0.004596026614308357, + "step": 204600 + }, + { + "epoch": 29.043293115684882, + "grad_norm": 0.013226081617176533, + "learning_rate": 7.096863023420867e-05, + "loss": 0.0033280547708272935, + "step": 204610 + }, + { + "epoch": 29.04471256210078, + "grad_norm": Infinity, + "learning_rate": 7.096721078779277e-05, + "loss": 0.013511869311332702, + "step": 204620 + }, + { + "epoch": 
29.04613200851668, + "grad_norm": 0.008982895873486996, + "learning_rate": 7.096593328601846e-05, + "loss": 0.0320911169052124, + "step": 204630 + }, + { + "epoch": 29.047551454932577, + "grad_norm": 0.20266717672348022, + "learning_rate": 7.096451383960255e-05, + "loss": 0.019582581520080567, + "step": 204640 + }, + { + "epoch": 29.048970901348476, + "grad_norm": 0.013551585376262665, + "learning_rate": 7.096309439318666e-05, + "loss": 0.020282220840454102, + "step": 204650 + }, + { + "epoch": 29.05039034776437, + "grad_norm": 0.042248934507369995, + "learning_rate": 7.096167494677076e-05, + "loss": 0.016701823472976683, + "step": 204660 + }, + { + "epoch": 29.05180979418027, + "grad_norm": 6.371963024139404, + "learning_rate": 7.096025550035487e-05, + "loss": 0.026138174533843993, + "step": 204670 + }, + { + "epoch": 29.053229240596167, + "grad_norm": 0.15425243973731995, + "learning_rate": 7.095883605393896e-05, + "loss": 0.0027266811579465867, + "step": 204680 + }, + { + "epoch": 29.054648687012065, + "grad_norm": 0.15129446983337402, + "learning_rate": 7.095741660752306e-05, + "loss": 0.022380030155181883, + "step": 204690 + }, + { + "epoch": 29.056068133427964, + "grad_norm": 0.22657883167266846, + "learning_rate": 7.095599716110717e-05, + "loss": 0.001166776567697525, + "step": 204700 + }, + { + "epoch": 29.057487579843862, + "grad_norm": 0.04861035197973251, + "learning_rate": 7.095457771469127e-05, + "loss": 0.007727093994617462, + "step": 204710 + }, + { + "epoch": 29.05890702625976, + "grad_norm": 0.07833079993724823, + "learning_rate": 7.095315826827538e-05, + "loss": 0.0057760275900363926, + "step": 204720 + }, + { + "epoch": 29.060326472675655, + "grad_norm": 0.04368456080555916, + "learning_rate": 7.095173882185948e-05, + "loss": 0.0020632706582546234, + "step": 204730 + }, + { + "epoch": 29.061745919091553, + "grad_norm": 0.05444006994366646, + "learning_rate": 7.095031937544359e-05, + "loss": 0.013227127492427826, + "step": 204740 + }, + { + 
"epoch": 29.06316536550745, + "grad_norm": 0.01645474322140217, + "learning_rate": 7.094889992902767e-05, + "loss": 0.022583723068237305, + "step": 204750 + }, + { + "epoch": 29.06458481192335, + "grad_norm": 0.043432533740997314, + "learning_rate": 7.094748048261178e-05, + "loss": 0.01782907545566559, + "step": 204760 + }, + { + "epoch": 29.066004258339248, + "grad_norm": 0.07756974548101425, + "learning_rate": 7.094606103619588e-05, + "loss": 0.0012576159089803696, + "step": 204770 + }, + { + "epoch": 29.067423704755146, + "grad_norm": 0.004516893997788429, + "learning_rate": 7.094464158977999e-05, + "loss": 0.007366686314344406, + "step": 204780 + }, + { + "epoch": 29.068843151171045, + "grad_norm": 0.4319714903831482, + "learning_rate": 7.09432221433641e-05, + "loss": 0.004111701995134354, + "step": 204790 + }, + { + "epoch": 29.07026259758694, + "grad_norm": 0.28577637672424316, + "learning_rate": 7.094180269694819e-05, + "loss": 0.0036803193390369414, + "step": 204800 + }, + { + "epoch": 29.071682044002838, + "grad_norm": 0.051050931215286255, + "learning_rate": 7.09403832505323e-05, + "loss": 0.04436950981616974, + "step": 204810 + }, + { + "epoch": 29.073101490418736, + "grad_norm": 0.3771117925643921, + "learning_rate": 7.09389638041164e-05, + "loss": 0.01367611289024353, + "step": 204820 + }, + { + "epoch": 29.074520936834634, + "grad_norm": 0.6947856545448303, + "learning_rate": 7.09375443577005e-05, + "loss": 0.024499797821044923, + "step": 204830 + }, + { + "epoch": 29.075940383250533, + "grad_norm": 1.1245008707046509, + "learning_rate": 7.09361249112846e-05, + "loss": 0.007666733860969543, + "step": 204840 + }, + { + "epoch": 29.07735982966643, + "grad_norm": 0.09200689196586609, + "learning_rate": 7.09347054648687e-05, + "loss": 0.0024040088057518004, + "step": 204850 + }, + { + "epoch": 29.07877927608233, + "grad_norm": 0.10292667895555496, + "learning_rate": 7.09332860184528e-05, + "loss": 0.008451925963163376, + "step": 204860 + }, + { + "epoch": 
29.080198722498224, + "grad_norm": 6.183298110961914, + "learning_rate": 7.093186657203691e-05, + "loss": 0.012903231382369994, + "step": 204870 + }, + { + "epoch": 29.081618168914122, + "grad_norm": 4.010056495666504, + "learning_rate": 7.093044712562102e-05, + "loss": 0.042326799035072325, + "step": 204880 + }, + { + "epoch": 29.08303761533002, + "grad_norm": 0.01123038399964571, + "learning_rate": 7.092902767920512e-05, + "loss": 0.0034512549638748167, + "step": 204890 + }, + { + "epoch": 29.08445706174592, + "grad_norm": 7.443796634674072, + "learning_rate": 7.092760823278921e-05, + "loss": 0.01755686104297638, + "step": 204900 + }, + { + "epoch": 29.085876508161817, + "grad_norm": 0.04091622307896614, + "learning_rate": 7.092618878637331e-05, + "loss": 0.015118768811225891, + "step": 204910 + }, + { + "epoch": 29.087295954577716, + "grad_norm": 9.117084503173828, + "learning_rate": 7.092476933995742e-05, + "loss": 0.0470307320356369, + "step": 204920 + }, + { + "epoch": 29.088715400993614, + "grad_norm": 2.689970016479492, + "learning_rate": 7.092334989354152e-05, + "loss": 0.025400426983833314, + "step": 204930 + }, + { + "epoch": 29.09013484740951, + "grad_norm": 0.1683264821767807, + "learning_rate": 7.092193044712563e-05, + "loss": 0.014978167414665223, + "step": 204940 + }, + { + "epoch": 29.091554293825407, + "grad_norm": 2.313295841217041, + "learning_rate": 7.092051100070972e-05, + "loss": 0.03770340085029602, + "step": 204950 + }, + { + "epoch": 29.092973740241305, + "grad_norm": 0.0022007140796631575, + "learning_rate": 7.091909155429383e-05, + "loss": 0.021999754011631012, + "step": 204960 + }, + { + "epoch": 29.094393186657204, + "grad_norm": 3.6474814414978027, + "learning_rate": 7.091767210787794e-05, + "loss": 0.03110913634300232, + "step": 204970 + }, + { + "epoch": 29.095812633073102, + "grad_norm": 1.8178870677947998, + "learning_rate": 7.091625266146203e-05, + "loss": 0.035340511798858644, + "step": 204980 + }, + { + "epoch": 
29.097232079489, + "grad_norm": 0.30559787154197693, + "learning_rate": 7.091483321504615e-05, + "loss": 0.021605321764945985, + "step": 204990 + }, + { + "epoch": 29.0986515259049, + "grad_norm": 0.08376994729042053, + "learning_rate": 7.091341376863023e-05, + "loss": 0.011796525120735169, + "step": 205000 + }, + { + "epoch": 29.0986515259049, + "eval_accuracy": 0.9903986774337127, + "eval_loss": 0.03738699108362198, + "eval_runtime": 31.4594, + "eval_samples_per_second": 499.914, + "eval_steps_per_second": 15.639, + "step": 205000 + }, + { + "epoch": 29.100070972320793, + "grad_norm": 17.476272583007812, + "learning_rate": 7.091199432221434e-05, + "loss": 0.015693823993206023, + "step": 205010 + }, + { + "epoch": 29.10149041873669, + "grad_norm": 0.01576380990445614, + "learning_rate": 7.091057487579844e-05, + "loss": 0.004428884387016297, + "step": 205020 + }, + { + "epoch": 29.10290986515259, + "grad_norm": 0.2229311615228653, + "learning_rate": 7.090915542938255e-05, + "loss": 0.004905415698885918, + "step": 205030 + }, + { + "epoch": 29.10432931156849, + "grad_norm": 0.6823704242706299, + "learning_rate": 7.090773598296665e-05, + "loss": 0.005483600497245789, + "step": 205040 + }, + { + "epoch": 29.105748757984387, + "grad_norm": 0.37481412291526794, + "learning_rate": 7.090631653655074e-05, + "loss": 0.0054867170751094815, + "step": 205050 + }, + { + "epoch": 29.107168204400285, + "grad_norm": 8.368255615234375, + "learning_rate": 7.090489709013485e-05, + "loss": 0.022666481137275696, + "step": 205060 + }, + { + "epoch": 29.108587650816183, + "grad_norm": 0.006913583725690842, + "learning_rate": 7.090347764371895e-05, + "loss": 0.002422812953591347, + "step": 205070 + }, + { + "epoch": 29.110007097232078, + "grad_norm": 0.002643840853124857, + "learning_rate": 7.090205819730306e-05, + "loss": 0.01644216775894165, + "step": 205080 + }, + { + "epoch": 29.111426543647976, + "grad_norm": 0.3830544054508209, + "learning_rate": 7.090063875088716e-05, + "loss": 
0.011379379034042358, + "step": 205090 + }, + { + "epoch": 29.112845990063875, + "grad_norm": 0.6835799217224121, + "learning_rate": 7.089921930447127e-05, + "loss": 0.006229359656572342, + "step": 205100 + }, + { + "epoch": 29.114265436479773, + "grad_norm": 0.019975069910287857, + "learning_rate": 7.089779985805535e-05, + "loss": 0.0004241444170475006, + "step": 205110 + }, + { + "epoch": 29.11568488289567, + "grad_norm": 0.1406504213809967, + "learning_rate": 7.089638041163947e-05, + "loss": 0.007870028913021087, + "step": 205120 + }, + { + "epoch": 29.11710432931157, + "grad_norm": 1.356054425239563, + "learning_rate": 7.089496096522356e-05, + "loss": 0.006814044713973999, + "step": 205130 + }, + { + "epoch": 29.118523775727468, + "grad_norm": 0.030155131593346596, + "learning_rate": 7.089354151880767e-05, + "loss": 0.011633841693401337, + "step": 205140 + }, + { + "epoch": 29.119943222143363, + "grad_norm": 16.28006362915039, + "learning_rate": 7.089212207239177e-05, + "loss": 0.049263617396354674, + "step": 205150 + }, + { + "epoch": 29.12136266855926, + "grad_norm": 0.37614044547080994, + "learning_rate": 7.089070262597587e-05, + "loss": 0.045173737406730655, + "step": 205160 + }, + { + "epoch": 29.12278211497516, + "grad_norm": 0.15188278257846832, + "learning_rate": 7.088928317955998e-05, + "loss": 0.017644035816192626, + "step": 205170 + }, + { + "epoch": 29.124201561391057, + "grad_norm": 0.17325779795646667, + "learning_rate": 7.088786373314408e-05, + "loss": 0.003933382779359817, + "step": 205180 + }, + { + "epoch": 29.125621007806956, + "grad_norm": 0.04497085511684418, + "learning_rate": 7.088644428672819e-05, + "loss": 0.008122949302196503, + "step": 205190 + }, + { + "epoch": 29.127040454222854, + "grad_norm": 0.038751836866140366, + "learning_rate": 7.088502484031229e-05, + "loss": 0.002958225831389427, + "step": 205200 + }, + { + "epoch": 29.128459900638752, + "grad_norm": 0.020191026851534843, + "learning_rate": 7.088360539389638e-05, + "loss": 
0.01171911507844925, + "step": 205210 + }, + { + "epoch": 29.129879347054647, + "grad_norm": 5.495780944824219, + "learning_rate": 7.088218594748048e-05, + "loss": 0.03065202832221985, + "step": 205220 + }, + { + "epoch": 29.131298793470545, + "grad_norm": 0.18437226116657257, + "learning_rate": 7.088076650106459e-05, + "loss": 0.005065786838531494, + "step": 205230 + }, + { + "epoch": 29.132718239886444, + "grad_norm": 2.5608460903167725, + "learning_rate": 7.087934705464869e-05, + "loss": 0.0014641720801591874, + "step": 205240 + }, + { + "epoch": 29.134137686302342, + "grad_norm": 0.016177784651517868, + "learning_rate": 7.08779276082328e-05, + "loss": 0.012796314060688018, + "step": 205250 + }, + { + "epoch": 29.13555713271824, + "grad_norm": 0.042292408645153046, + "learning_rate": 7.08765081618169e-05, + "loss": 0.0036349888890981675, + "step": 205260 + }, + { + "epoch": 29.13697657913414, + "grad_norm": 9.319870948791504, + "learning_rate": 7.0875088715401e-05, + "loss": 0.011934319138526916, + "step": 205270 + }, + { + "epoch": 29.138396025550037, + "grad_norm": 1.4489409923553467, + "learning_rate": 7.08736692689851e-05, + "loss": 0.005644876882433892, + "step": 205280 + }, + { + "epoch": 29.13981547196593, + "grad_norm": 0.5526618361473083, + "learning_rate": 7.08722498225692e-05, + "loss": 0.007086797058582306, + "step": 205290 + }, + { + "epoch": 29.14123491838183, + "grad_norm": 0.08333895355463028, + "learning_rate": 7.087083037615331e-05, + "loss": 0.0071210071444511415, + "step": 205300 + }, + { + "epoch": 29.14265436479773, + "grad_norm": 0.7563687562942505, + "learning_rate": 7.08694109297374e-05, + "loss": 0.002496865391731262, + "step": 205310 + }, + { + "epoch": 29.144073811213627, + "grad_norm": 0.0059651462361216545, + "learning_rate": 7.086799148332151e-05, + "loss": 0.018571692705154418, + "step": 205320 + }, + { + "epoch": 29.145493257629525, + "grad_norm": 2.8342888355255127, + "learning_rate": 7.08665720369056e-05, + "loss": 
0.006451577693223953, + "step": 205330 + }, + { + "epoch": 29.146912704045423, + "grad_norm": 5.492838382720947, + "learning_rate": 7.086515259048972e-05, + "loss": 0.07309772968292236, + "step": 205340 + }, + { + "epoch": 29.14833215046132, + "grad_norm": 3.58431077003479, + "learning_rate": 7.086373314407381e-05, + "loss": 0.03515995144844055, + "step": 205350 + }, + { + "epoch": 29.149751596877216, + "grad_norm": 0.6777535676956177, + "learning_rate": 7.086231369765791e-05, + "loss": 0.004998932406306267, + "step": 205360 + }, + { + "epoch": 29.151171043293115, + "grad_norm": 2.2583272457122803, + "learning_rate": 7.086089425124202e-05, + "loss": 0.04538760185241699, + "step": 205370 + }, + { + "epoch": 29.152590489709013, + "grad_norm": 2.326537847518921, + "learning_rate": 7.085947480482612e-05, + "loss": 0.0195939764380455, + "step": 205380 + }, + { + "epoch": 29.15400993612491, + "grad_norm": 7.12360954284668, + "learning_rate": 7.085805535841023e-05, + "loss": 0.03506704568862915, + "step": 205390 + }, + { + "epoch": 29.15542938254081, + "grad_norm": 8.28392505645752, + "learning_rate": 7.085663591199433e-05, + "loss": 0.025259250402450563, + "step": 205400 + }, + { + "epoch": 29.156848828956708, + "grad_norm": 0.009261448867619038, + "learning_rate": 7.085521646557842e-05, + "loss": 0.03002377450466156, + "step": 205410 + }, + { + "epoch": 29.158268275372606, + "grad_norm": 0.3068506121635437, + "learning_rate": 7.085379701916252e-05, + "loss": 0.020888492465019226, + "step": 205420 + }, + { + "epoch": 29.1596877217885, + "grad_norm": 0.061142370104789734, + "learning_rate": 7.085237757274663e-05, + "loss": 0.009020818769931794, + "step": 205430 + }, + { + "epoch": 29.1611071682044, + "grad_norm": 0.10861548781394958, + "learning_rate": 7.085095812633073e-05, + "loss": 0.001706032082438469, + "step": 205440 + }, + { + "epoch": 29.162526614620297, + "grad_norm": 0.3946651220321655, + "learning_rate": 7.084953867991484e-05, + "loss": 0.0039549294859170915, + 
"step": 205450 + }, + { + "epoch": 29.163946061036196, + "grad_norm": 0.481130987405777, + "learning_rate": 7.084811923349894e-05, + "loss": 0.003743773326277733, + "step": 205460 + }, + { + "epoch": 29.165365507452094, + "grad_norm": 6.167041778564453, + "learning_rate": 7.084669978708304e-05, + "loss": 0.020918484032154083, + "step": 205470 + }, + { + "epoch": 29.166784953867992, + "grad_norm": 0.5331241488456726, + "learning_rate": 7.084528034066715e-05, + "loss": 0.004873482882976532, + "step": 205480 + }, + { + "epoch": 29.16820440028389, + "grad_norm": 0.018122689798474312, + "learning_rate": 7.084386089425124e-05, + "loss": 0.003839380666613579, + "step": 205490 + }, + { + "epoch": 29.169623846699785, + "grad_norm": 0.5368046760559082, + "learning_rate": 7.084244144783536e-05, + "loss": 0.010295604169368745, + "step": 205500 + }, + { + "epoch": 29.169623846699785, + "eval_accuracy": 0.9894449036688497, + "eval_loss": 0.04192632809281349, + "eval_runtime": 31.4367, + "eval_samples_per_second": 500.276, + "eval_steps_per_second": 15.651, + "step": 205500 + }, + { + "epoch": 29.171043293115684, + "grad_norm": 0.1448856145143509, + "learning_rate": 7.084102200141945e-05, + "loss": 0.011319178342819213, + "step": 205510 + }, + { + "epoch": 29.172462739531582, + "grad_norm": 0.00954871904104948, + "learning_rate": 7.083960255500355e-05, + "loss": 0.005288292095065117, + "step": 205520 + }, + { + "epoch": 29.17388218594748, + "grad_norm": 0.22028595209121704, + "learning_rate": 7.083818310858765e-05, + "loss": 0.04223674535751343, + "step": 205530 + }, + { + "epoch": 29.17530163236338, + "grad_norm": 0.03859535604715347, + "learning_rate": 7.083676366217176e-05, + "loss": 0.0028855033218860627, + "step": 205540 + }, + { + "epoch": 29.176721078779277, + "grad_norm": 7.159955978393555, + "learning_rate": 7.083534421575586e-05, + "loss": 0.012969423830509186, + "step": 205550 + }, + { + "epoch": 29.178140525195175, + "grad_norm": 0.059085503220558167, + 
"learning_rate": 7.083392476933997e-05, + "loss": 0.027366524934768675, + "step": 205560 + }, + { + "epoch": 29.17955997161107, + "grad_norm": 0.06202727183699608, + "learning_rate": 7.083250532292406e-05, + "loss": 0.025493437051773073, + "step": 205570 + }, + { + "epoch": 29.18097941802697, + "grad_norm": 0.041119158267974854, + "learning_rate": 7.083108587650816e-05, + "loss": 0.021938392519950868, + "step": 205580 + }, + { + "epoch": 29.182398864442867, + "grad_norm": 0.22021150588989258, + "learning_rate": 7.082966643009227e-05, + "loss": 0.004367426410317421, + "step": 205590 + }, + { + "epoch": 29.183818310858765, + "grad_norm": 0.07271493226289749, + "learning_rate": 7.082824698367637e-05, + "loss": 0.008750979602336884, + "step": 205600 + }, + { + "epoch": 29.185237757274663, + "grad_norm": 0.06333275139331818, + "learning_rate": 7.082682753726048e-05, + "loss": 0.004452174901962281, + "step": 205610 + }, + { + "epoch": 29.18665720369056, + "grad_norm": 0.018476534634828568, + "learning_rate": 7.082540809084456e-05, + "loss": 0.0043740089982748035, + "step": 205620 + }, + { + "epoch": 29.18807665010646, + "grad_norm": 12.55189323425293, + "learning_rate": 7.082398864442868e-05, + "loss": 0.05591241121292114, + "step": 205630 + }, + { + "epoch": 29.189496096522355, + "grad_norm": 2.6163673400878906, + "learning_rate": 7.082256919801277e-05, + "loss": 0.013601192831993103, + "step": 205640 + }, + { + "epoch": 29.190915542938253, + "grad_norm": 0.03271455317735672, + "learning_rate": 7.082114975159688e-05, + "loss": 0.0013475362211465836, + "step": 205650 + }, + { + "epoch": 29.19233498935415, + "grad_norm": 0.01654021441936493, + "learning_rate": 7.081973030518098e-05, + "loss": 0.003646788373589516, + "step": 205660 + }, + { + "epoch": 29.19375443577005, + "grad_norm": 0.018734095618128777, + "learning_rate": 7.081831085876508e-05, + "loss": 0.0028090622276067735, + "step": 205670 + }, + { + "epoch": 29.195173882185948, + "grad_norm": 0.1278701275587082, + 
"learning_rate": 7.081689141234919e-05, + "loss": 0.02435038685798645, + "step": 205680 + }, + { + "epoch": 29.196593328601846, + "grad_norm": 4.24138879776001, + "learning_rate": 7.081547196593329e-05, + "loss": 0.004561011865735054, + "step": 205690 + }, + { + "epoch": 29.198012775017745, + "grad_norm": 0.640087902545929, + "learning_rate": 7.08140525195174e-05, + "loss": 0.03111845552921295, + "step": 205700 + }, + { + "epoch": 29.19943222143364, + "grad_norm": 8.193670272827148, + "learning_rate": 7.08126330731015e-05, + "loss": 0.026693865656852722, + "step": 205710 + }, + { + "epoch": 29.200851667849538, + "grad_norm": 0.054795727133750916, + "learning_rate": 7.081121362668559e-05, + "loss": 0.01952531486749649, + "step": 205720 + }, + { + "epoch": 29.202271114265436, + "grad_norm": 0.19635026156902313, + "learning_rate": 7.080979418026969e-05, + "loss": 0.0030548125505447388, + "step": 205730 + }, + { + "epoch": 29.203690560681334, + "grad_norm": 13.01323413848877, + "learning_rate": 7.08083747338538e-05, + "loss": 0.023395511507987975, + "step": 205740 + }, + { + "epoch": 29.205110007097232, + "grad_norm": 6.679422855377197, + "learning_rate": 7.08069552874379e-05, + "loss": 0.0236405611038208, + "step": 205750 + }, + { + "epoch": 29.20652945351313, + "grad_norm": 0.35494017601013184, + "learning_rate": 7.080553584102201e-05, + "loss": 0.007217636704444886, + "step": 205760 + }, + { + "epoch": 29.20794889992903, + "grad_norm": 14.131094932556152, + "learning_rate": 7.08041163946061e-05, + "loss": 0.014654545485973359, + "step": 205770 + }, + { + "epoch": 29.209368346344924, + "grad_norm": 0.24067169427871704, + "learning_rate": 7.08026969481902e-05, + "loss": 0.0017931144684553146, + "step": 205780 + }, + { + "epoch": 29.210787792760822, + "grad_norm": 0.40082618594169617, + "learning_rate": 7.080127750177431e-05, + "loss": 0.021633344888687133, + "step": 205790 + }, + { + "epoch": 29.21220723917672, + "grad_norm": 0.03998534008860588, + "learning_rate": 
7.079985805535841e-05, + "loss": 0.026026412844657898, + "step": 205800 + }, + { + "epoch": 29.21362668559262, + "grad_norm": 0.008162587881088257, + "learning_rate": 7.079843860894252e-05, + "loss": 0.0015093978494405747, + "step": 205810 + }, + { + "epoch": 29.215046132008517, + "grad_norm": 0.010677729733288288, + "learning_rate": 7.079701916252662e-05, + "loss": 0.0038524225354194643, + "step": 205820 + }, + { + "epoch": 29.216465578424415, + "grad_norm": 0.01187889650464058, + "learning_rate": 7.079559971611072e-05, + "loss": 0.009772031009197235, + "step": 205830 + }, + { + "epoch": 29.217885024840314, + "grad_norm": 6.031559467315674, + "learning_rate": 7.079418026969482e-05, + "loss": 0.013044306635856628, + "step": 205840 + }, + { + "epoch": 29.21930447125621, + "grad_norm": 4.540803909301758, + "learning_rate": 7.079276082327893e-05, + "loss": 0.014139991998672486, + "step": 205850 + }, + { + "epoch": 29.220723917672107, + "grad_norm": 0.08616404235363007, + "learning_rate": 7.079134137686302e-05, + "loss": 0.005545485764741898, + "step": 205860 + }, + { + "epoch": 29.222143364088005, + "grad_norm": 0.016093222424387932, + "learning_rate": 7.078992193044713e-05, + "loss": 0.016155844926834105, + "step": 205870 + }, + { + "epoch": 29.223562810503903, + "grad_norm": 0.08733079582452774, + "learning_rate": 7.078850248403123e-05, + "loss": 0.023034313321113588, + "step": 205880 + }, + { + "epoch": 29.2249822569198, + "grad_norm": 0.6125519871711731, + "learning_rate": 7.078708303761533e-05, + "loss": 0.027742061018943786, + "step": 205890 + }, + { + "epoch": 29.2264017033357, + "grad_norm": 0.2161005586385727, + "learning_rate": 7.078566359119944e-05, + "loss": 0.014590105414390564, + "step": 205900 + }, + { + "epoch": 29.2278211497516, + "grad_norm": 2.2422709465026855, + "learning_rate": 7.078424414478354e-05, + "loss": 0.02107901722192764, + "step": 205910 + }, + { + "epoch": 29.229240596167493, + "grad_norm": 0.009922299534082413, + "learning_rate": 
7.078282469836765e-05, + "loss": 0.005160903930664063, + "step": 205920 + }, + { + "epoch": 29.23066004258339, + "grad_norm": 0.31814995408058167, + "learning_rate": 7.078140525195173e-05, + "loss": 0.02137155532836914, + "step": 205930 + }, + { + "epoch": 29.23207948899929, + "grad_norm": 15.170196533203125, + "learning_rate": 7.077998580553584e-05, + "loss": 0.018766942620277404, + "step": 205940 + }, + { + "epoch": 29.233498935415188, + "grad_norm": 0.1937604695558548, + "learning_rate": 7.077856635911994e-05, + "loss": 0.008404122292995453, + "step": 205950 + }, + { + "epoch": 29.234918381831086, + "grad_norm": 0.18435829877853394, + "learning_rate": 7.077714691270405e-05, + "loss": 0.006417787075042725, + "step": 205960 + }, + { + "epoch": 29.236337828246985, + "grad_norm": 0.16199879348278046, + "learning_rate": 7.077572746628815e-05, + "loss": 0.003665405884385109, + "step": 205970 + }, + { + "epoch": 29.237757274662883, + "grad_norm": 0.5032358765602112, + "learning_rate": 7.077430801987225e-05, + "loss": 0.0015227966010570525, + "step": 205980 + }, + { + "epoch": 29.239176721078778, + "grad_norm": 2.1172358989715576, + "learning_rate": 7.077288857345636e-05, + "loss": 0.0044368445873260495, + "step": 205990 + }, + { + "epoch": 29.240596167494676, + "grad_norm": 0.030380597338080406, + "learning_rate": 7.077146912704045e-05, + "loss": 0.007187486439943313, + "step": 206000 + }, + { + "epoch": 29.240596167494676, + "eval_accuracy": 0.9900171679277675, + "eval_loss": 0.0403205081820488, + "eval_runtime": 31.8328, + "eval_samples_per_second": 494.05, + "eval_steps_per_second": 15.456, + "step": 206000 + }, + { + "epoch": 29.242015613910574, + "grad_norm": 0.05838993936777115, + "learning_rate": 7.077004968062457e-05, + "loss": 0.04411768019199371, + "step": 206010 + }, + { + "epoch": 29.243435060326473, + "grad_norm": 1.6254521608352661, + "learning_rate": 7.076863023420866e-05, + "loss": 0.0021566618233919144, + "step": 206020 + }, + { + "epoch": 
29.24485450674237, + "grad_norm": 15.376858711242676, + "learning_rate": 7.076721078779276e-05, + "loss": 0.015964692831039427, + "step": 206030 + }, + { + "epoch": 29.24627395315827, + "grad_norm": 6.894342422485352, + "learning_rate": 7.076579134137686e-05, + "loss": 0.01085326224565506, + "step": 206040 + }, + { + "epoch": 29.247693399574167, + "grad_norm": 1.7550249099731445, + "learning_rate": 7.076437189496097e-05, + "loss": 0.02769227623939514, + "step": 206050 + }, + { + "epoch": 29.249112845990062, + "grad_norm": 0.3359670639038086, + "learning_rate": 7.076295244854507e-05, + "loss": 0.001883171871304512, + "step": 206060 + }, + { + "epoch": 29.25053229240596, + "grad_norm": 2.569694995880127, + "learning_rate": 7.076153300212918e-05, + "loss": 0.048984694480896, + "step": 206070 + }, + { + "epoch": 29.25195173882186, + "grad_norm": 3.5762572288513184, + "learning_rate": 7.076011355571327e-05, + "loss": 0.003993140161037445, + "step": 206080 + }, + { + "epoch": 29.253371185237757, + "grad_norm": 3.5176777839660645, + "learning_rate": 7.075869410929737e-05, + "loss": 0.0041658468544483185, + "step": 206090 + }, + { + "epoch": 29.254790631653655, + "grad_norm": 0.02567232958972454, + "learning_rate": 7.075727466288148e-05, + "loss": 0.0017358839511871337, + "step": 206100 + }, + { + "epoch": 29.256210078069554, + "grad_norm": 7.715625286102295, + "learning_rate": 7.075585521646558e-05, + "loss": 0.004181325435638428, + "step": 206110 + }, + { + "epoch": 29.257629524485452, + "grad_norm": 0.4522411525249481, + "learning_rate": 7.075443577004969e-05, + "loss": 0.02507193386554718, + "step": 206120 + }, + { + "epoch": 29.259048970901347, + "grad_norm": 0.02736782282590866, + "learning_rate": 7.075301632363379e-05, + "loss": 0.0035963211208581924, + "step": 206130 + }, + { + "epoch": 29.260468417317245, + "grad_norm": 0.21447253227233887, + "learning_rate": 7.075159687721789e-05, + "loss": 0.03258508145809173, + "step": 206140 + }, + { + "epoch": 
29.261887863733143, + "grad_norm": 17.119125366210938, + "learning_rate": 7.075017743080198e-05, + "loss": 0.040811455249786376, + "step": 206150 + }, + { + "epoch": 29.26330731014904, + "grad_norm": 0.14982275664806366, + "learning_rate": 7.07487579843861e-05, + "loss": 0.038484716415405275, + "step": 206160 + }, + { + "epoch": 29.26472675656494, + "grad_norm": 0.9254148006439209, + "learning_rate": 7.074733853797019e-05, + "loss": 0.029991361498832702, + "step": 206170 + }, + { + "epoch": 29.26614620298084, + "grad_norm": 0.033785831183195114, + "learning_rate": 7.07459190915543e-05, + "loss": 0.03206554055213928, + "step": 206180 + }, + { + "epoch": 29.267565649396737, + "grad_norm": 0.20515330135822296, + "learning_rate": 7.07444996451384e-05, + "loss": 0.05953955054283142, + "step": 206190 + }, + { + "epoch": 29.26898509581263, + "grad_norm": 0.004370762966573238, + "learning_rate": 7.07430801987225e-05, + "loss": 0.02474723905324936, + "step": 206200 + }, + { + "epoch": 29.27040454222853, + "grad_norm": 4.668515205383301, + "learning_rate": 7.074166075230661e-05, + "loss": 0.018773020803928377, + "step": 206210 + }, + { + "epoch": 29.271823988644428, + "grad_norm": 28.015869140625, + "learning_rate": 7.07402413058907e-05, + "loss": 0.04481661319732666, + "step": 206220 + }, + { + "epoch": 29.273243435060326, + "grad_norm": 0.8639832139015198, + "learning_rate": 7.073882185947482e-05, + "loss": 0.02386438250541687, + "step": 206230 + }, + { + "epoch": 29.274662881476225, + "grad_norm": 13.687540054321289, + "learning_rate": 7.07374024130589e-05, + "loss": 0.02066381126642227, + "step": 206240 + }, + { + "epoch": 29.276082327892123, + "grad_norm": 0.1784941405057907, + "learning_rate": 7.073598296664301e-05, + "loss": 0.023449695110321044, + "step": 206250 + }, + { + "epoch": 29.27750177430802, + "grad_norm": 7.884387969970703, + "learning_rate": 7.073456352022711e-05, + "loss": 0.019045060873031615, + "step": 206260 + }, + { + "epoch": 29.278921220723916, + 
"grad_norm": 2.2537362575531006, + "learning_rate": 7.073314407381122e-05, + "loss": 0.009772108495235443, + "step": 206270 + }, + { + "epoch": 29.280340667139814, + "grad_norm": 0.05800760164856911, + "learning_rate": 7.073172462739533e-05, + "loss": 0.0037116661667823793, + "step": 206280 + }, + { + "epoch": 29.281760113555713, + "grad_norm": 0.3520843982696533, + "learning_rate": 7.073030518097941e-05, + "loss": 0.013787555694580077, + "step": 206290 + }, + { + "epoch": 29.28317955997161, + "grad_norm": 0.10926155745983124, + "learning_rate": 7.072888573456352e-05, + "loss": 0.00247809924185276, + "step": 206300 + }, + { + "epoch": 29.28459900638751, + "grad_norm": 9.280502319335938, + "learning_rate": 7.072746628814762e-05, + "loss": 0.05478657484054565, + "step": 206310 + }, + { + "epoch": 29.286018452803408, + "grad_norm": 0.007723371498286724, + "learning_rate": 7.072604684173173e-05, + "loss": 0.009701645374298096, + "step": 206320 + }, + { + "epoch": 29.287437899219306, + "grad_norm": 0.09914112091064453, + "learning_rate": 7.072462739531583e-05, + "loss": 0.002544272691011429, + "step": 206330 + }, + { + "epoch": 29.2888573456352, + "grad_norm": 0.11047609895467758, + "learning_rate": 7.072320794889993e-05, + "loss": 0.07040393948554993, + "step": 206340 + }, + { + "epoch": 29.2902767920511, + "grad_norm": 0.04615732654929161, + "learning_rate": 7.072178850248403e-05, + "loss": 0.002698909118771553, + "step": 206350 + }, + { + "epoch": 29.291696238466997, + "grad_norm": 0.12837369740009308, + "learning_rate": 7.072036905606814e-05, + "loss": 0.0032525677233934402, + "step": 206360 + }, + { + "epoch": 29.293115684882896, + "grad_norm": 2.8872568607330322, + "learning_rate": 7.071894960965225e-05, + "loss": 0.0017309773713350296, + "step": 206370 + }, + { + "epoch": 29.294535131298794, + "grad_norm": 3.8100996017456055, + "learning_rate": 7.071753016323634e-05, + "loss": 0.013496044278144836, + "step": 206380 + }, + { + "epoch": 29.295954577714692, + 
"grad_norm": 0.07967525720596313, + "learning_rate": 7.071611071682044e-05, + "loss": 0.00093051977455616, + "step": 206390 + }, + { + "epoch": 29.29737402413059, + "grad_norm": 0.02074570395052433, + "learning_rate": 7.071469127040454e-05, + "loss": 0.00269443579018116, + "step": 206400 + }, + { + "epoch": 29.298793470546485, + "grad_norm": 0.048914436250925064, + "learning_rate": 7.071327182398865e-05, + "loss": 0.02483227998018265, + "step": 206410 + }, + { + "epoch": 29.300212916962384, + "grad_norm": 1.5568811893463135, + "learning_rate": 7.071185237757275e-05, + "loss": 0.010647733509540559, + "step": 206420 + }, + { + "epoch": 29.301632363378282, + "grad_norm": 0.01105272676795721, + "learning_rate": 7.071043293115686e-05, + "loss": 0.003585050255060196, + "step": 206430 + }, + { + "epoch": 29.30305180979418, + "grad_norm": 0.1770784854888916, + "learning_rate": 7.070901348474094e-05, + "loss": 0.0012425307184457778, + "step": 206440 + }, + { + "epoch": 29.30447125621008, + "grad_norm": 0.18587981164455414, + "learning_rate": 7.070759403832505e-05, + "loss": 0.005501533672213554, + "step": 206450 + }, + { + "epoch": 29.305890702625977, + "grad_norm": 0.17616312205791473, + "learning_rate": 7.070617459190916e-05, + "loss": 0.0012329213321208953, + "step": 206460 + }, + { + "epoch": 29.307310149041875, + "grad_norm": 0.1321578472852707, + "learning_rate": 7.070475514549326e-05, + "loss": 0.010490577667951584, + "step": 206470 + }, + { + "epoch": 29.30872959545777, + "grad_norm": 0.8319923281669617, + "learning_rate": 7.070333569907737e-05, + "loss": 0.016795310378074645, + "step": 206480 + }, + { + "epoch": 29.310149041873668, + "grad_norm": 3.319706439971924, + "learning_rate": 7.070191625266147e-05, + "loss": 0.003536257892847061, + "step": 206490 + }, + { + "epoch": 29.311568488289566, + "grad_norm": 1.1165306568145752, + "learning_rate": 7.070049680624557e-05, + "loss": 0.0021443594247102737, + "step": 206500 + }, + { + "epoch": 29.311568488289566, + 
"eval_accuracy": 0.9918611305398359, + "eval_loss": 0.0344008207321167, + "eval_runtime": 32.2999, + "eval_samples_per_second": 486.905, + "eval_steps_per_second": 15.232, + "step": 206500 + }, + { + "epoch": 29.312987934705465, + "grad_norm": 0.004230252001434565, + "learning_rate": 7.069907735982966e-05, + "loss": 0.0009764399379491806, + "step": 206510 + }, + { + "epoch": 29.314407381121363, + "grad_norm": 2.231658458709717, + "learning_rate": 7.069765791341378e-05, + "loss": 0.017434492707252502, + "step": 206520 + }, + { + "epoch": 29.31582682753726, + "grad_norm": 0.016378097236156464, + "learning_rate": 7.069623846699787e-05, + "loss": 0.011703105270862579, + "step": 206530 + }, + { + "epoch": 29.31724627395316, + "grad_norm": 0.10890492051839828, + "learning_rate": 7.069481902058198e-05, + "loss": 0.0018124084919691086, + "step": 206540 + }, + { + "epoch": 29.318665720369054, + "grad_norm": 0.016483768820762634, + "learning_rate": 7.069339957416608e-05, + "loss": 0.006163845956325531, + "step": 206550 + }, + { + "epoch": 29.320085166784953, + "grad_norm": 0.01687806285917759, + "learning_rate": 7.069198012775018e-05, + "loss": 0.011482794582843781, + "step": 206560 + }, + { + "epoch": 29.32150461320085, + "grad_norm": 0.23197101056575775, + "learning_rate": 7.069056068133429e-05, + "loss": 0.01025947481393814, + "step": 206570 + }, + { + "epoch": 29.32292405961675, + "grad_norm": 0.027597175911068916, + "learning_rate": 7.068914123491839e-05, + "loss": 0.010907813161611556, + "step": 206580 + }, + { + "epoch": 29.324343506032648, + "grad_norm": 17.75995445251465, + "learning_rate": 7.06877217885025e-05, + "loss": 0.03195474743843078, + "step": 206590 + }, + { + "epoch": 29.325762952448546, + "grad_norm": 0.06022009998559952, + "learning_rate": 7.068630234208658e-05, + "loss": 0.0016454372555017472, + "step": 206600 + }, + { + "epoch": 29.327182398864444, + "grad_norm": 5.036454677581787, + "learning_rate": 7.068488289567069e-05, + "loss": 
0.03137316703796387, + "step": 206610 + }, + { + "epoch": 29.32860184528034, + "grad_norm": 0.14994248747825623, + "learning_rate": 7.068346344925479e-05, + "loss": 0.004964204877614975, + "step": 206620 + }, + { + "epoch": 29.330021291696237, + "grad_norm": 0.009216463193297386, + "learning_rate": 7.06820440028389e-05, + "loss": 0.017287929356098176, + "step": 206630 + }, + { + "epoch": 29.331440738112136, + "grad_norm": 0.03254463151097298, + "learning_rate": 7.0680624556423e-05, + "loss": 0.008409076929092407, + "step": 206640 + }, + { + "epoch": 29.332860184528034, + "grad_norm": 6.550024032592773, + "learning_rate": 7.06792051100071e-05, + "loss": 0.024315524101257324, + "step": 206650 + }, + { + "epoch": 29.334279630943932, + "grad_norm": 0.032407090067863464, + "learning_rate": 7.06777856635912e-05, + "loss": 0.001193404197692871, + "step": 206660 + }, + { + "epoch": 29.33569907735983, + "grad_norm": 0.032613396644592285, + "learning_rate": 7.06763662171753e-05, + "loss": 0.012750309705734254, + "step": 206670 + }, + { + "epoch": 29.33711852377573, + "grad_norm": 0.032559942454099655, + "learning_rate": 7.067494677075941e-05, + "loss": 0.011046990752220154, + "step": 206680 + }, + { + "epoch": 29.338537970191624, + "grad_norm": 11.506123542785645, + "learning_rate": 7.067352732434351e-05, + "loss": 0.00934581682085991, + "step": 206690 + }, + { + "epoch": 29.339957416607522, + "grad_norm": 0.9660173654556274, + "learning_rate": 7.067210787792761e-05, + "loss": 0.007860028743743896, + "step": 206700 + }, + { + "epoch": 29.34137686302342, + "grad_norm": 0.1605048030614853, + "learning_rate": 7.067068843151171e-05, + "loss": 0.003213459625840187, + "step": 206710 + }, + { + "epoch": 29.34279630943932, + "grad_norm": 0.04365316778421402, + "learning_rate": 7.066926898509582e-05, + "loss": 0.027396318316459656, + "step": 206720 + }, + { + "epoch": 29.344215755855217, + "grad_norm": 6.34977912902832, + "learning_rate": 7.066784953867992e-05, + "loss": 
0.01186293438076973, + "step": 206730 + }, + { + "epoch": 29.345635202271115, + "grad_norm": 2.3218204975128174, + "learning_rate": 7.066643009226403e-05, + "loss": 0.032149025797843934, + "step": 206740 + }, + { + "epoch": 29.347054648687013, + "grad_norm": 0.05894698575139046, + "learning_rate": 7.066501064584812e-05, + "loss": 0.03736211359500885, + "step": 206750 + }, + { + "epoch": 29.348474095102908, + "grad_norm": 0.01244346983730793, + "learning_rate": 7.066359119943222e-05, + "loss": 0.03391439318656921, + "step": 206760 + }, + { + "epoch": 29.349893541518806, + "grad_norm": 0.05077536776661873, + "learning_rate": 7.066217175301633e-05, + "loss": 0.0029471926391124725, + "step": 206770 + }, + { + "epoch": 29.351312987934705, + "grad_norm": 0.040277305990457535, + "learning_rate": 7.066075230660043e-05, + "loss": 0.04465804994106293, + "step": 206780 + }, + { + "epoch": 29.352732434350603, + "grad_norm": 0.007205401547253132, + "learning_rate": 7.065933286018454e-05, + "loss": 0.006827531754970551, + "step": 206790 + }, + { + "epoch": 29.3541518807665, + "grad_norm": 6.362082004547119, + "learning_rate": 7.065791341376862e-05, + "loss": 0.03246740996837616, + "step": 206800 + }, + { + "epoch": 29.3555713271824, + "grad_norm": 1.018240213394165, + "learning_rate": 7.065649396735274e-05, + "loss": 0.06519121527671815, + "step": 206810 + }, + { + "epoch": 29.356990773598298, + "grad_norm": 0.08913514018058777, + "learning_rate": 7.065507452093683e-05, + "loss": 0.005973305925726891, + "step": 206820 + }, + { + "epoch": 29.358410220014193, + "grad_norm": 0.08579272776842117, + "learning_rate": 7.065365507452094e-05, + "loss": 0.0032924115657806396, + "step": 206830 + }, + { + "epoch": 29.35982966643009, + "grad_norm": 0.12039094418287277, + "learning_rate": 7.065223562810504e-05, + "loss": 0.0016877718269824981, + "step": 206840 + }, + { + "epoch": 29.36124911284599, + "grad_norm": 5.136697769165039, + "learning_rate": 7.065081618168915e-05, + "loss": 
0.00986371487379074, + "step": 206850 + }, + { + "epoch": 29.362668559261888, + "grad_norm": 0.07523219287395477, + "learning_rate": 7.064939673527325e-05, + "loss": 0.006726563721895218, + "step": 206860 + }, + { + "epoch": 29.364088005677786, + "grad_norm": 0.057299163192510605, + "learning_rate": 7.064797728885735e-05, + "loss": 0.0013577602803707123, + "step": 206870 + }, + { + "epoch": 29.365507452093684, + "grad_norm": 1.766453504562378, + "learning_rate": 7.064655784244146e-05, + "loss": 0.03357782363891602, + "step": 206880 + }, + { + "epoch": 29.366926898509583, + "grad_norm": 7.942363262176514, + "learning_rate": 7.064513839602555e-05, + "loss": 0.032121342420578, + "step": 206890 + }, + { + "epoch": 29.368346344925477, + "grad_norm": 0.04284035786986351, + "learning_rate": 7.064371894960967e-05, + "loss": 0.023537811636924744, + "step": 206900 + }, + { + "epoch": 29.369765791341376, + "grad_norm": 0.0047437711618840694, + "learning_rate": 7.064229950319375e-05, + "loss": 0.023866380751132964, + "step": 206910 + }, + { + "epoch": 29.371185237757274, + "grad_norm": 1.941832423210144, + "learning_rate": 7.064088005677786e-05, + "loss": 0.002190626785159111, + "step": 206920 + }, + { + "epoch": 29.372604684173172, + "grad_norm": 1.889676570892334, + "learning_rate": 7.063946061036196e-05, + "loss": 0.008549542725086212, + "step": 206930 + }, + { + "epoch": 29.37402413058907, + "grad_norm": 0.06554765999317169, + "learning_rate": 7.063804116394607e-05, + "loss": 0.008473973721265793, + "step": 206940 + }, + { + "epoch": 29.37544357700497, + "grad_norm": 0.4954431653022766, + "learning_rate": 7.063662171753017e-05, + "loss": 0.051783818006515506, + "step": 206950 + }, + { + "epoch": 29.376863023420867, + "grad_norm": 0.08015116304159164, + "learning_rate": 7.063520227111426e-05, + "loss": 0.0158165842294693, + "step": 206960 + }, + { + "epoch": 29.378282469836762, + "grad_norm": 1.3325133323669434, + "learning_rate": 7.063378282469837e-05, + "loss": 
0.03605609834194183, + "step": 206970 + }, + { + "epoch": 29.37970191625266, + "grad_norm": 0.5866050720214844, + "learning_rate": 7.063236337828247e-05, + "loss": 0.004483730718493462, + "step": 206980 + }, + { + "epoch": 29.38112136266856, + "grad_norm": 0.055682629346847534, + "learning_rate": 7.063094393186658e-05, + "loss": 0.001834530383348465, + "step": 206990 + }, + { + "epoch": 29.382540809084457, + "grad_norm": 9.57794189453125, + "learning_rate": 7.062952448545068e-05, + "loss": 0.037946689128875735, + "step": 207000 + }, + { + "epoch": 29.382540809084457, + "eval_accuracy": 0.9871558466331786, + "eval_loss": 0.055358581244945526, + "eval_runtime": 34.8249, + "eval_samples_per_second": 451.602, + "eval_steps_per_second": 14.128, + "step": 207000 + }, + { + "epoch": 29.383960255500355, + "grad_norm": 0.5347028374671936, + "learning_rate": 7.062810503903478e-05, + "loss": 0.0035023897886276244, + "step": 207010 + }, + { + "epoch": 29.385379701916253, + "grad_norm": 0.017401598393917084, + "learning_rate": 7.062668559261887e-05, + "loss": 0.004852369800209999, + "step": 207020 + }, + { + "epoch": 29.386799148332152, + "grad_norm": 0.045942019671201706, + "learning_rate": 7.062526614620299e-05, + "loss": 0.012155733257532119, + "step": 207030 + }, + { + "epoch": 29.388218594748047, + "grad_norm": 4.170889854431152, + "learning_rate": 7.062384669978708e-05, + "loss": 0.022297139465808868, + "step": 207040 + }, + { + "epoch": 29.389638041163945, + "grad_norm": 0.09114756435155869, + "learning_rate": 7.06224272533712e-05, + "loss": 0.028968113660812377, + "step": 207050 + }, + { + "epoch": 29.391057487579843, + "grad_norm": 2.5694429874420166, + "learning_rate": 7.062100780695529e-05, + "loss": 0.02784644961357117, + "step": 207060 + }, + { + "epoch": 29.39247693399574, + "grad_norm": 0.101987324655056, + "learning_rate": 7.061958836053939e-05, + "loss": 0.008181928098201752, + "step": 207070 + }, + { + "epoch": 29.39389638041164, + "grad_norm": 
0.435624361038208, + "learning_rate": 7.06181689141235e-05, + "loss": 0.003020097687840462, + "step": 207080 + }, + { + "epoch": 29.395315826827538, + "grad_norm": 0.4973660707473755, + "learning_rate": 7.06167494677076e-05, + "loss": 0.002857193723320961, + "step": 207090 + }, + { + "epoch": 29.396735273243436, + "grad_norm": 0.13785980641841888, + "learning_rate": 7.061533002129171e-05, + "loss": 0.004714754223823547, + "step": 207100 + }, + { + "epoch": 29.39815471965933, + "grad_norm": 0.33820459246635437, + "learning_rate": 7.061391057487579e-05, + "loss": 0.0012860391288995743, + "step": 207110 + }, + { + "epoch": 29.39957416607523, + "grad_norm": 0.010108851827681065, + "learning_rate": 7.06124911284599e-05, + "loss": 0.052089476585388185, + "step": 207120 + }, + { + "epoch": 29.400993612491128, + "grad_norm": 0.0608946867287159, + "learning_rate": 7.0611071682044e-05, + "loss": 0.0062202602624893185, + "step": 207130 + }, + { + "epoch": 29.402413058907026, + "grad_norm": 0.007224023807793856, + "learning_rate": 7.060965223562811e-05, + "loss": 0.013519644737243652, + "step": 207140 + }, + { + "epoch": 29.403832505322924, + "grad_norm": 1.726156234741211, + "learning_rate": 7.060823278921221e-05, + "loss": 0.02125403881072998, + "step": 207150 + }, + { + "epoch": 29.405251951738823, + "grad_norm": 0.40045469999313354, + "learning_rate": 7.06068133427963e-05, + "loss": 0.038829609751701355, + "step": 207160 + }, + { + "epoch": 29.40667139815472, + "grad_norm": 0.9645289182662964, + "learning_rate": 7.060539389638042e-05, + "loss": 0.019781768321990967, + "step": 207170 + }, + { + "epoch": 29.408090844570616, + "grad_norm": 0.26502883434295654, + "learning_rate": 7.060397444996451e-05, + "loss": 0.010779917985200883, + "step": 207180 + }, + { + "epoch": 29.409510290986514, + "grad_norm": 0.012740222737193108, + "learning_rate": 7.060255500354863e-05, + "loss": 0.0011323921382427216, + "step": 207190 + }, + { + "epoch": 29.410929737402412, + "grad_norm": 
15.784189224243164, + "learning_rate": 7.060113555713272e-05, + "loss": 0.04365306794643402, + "step": 207200 + }, + { + "epoch": 29.41234918381831, + "grad_norm": 0.5272105932235718, + "learning_rate": 7.059971611071683e-05, + "loss": 0.006255033612251282, + "step": 207210 + }, + { + "epoch": 29.41376863023421, + "grad_norm": 0.2925769090652466, + "learning_rate": 7.059829666430092e-05, + "loss": 0.002010131627321243, + "step": 207220 + }, + { + "epoch": 29.415188076650107, + "grad_norm": 0.007859217002987862, + "learning_rate": 7.059687721788503e-05, + "loss": 0.0031325660645961763, + "step": 207230 + }, + { + "epoch": 29.416607523066006, + "grad_norm": 0.04791152477264404, + "learning_rate": 7.059559971611071e-05, + "loss": 0.01274888515472412, + "step": 207240 + }, + { + "epoch": 29.4180269694819, + "grad_norm": 0.07124443352222443, + "learning_rate": 7.059418026969482e-05, + "loss": 0.02101006209850311, + "step": 207250 + }, + { + "epoch": 29.4194464158978, + "grad_norm": 12.356841087341309, + "learning_rate": 7.059276082327892e-05, + "loss": 0.06222732663154602, + "step": 207260 + }, + { + "epoch": 29.420865862313697, + "grad_norm": 0.008840433321893215, + "learning_rate": 7.059134137686303e-05, + "loss": 0.01817548871040344, + "step": 207270 + }, + { + "epoch": 29.422285308729595, + "grad_norm": 4.947365760803223, + "learning_rate": 7.058992193044713e-05, + "loss": 0.00868762880563736, + "step": 207280 + }, + { + "epoch": 29.423704755145494, + "grad_norm": 11.73182201385498, + "learning_rate": 7.058850248403123e-05, + "loss": 0.016803601384162904, + "step": 207290 + }, + { + "epoch": 29.425124201561392, + "grad_norm": 6.883684158325195, + "learning_rate": 7.058708303761534e-05, + "loss": 0.011293884366750717, + "step": 207300 + }, + { + "epoch": 29.42654364797729, + "grad_norm": 0.18785050511360168, + "learning_rate": 7.058566359119944e-05, + "loss": 0.0060439582914114, + "step": 207310 + }, + { + "epoch": 29.427963094393185, + "grad_norm": 
0.08103861659765244, + "learning_rate": 7.058424414478355e-05, + "loss": 0.0069694884121418, + "step": 207320 + }, + { + "epoch": 29.429382540809083, + "grad_norm": 0.0545981340110302, + "learning_rate": 7.058282469836764e-05, + "loss": 0.005035455897450447, + "step": 207330 + }, + { + "epoch": 29.43080198722498, + "grad_norm": 2.247483730316162, + "learning_rate": 7.058140525195174e-05, + "loss": 0.02363567054271698, + "step": 207340 + }, + { + "epoch": 29.43222143364088, + "grad_norm": 1.6828631162643433, + "learning_rate": 7.057998580553584e-05, + "loss": 0.008231790363788604, + "step": 207350 + }, + { + "epoch": 29.433640880056778, + "grad_norm": 0.6441678404808044, + "learning_rate": 7.057856635911995e-05, + "loss": 0.006405394524335861, + "step": 207360 + }, + { + "epoch": 29.435060326472676, + "grad_norm": 10.112329483032227, + "learning_rate": 7.057714691270405e-05, + "loss": 0.01870162636041641, + "step": 207370 + }, + { + "epoch": 29.436479772888575, + "grad_norm": 0.1270720213651657, + "learning_rate": 7.057572746628816e-05, + "loss": 0.01767708659172058, + "step": 207380 + }, + { + "epoch": 29.43789921930447, + "grad_norm": 0.09709369391202927, + "learning_rate": 7.057430801987225e-05, + "loss": 0.014326515793800353, + "step": 207390 + }, + { + "epoch": 29.439318665720368, + "grad_norm": 7.068256378173828, + "learning_rate": 7.057288857345635e-05, + "loss": 0.03163247108459473, + "step": 207400 + }, + { + "epoch": 29.440738112136266, + "grad_norm": 0.2543978691101074, + "learning_rate": 7.057146912704046e-05, + "loss": 0.011107003688812256, + "step": 207410 + }, + { + "epoch": 29.442157558552164, + "grad_norm": 0.19930283725261688, + "learning_rate": 7.057004968062456e-05, + "loss": 0.002790645509958267, + "step": 207420 + }, + { + "epoch": 29.443577004968063, + "grad_norm": 0.015351482667028904, + "learning_rate": 7.056863023420867e-05, + "loss": 0.08950978517532349, + "step": 207430 + }, + { + "epoch": 29.44499645138396, + "grad_norm": 
0.018879149109125137, + "learning_rate": 7.056721078779276e-05, + "loss": 0.028763386607170104, + "step": 207440 + }, + { + "epoch": 29.44641589779986, + "grad_norm": 0.04124530404806137, + "learning_rate": 7.056579134137687e-05, + "loss": 0.01323392391204834, + "step": 207450 + }, + { + "epoch": 29.447835344215754, + "grad_norm": 2.036367893218994, + "learning_rate": 7.056437189496096e-05, + "loss": 0.011055976152420044, + "step": 207460 + }, + { + "epoch": 29.449254790631652, + "grad_norm": 0.0517902635037899, + "learning_rate": 7.056295244854507e-05, + "loss": 0.0013077601790428162, + "step": 207470 + }, + { + "epoch": 29.45067423704755, + "grad_norm": 0.1412457823753357, + "learning_rate": 7.056153300212917e-05, + "loss": 0.007746271789073944, + "step": 207480 + }, + { + "epoch": 29.45209368346345, + "grad_norm": 2.1197640895843506, + "learning_rate": 7.056011355571327e-05, + "loss": 0.004479076340794563, + "step": 207490 + }, + { + "epoch": 29.453513129879347, + "grad_norm": 0.022078821435570717, + "learning_rate": 7.055869410929738e-05, + "loss": 0.020055294036865234, + "step": 207500 + }, + { + "epoch": 29.453513129879347, + "eval_accuracy": 0.9885547148216443, + "eval_loss": 0.04507309943437576, + "eval_runtime": 32.3087, + "eval_samples_per_second": 486.773, + "eval_steps_per_second": 15.228, + "step": 207500 + }, + { + "epoch": 29.454932576295246, + "grad_norm": 5.957076072692871, + "learning_rate": 7.055727466288148e-05, + "loss": 0.01099892258644104, + "step": 207510 + }, + { + "epoch": 29.456352022711144, + "grad_norm": 0.3243045210838318, + "learning_rate": 7.055585521646559e-05, + "loss": 0.025699955224990845, + "step": 207520 + }, + { + "epoch": 29.45777146912704, + "grad_norm": 0.016812700778245926, + "learning_rate": 7.055443577004969e-05, + "loss": 0.002168106287717819, + "step": 207530 + }, + { + "epoch": 29.459190915542937, + "grad_norm": 0.12417203187942505, + "learning_rate": 7.05530163236338e-05, + "loss": 0.0031671524047851562, + "step": 
207540 + }, + { + "epoch": 29.460610361958835, + "grad_norm": 0.5885121822357178, + "learning_rate": 7.055159687721788e-05, + "loss": 0.001204390823841095, + "step": 207550 + }, + { + "epoch": 29.462029808374734, + "grad_norm": 0.0889643132686615, + "learning_rate": 7.055017743080199e-05, + "loss": 0.009354464709758759, + "step": 207560 + }, + { + "epoch": 29.463449254790632, + "grad_norm": 0.018797585740685463, + "learning_rate": 7.054875798438609e-05, + "loss": 0.02751462459564209, + "step": 207570 + }, + { + "epoch": 29.46486870120653, + "grad_norm": 0.02053203620016575, + "learning_rate": 7.05473385379702e-05, + "loss": 0.010246165096759796, + "step": 207580 + }, + { + "epoch": 29.46628814762243, + "grad_norm": 1.7970507144927979, + "learning_rate": 7.05459190915543e-05, + "loss": 0.009696543216705322, + "step": 207590 + }, + { + "epoch": 29.467707594038323, + "grad_norm": 0.06337790191173553, + "learning_rate": 7.05444996451384e-05, + "loss": 0.031008327007293703, + "step": 207600 + }, + { + "epoch": 29.46912704045422, + "grad_norm": 9.829461097717285, + "learning_rate": 7.05430801987225e-05, + "loss": 0.04555315375328064, + "step": 207610 + }, + { + "epoch": 29.47054648687012, + "grad_norm": 0.08556224405765533, + "learning_rate": 7.05416607523066e-05, + "loss": 0.0008712131530046463, + "step": 207620 + }, + { + "epoch": 29.471965933286018, + "grad_norm": 0.043423138558864594, + "learning_rate": 7.054024130589071e-05, + "loss": 0.01525944471359253, + "step": 207630 + }, + { + "epoch": 29.473385379701917, + "grad_norm": 1.0372881889343262, + "learning_rate": 7.053882185947481e-05, + "loss": 0.0022549033164978027, + "step": 207640 + }, + { + "epoch": 29.474804826117815, + "grad_norm": 1.955945372581482, + "learning_rate": 7.053740241305891e-05, + "loss": 0.0019386846572160721, + "step": 207650 + }, + { + "epoch": 29.476224272533713, + "grad_norm": 0.2896319031715393, + "learning_rate": 7.0535982966643e-05, + "loss": 0.005782990157604218, + "step": 207660 + }, + 
{ + "epoch": 29.477643718949608, + "grad_norm": 0.43979528546333313, + "learning_rate": 7.053456352022712e-05, + "loss": 0.006328025460243225, + "step": 207670 + }, + { + "epoch": 29.479063165365506, + "grad_norm": 4.684175491333008, + "learning_rate": 7.053314407381121e-05, + "loss": 0.03429814875125885, + "step": 207680 + }, + { + "epoch": 29.480482611781405, + "grad_norm": 0.04637220874428749, + "learning_rate": 7.053172462739533e-05, + "loss": 0.002201828733086586, + "step": 207690 + }, + { + "epoch": 29.481902058197303, + "grad_norm": 0.01632952131330967, + "learning_rate": 7.053030518097942e-05, + "loss": 0.008470715582370758, + "step": 207700 + }, + { + "epoch": 29.4833215046132, + "grad_norm": 0.31423118710517883, + "learning_rate": 7.052888573456352e-05, + "loss": 0.0009234648197889328, + "step": 207710 + }, + { + "epoch": 29.4847409510291, + "grad_norm": 0.006450807210057974, + "learning_rate": 7.052746628814763e-05, + "loss": 0.010138851404190064, + "step": 207720 + }, + { + "epoch": 29.486160397444998, + "grad_norm": 0.03514031693339348, + "learning_rate": 7.052604684173173e-05, + "loss": 0.00773688405752182, + "step": 207730 + }, + { + "epoch": 29.487579843860892, + "grad_norm": 0.002886669710278511, + "learning_rate": 7.052462739531584e-05, + "loss": 0.004109704121947289, + "step": 207740 + }, + { + "epoch": 29.48899929027679, + "grad_norm": 0.013425658456981182, + "learning_rate": 7.052320794889992e-05, + "loss": 0.002970973029732704, + "step": 207750 + }, + { + "epoch": 29.49041873669269, + "grad_norm": 5.19477653503418, + "learning_rate": 7.052178850248403e-05, + "loss": 0.009461761265993119, + "step": 207760 + }, + { + "epoch": 29.491838183108587, + "grad_norm": 2.3899848461151123, + "learning_rate": 7.052036905606813e-05, + "loss": 0.020083563029766084, + "step": 207770 + }, + { + "epoch": 29.493257629524486, + "grad_norm": 3.507599115371704, + "learning_rate": 7.051894960965224e-05, + "loss": 0.018284621834754943, + "step": 207780 + }, + { + 
"epoch": 29.494677075940384, + "grad_norm": 0.04308437556028366, + "learning_rate": 7.051753016323634e-05, + "loss": 0.013885025680065156, + "step": 207790 + }, + { + "epoch": 29.496096522356282, + "grad_norm": 0.014941342175006866, + "learning_rate": 7.051611071682044e-05, + "loss": 0.00392661914229393, + "step": 207800 + }, + { + "epoch": 29.497515968772177, + "grad_norm": 7.3909173011779785, + "learning_rate": 7.051469127040455e-05, + "loss": 0.012020472437143326, + "step": 207810 + }, + { + "epoch": 29.498935415188075, + "grad_norm": 0.08073733747005463, + "learning_rate": 7.051327182398865e-05, + "loss": 0.010773827880620956, + "step": 207820 + }, + { + "epoch": 29.500354861603974, + "grad_norm": 10.379618644714355, + "learning_rate": 7.051185237757276e-05, + "loss": 0.015787975490093233, + "step": 207830 + }, + { + "epoch": 29.501774308019872, + "grad_norm": 0.0026349599938839674, + "learning_rate": 7.051043293115685e-05, + "loss": 0.015194141864776611, + "step": 207840 + }, + { + "epoch": 29.50319375443577, + "grad_norm": 0.47407883405685425, + "learning_rate": 7.050901348474095e-05, + "loss": 0.004409328103065491, + "step": 207850 + }, + { + "epoch": 29.50461320085167, + "grad_norm": 0.34455084800720215, + "learning_rate": 7.050759403832505e-05, + "loss": 0.005682982504367828, + "step": 207860 + }, + { + "epoch": 29.506032647267567, + "grad_norm": 0.07422154396772385, + "learning_rate": 7.050617459190916e-05, + "loss": 0.04049008786678314, + "step": 207870 + }, + { + "epoch": 29.50745209368346, + "grad_norm": 2.90767765045166, + "learning_rate": 7.050475514549326e-05, + "loss": 0.005877177789807319, + "step": 207880 + }, + { + "epoch": 29.50887154009936, + "grad_norm": 0.09375117719173431, + "learning_rate": 7.050333569907737e-05, + "loss": 0.026822158694267274, + "step": 207890 + }, + { + "epoch": 29.51029098651526, + "grad_norm": 0.08490375429391861, + "learning_rate": 7.050191625266147e-05, + "loss": 0.013622145354747772, + "step": 207900 + }, + { + 
"epoch": 29.511710432931157, + "grad_norm": 3.9255616664886475, + "learning_rate": 7.050049680624556e-05, + "loss": 0.012689261138439179, + "step": 207910 + }, + { + "epoch": 29.513129879347055, + "grad_norm": 6.728745460510254, + "learning_rate": 7.049907735982967e-05, + "loss": 0.006310088187456131, + "step": 207920 + }, + { + "epoch": 29.514549325762953, + "grad_norm": 0.05335655063390732, + "learning_rate": 7.049765791341377e-05, + "loss": 0.017458222806453705, + "step": 207930 + }, + { + "epoch": 29.51596877217885, + "grad_norm": 0.09485534578561783, + "learning_rate": 7.049623846699788e-05, + "loss": 0.00457368902862072, + "step": 207940 + }, + { + "epoch": 29.517388218594746, + "grad_norm": 0.0961996465921402, + "learning_rate": 7.049481902058198e-05, + "loss": 0.002969733253121376, + "step": 207950 + }, + { + "epoch": 29.518807665010645, + "grad_norm": 0.02200569398701191, + "learning_rate": 7.049339957416608e-05, + "loss": 0.024971652030944824, + "step": 207960 + }, + { + "epoch": 29.520227111426543, + "grad_norm": 0.009184109978377819, + "learning_rate": 7.049198012775017e-05, + "loss": 0.00218205526471138, + "step": 207970 + }, + { + "epoch": 29.52164655784244, + "grad_norm": 4.01493501663208, + "learning_rate": 7.049056068133428e-05, + "loss": 0.01424625664949417, + "step": 207980 + }, + { + "epoch": 29.52306600425834, + "grad_norm": 0.7110947370529175, + "learning_rate": 7.048914123491838e-05, + "loss": 0.0007833488285541535, + "step": 207990 + }, + { + "epoch": 29.524485450674238, + "grad_norm": 0.02012433484196663, + "learning_rate": 7.048772178850249e-05, + "loss": 0.00396137572824955, + "step": 208000 + }, + { + "epoch": 29.524485450674238, + "eval_accuracy": 0.9879824505627265, + "eval_loss": 0.04906020313501358, + "eval_runtime": 30.9847, + "eval_samples_per_second": 507.574, + "eval_steps_per_second": 15.879, + "step": 208000 + }, + { + "epoch": 29.525904897090136, + "grad_norm": 0.8171433210372925, + "learning_rate": 7.048630234208659e-05, + 
"loss": 0.009226363897323609, + "step": 208010 + }, + { + "epoch": 29.52732434350603, + "grad_norm": 16.739763259887695, + "learning_rate": 7.048488289567069e-05, + "loss": 0.06825363636016846, + "step": 208020 + }, + { + "epoch": 29.52874378992193, + "grad_norm": 0.043713267892599106, + "learning_rate": 7.04834634492548e-05, + "loss": 0.04245122075080872, + "step": 208030 + }, + { + "epoch": 29.530163236337827, + "grad_norm": 0.14865319430828094, + "learning_rate": 7.04820440028389e-05, + "loss": 0.012282351404428482, + "step": 208040 + }, + { + "epoch": 29.531582682753726, + "grad_norm": 0.25177642703056335, + "learning_rate": 7.048062455642301e-05, + "loss": 0.0026421390473842623, + "step": 208050 + }, + { + "epoch": 29.533002129169624, + "grad_norm": 1.8172814846038818, + "learning_rate": 7.047920511000709e-05, + "loss": 0.0065997250378131865, + "step": 208060 + }, + { + "epoch": 29.534421575585522, + "grad_norm": 10.152331352233887, + "learning_rate": 7.04777856635912e-05, + "loss": 0.01830917000770569, + "step": 208070 + }, + { + "epoch": 29.53584102200142, + "grad_norm": 0.05693427100777626, + "learning_rate": 7.04763662171753e-05, + "loss": 0.02759735882282257, + "step": 208080 + }, + { + "epoch": 29.537260468417315, + "grad_norm": 1.5811372995376587, + "learning_rate": 7.047494677075941e-05, + "loss": 0.010139301419258118, + "step": 208090 + }, + { + "epoch": 29.538679914833214, + "grad_norm": 10.705254554748535, + "learning_rate": 7.047352732434351e-05, + "loss": 0.034941470623016356, + "step": 208100 + }, + { + "epoch": 29.540099361249112, + "grad_norm": 0.30637726187705994, + "learning_rate": 7.04721078779276e-05, + "loss": 0.0025851037353277206, + "step": 208110 + }, + { + "epoch": 29.54151880766501, + "grad_norm": 0.3187023997306824, + "learning_rate": 7.047068843151172e-05, + "loss": 0.04892149567604065, + "step": 208120 + }, + { + "epoch": 29.54293825408091, + "grad_norm": 5.719927787780762, + "learning_rate": 7.046926898509581e-05, + "loss": 
0.007777091860771179, + "step": 208130 + }, + { + "epoch": 29.544357700496807, + "grad_norm": 1.112587332725525, + "learning_rate": 7.046784953867992e-05, + "loss": 0.014365059137344361, + "step": 208140 + }, + { + "epoch": 29.545777146912705, + "grad_norm": 1.2128536701202393, + "learning_rate": 7.046643009226402e-05, + "loss": 0.0014048945158720016, + "step": 208150 + }, + { + "epoch": 29.5471965933286, + "grad_norm": 0.14399656653404236, + "learning_rate": 7.046501064584812e-05, + "loss": 0.005722270905971527, + "step": 208160 + }, + { + "epoch": 29.5486160397445, + "grad_norm": 0.6469829678535461, + "learning_rate": 7.046359119943222e-05, + "loss": 0.014530965685844421, + "step": 208170 + }, + { + "epoch": 29.550035486160397, + "grad_norm": 0.8332312703132629, + "learning_rate": 7.046217175301633e-05, + "loss": 0.011495922505855561, + "step": 208180 + }, + { + "epoch": 29.551454932576295, + "grad_norm": 1.0112602710723877, + "learning_rate": 7.046075230660042e-05, + "loss": 0.004384545609354973, + "step": 208190 + }, + { + "epoch": 29.552874378992193, + "grad_norm": 0.032812006771564484, + "learning_rate": 7.045933286018454e-05, + "loss": 0.01060132086277008, + "step": 208200 + }, + { + "epoch": 29.55429382540809, + "grad_norm": 1.7734308242797852, + "learning_rate": 7.045791341376863e-05, + "loss": 0.005113392323255539, + "step": 208210 + }, + { + "epoch": 29.55571327182399, + "grad_norm": 0.25011181831359863, + "learning_rate": 7.045649396735273e-05, + "loss": 0.0024748574942350388, + "step": 208220 + }, + { + "epoch": 29.557132718239885, + "grad_norm": 7.156906604766846, + "learning_rate": 7.045507452093684e-05, + "loss": 0.004167269915342331, + "step": 208230 + }, + { + "epoch": 29.558552164655783, + "grad_norm": 2.7445216178894043, + "learning_rate": 7.045365507452094e-05, + "loss": 0.01515558809041977, + "step": 208240 + }, + { + "epoch": 29.55997161107168, + "grad_norm": 0.44155505299568176, + "learning_rate": 7.045223562810505e-05, + "loss": 
0.007102020084857941, + "step": 208250 + }, + { + "epoch": 29.56139105748758, + "grad_norm": 0.48439744114875793, + "learning_rate": 7.045081618168915e-05, + "loss": 0.016608348488807677, + "step": 208260 + }, + { + "epoch": 29.562810503903478, + "grad_norm": 3.131302833557129, + "learning_rate": 7.044939673527324e-05, + "loss": 0.014863318204879761, + "step": 208270 + }, + { + "epoch": 29.564229950319376, + "grad_norm": 0.2976144254207611, + "learning_rate": 7.044797728885734e-05, + "loss": 0.011309333890676499, + "step": 208280 + }, + { + "epoch": 29.565649396735274, + "grad_norm": 0.01057754922658205, + "learning_rate": 7.044655784244145e-05, + "loss": 0.006054006516933441, + "step": 208290 + }, + { + "epoch": 29.56706884315117, + "grad_norm": 0.8902459740638733, + "learning_rate": 7.044513839602555e-05, + "loss": 0.05157136917114258, + "step": 208300 + }, + { + "epoch": 29.568488289567068, + "grad_norm": 17.06734275817871, + "learning_rate": 7.044371894960966e-05, + "loss": 0.027499192953109743, + "step": 208310 + }, + { + "epoch": 29.569907735982966, + "grad_norm": 0.49059608578681946, + "learning_rate": 7.044229950319376e-05, + "loss": 0.02154984325170517, + "step": 208320 + }, + { + "epoch": 29.571327182398864, + "grad_norm": 8.080525398254395, + "learning_rate": 7.044088005677786e-05, + "loss": 0.01359315812587738, + "step": 208330 + }, + { + "epoch": 29.572746628814762, + "grad_norm": 0.03249148651957512, + "learning_rate": 7.043946061036197e-05, + "loss": 0.00732974112033844, + "step": 208340 + }, + { + "epoch": 29.57416607523066, + "grad_norm": 0.44115912914276123, + "learning_rate": 7.043804116394606e-05, + "loss": 0.015574277937412262, + "step": 208350 + }, + { + "epoch": 29.57558552164656, + "grad_norm": 0.054017435759305954, + "learning_rate": 7.043662171753017e-05, + "loss": 0.024350492656230925, + "step": 208360 + }, + { + "epoch": 29.577004968062454, + "grad_norm": 1.5436614751815796, + "learning_rate": 7.043520227111426e-05, + "loss": 
0.026005351543426515, + "step": 208370 + }, + { + "epoch": 29.578424414478352, + "grad_norm": 0.49544036388397217, + "learning_rate": 7.043378282469837e-05, + "loss": 0.04969885349273682, + "step": 208380 + }, + { + "epoch": 29.57984386089425, + "grad_norm": 1.050898790359497, + "learning_rate": 7.043236337828247e-05, + "loss": 0.016392117738723753, + "step": 208390 + }, + { + "epoch": 29.58126330731015, + "grad_norm": 17.052169799804688, + "learning_rate": 7.043094393186658e-05, + "loss": 0.04376031160354614, + "step": 208400 + }, + { + "epoch": 29.582682753726047, + "grad_norm": 11.823457717895508, + "learning_rate": 7.042952448545068e-05, + "loss": 0.016380494832992552, + "step": 208410 + }, + { + "epoch": 29.584102200141945, + "grad_norm": 0.26832714676856995, + "learning_rate": 7.042810503903477e-05, + "loss": 0.024531126022338867, + "step": 208420 + }, + { + "epoch": 29.585521646557844, + "grad_norm": 4.602287292480469, + "learning_rate": 7.042668559261888e-05, + "loss": 0.002498789131641388, + "step": 208430 + }, + { + "epoch": 29.58694109297374, + "grad_norm": 0.01515783928334713, + "learning_rate": 7.042526614620298e-05, + "loss": 0.007285246998071671, + "step": 208440 + }, + { + "epoch": 29.588360539389637, + "grad_norm": 0.03890039399266243, + "learning_rate": 7.042384669978709e-05, + "loss": 0.01828044354915619, + "step": 208450 + }, + { + "epoch": 29.589779985805535, + "grad_norm": 0.3353753983974457, + "learning_rate": 7.042242725337119e-05, + "loss": 0.028917086124420167, + "step": 208460 + }, + { + "epoch": 29.591199432221433, + "grad_norm": 1.7283716201782227, + "learning_rate": 7.042100780695529e-05, + "loss": 0.043337056040763856, + "step": 208470 + }, + { + "epoch": 29.59261887863733, + "grad_norm": 0.6961215138435364, + "learning_rate": 7.041958836053938e-05, + "loss": 0.02684147357940674, + "step": 208480 + }, + { + "epoch": 29.59403832505323, + "grad_norm": 1.633212924003601, + "learning_rate": 7.04181689141235e-05, + "loss": 
0.011064384877681733, + "step": 208490 + }, + { + "epoch": 29.59545777146913, + "grad_norm": 0.3913591206073761, + "learning_rate": 7.041674946770759e-05, + "loss": 0.03477397561073303, + "step": 208500 + }, + { + "epoch": 29.59545777146913, + "eval_accuracy": 0.9852482991034527, + "eval_loss": 0.05446361005306244, + "eval_runtime": 31.4954, + "eval_samples_per_second": 499.343, + "eval_steps_per_second": 15.621, + "step": 208500 + }, + { + "epoch": 29.596877217885023, + "grad_norm": 0.7452126741409302, + "learning_rate": 7.04153300212917e-05, + "loss": 0.01313551515340805, + "step": 208510 + }, + { + "epoch": 29.59829666430092, + "grad_norm": 0.6042484045028687, + "learning_rate": 7.04139105748758e-05, + "loss": 0.008436571061611175, + "step": 208520 + }, + { + "epoch": 29.59971611071682, + "grad_norm": 4.191666126251221, + "learning_rate": 7.04124911284599e-05, + "loss": 0.009888374805450439, + "step": 208530 + }, + { + "epoch": 29.601135557132718, + "grad_norm": 0.3080088198184967, + "learning_rate": 7.041107168204401e-05, + "loss": 0.007224229723215103, + "step": 208540 + }, + { + "epoch": 29.602555003548616, + "grad_norm": 0.02807941660284996, + "learning_rate": 7.04096522356281e-05, + "loss": 0.001141386479139328, + "step": 208550 + }, + { + "epoch": 29.603974449964515, + "grad_norm": 1.1252931356430054, + "learning_rate": 7.040823278921222e-05, + "loss": 0.016075532138347625, + "step": 208560 + }, + { + "epoch": 29.605393896380413, + "grad_norm": 0.5136713981628418, + "learning_rate": 7.040681334279631e-05, + "loss": 0.05346091985702515, + "step": 208570 + }, + { + "epoch": 29.606813342796308, + "grad_norm": 0.04487874358892441, + "learning_rate": 7.040539389638041e-05, + "loss": 0.0019524358212947845, + "step": 208580 + }, + { + "epoch": 29.608232789212206, + "grad_norm": 0.00861656665802002, + "learning_rate": 7.040397444996451e-05, + "loss": 0.029509967565536498, + "step": 208590 + }, + { + "epoch": 29.609652235628104, + "grad_norm": 9.650275230407715, + 
"learning_rate": 7.040255500354862e-05, + "loss": 0.014944563806056976, + "step": 208600 + }, + { + "epoch": 29.611071682044003, + "grad_norm": 11.063875198364258, + "learning_rate": 7.040113555713273e-05, + "loss": 0.030073022842407225, + "step": 208610 + }, + { + "epoch": 29.6124911284599, + "grad_norm": 0.07659758627414703, + "learning_rate": 7.039971611071683e-05, + "loss": 0.013931533694267273, + "step": 208620 + }, + { + "epoch": 29.6139105748758, + "grad_norm": 0.8316934704780579, + "learning_rate": 7.039829666430093e-05, + "loss": 0.009550350904464721, + "step": 208630 + }, + { + "epoch": 29.615330021291697, + "grad_norm": 1.3273394107818604, + "learning_rate": 7.039687721788502e-05, + "loss": 0.016545410454273223, + "step": 208640 + }, + { + "epoch": 29.616749467707596, + "grad_norm": 0.7643616199493408, + "learning_rate": 7.039545777146913e-05, + "loss": 0.01775372326374054, + "step": 208650 + }, + { + "epoch": 29.61816891412349, + "grad_norm": 0.059268299490213394, + "learning_rate": 7.039403832505323e-05, + "loss": 0.009915129840373993, + "step": 208660 + }, + { + "epoch": 29.61958836053939, + "grad_norm": 21.977678298950195, + "learning_rate": 7.039261887863734e-05, + "loss": 0.03129419684410095, + "step": 208670 + }, + { + "epoch": 29.621007806955287, + "grad_norm": 0.5069807767868042, + "learning_rate": 7.039119943222143e-05, + "loss": 0.0012141138315200806, + "step": 208680 + }, + { + "epoch": 29.622427253371185, + "grad_norm": 1.2897690534591675, + "learning_rate": 7.038977998580554e-05, + "loss": 0.02401934713125229, + "step": 208690 + }, + { + "epoch": 29.623846699787084, + "grad_norm": 3.818894863128662, + "learning_rate": 7.038836053938965e-05, + "loss": 0.018886232376098634, + "step": 208700 + }, + { + "epoch": 29.625266146202982, + "grad_norm": 0.6808741688728333, + "learning_rate": 7.038694109297375e-05, + "loss": 0.005103215947747231, + "step": 208710 + }, + { + "epoch": 29.62668559261888, + "grad_norm": 0.23045605421066284, + 
"learning_rate": 7.038552164655786e-05, + "loss": 0.019849085807800294, + "step": 208720 + }, + { + "epoch": 29.628105039034775, + "grad_norm": 2.845106601715088, + "learning_rate": 7.038410220014194e-05, + "loss": 0.017224286496639252, + "step": 208730 + }, + { + "epoch": 29.629524485450673, + "grad_norm": 0.0687076523900032, + "learning_rate": 7.038268275372605e-05, + "loss": 0.006759358942508698, + "step": 208740 + }, + { + "epoch": 29.63094393186657, + "grad_norm": 0.03565913066267967, + "learning_rate": 7.038126330731015e-05, + "loss": 0.01460828334093094, + "step": 208750 + }, + { + "epoch": 29.63236337828247, + "grad_norm": 0.03706299886107445, + "learning_rate": 7.037984386089426e-05, + "loss": 0.006172128766775131, + "step": 208760 + }, + { + "epoch": 29.63378282469837, + "grad_norm": 0.11024293303489685, + "learning_rate": 7.037842441447836e-05, + "loss": 0.009482118487358093, + "step": 208770 + }, + { + "epoch": 29.635202271114267, + "grad_norm": 0.24113775789737701, + "learning_rate": 7.037700496806245e-05, + "loss": 0.0010822061449289322, + "step": 208780 + }, + { + "epoch": 29.636621717530165, + "grad_norm": 15.45003604888916, + "learning_rate": 7.037558552164657e-05, + "loss": 0.008089704811573029, + "step": 208790 + }, + { + "epoch": 29.63804116394606, + "grad_norm": 0.26761844754219055, + "learning_rate": 7.037416607523066e-05, + "loss": 0.001810285449028015, + "step": 208800 + }, + { + "epoch": 29.639460610361958, + "grad_norm": 0.07736538350582123, + "learning_rate": 7.037274662881477e-05, + "loss": 0.04090999662876129, + "step": 208810 + }, + { + "epoch": 29.640880056777856, + "grad_norm": 0.015408860519528389, + "learning_rate": 7.037132718239887e-05, + "loss": 0.007116435468196869, + "step": 208820 + }, + { + "epoch": 29.642299503193755, + "grad_norm": 2.170957088470459, + "learning_rate": 7.036990773598297e-05, + "loss": 0.00902838259935379, + "step": 208830 + }, + { + "epoch": 29.643718949609653, + "grad_norm": 0.24883918464183807, + 
"learning_rate": 7.036848828956707e-05, + "loss": 0.02464032918214798, + "step": 208840 + }, + { + "epoch": 29.64513839602555, + "grad_norm": 0.061892345547676086, + "learning_rate": 7.036706884315118e-05, + "loss": 0.0037682078778743743, + "step": 208850 + }, + { + "epoch": 29.64655784244145, + "grad_norm": 0.04264070838689804, + "learning_rate": 7.036564939673527e-05, + "loss": 0.0009153746068477631, + "step": 208860 + }, + { + "epoch": 29.647977288857344, + "grad_norm": 2.5950746536254883, + "learning_rate": 7.036422995031938e-05, + "loss": 0.03214665949344635, + "step": 208870 + }, + { + "epoch": 29.649396735273243, + "grad_norm": 0.027431290596723557, + "learning_rate": 7.036281050390348e-05, + "loss": 0.009576027095317841, + "step": 208880 + }, + { + "epoch": 29.65081618168914, + "grad_norm": 2.445262908935547, + "learning_rate": 7.036139105748758e-05, + "loss": 0.01462910920381546, + "step": 208890 + }, + { + "epoch": 29.65223562810504, + "grad_norm": 0.31247493624687195, + "learning_rate": 7.035997161107169e-05, + "loss": 0.004024193435907364, + "step": 208900 + }, + { + "epoch": 29.653655074520938, + "grad_norm": 0.18434490263462067, + "learning_rate": 7.035855216465579e-05, + "loss": 0.011388210207223892, + "step": 208910 + }, + { + "epoch": 29.655074520936836, + "grad_norm": 0.2533305883407593, + "learning_rate": 7.03571327182399e-05, + "loss": 0.032611867785453795, + "step": 208920 + }, + { + "epoch": 29.656493967352734, + "grad_norm": 0.4420749247074127, + "learning_rate": 7.0355713271824e-05, + "loss": 0.02737516462802887, + "step": 208930 + }, + { + "epoch": 29.65791341376863, + "grad_norm": 3.088407278060913, + "learning_rate": 7.03542938254081e-05, + "loss": 0.02890838384628296, + "step": 208940 + }, + { + "epoch": 29.659332860184527, + "grad_norm": 0.2065955251455307, + "learning_rate": 7.035287437899219e-05, + "loss": 0.001988516375422478, + "step": 208950 + }, + { + "epoch": 29.660752306600425, + "grad_norm": 0.5789322853088379, + 
"learning_rate": 7.03514549325763e-05, + "loss": 0.020954021811485292, + "step": 208960 + }, + { + "epoch": 29.662171753016324, + "grad_norm": 8.413910865783691, + "learning_rate": 7.03500354861604e-05, + "loss": 0.005849240347743034, + "step": 208970 + }, + { + "epoch": 29.663591199432222, + "grad_norm": 0.13941845297813416, + "learning_rate": 7.034861603974451e-05, + "loss": 0.001313413679599762, + "step": 208980 + }, + { + "epoch": 29.66501064584812, + "grad_norm": 2.013253927230835, + "learning_rate": 7.034719659332861e-05, + "loss": 0.01588839292526245, + "step": 208990 + }, + { + "epoch": 29.66643009226402, + "grad_norm": 0.22498781979084015, + "learning_rate": 7.03457771469127e-05, + "loss": 0.0009338967502117157, + "step": 209000 + }, + { + "epoch": 29.66643009226402, + "eval_accuracy": 0.9886182997393018, + "eval_loss": 0.046487320214509964, + "eval_runtime": 32.3094, + "eval_samples_per_second": 486.763, + "eval_steps_per_second": 15.228, + "step": 209000 + }, + { + "epoch": 29.667849538679913, + "grad_norm": 12.995738983154297, + "learning_rate": 7.034435770049682e-05, + "loss": 0.04022456407546997, + "step": 209010 + }, + { + "epoch": 29.669268985095812, + "grad_norm": 14.79349136352539, + "learning_rate": 7.034293825408091e-05, + "loss": 0.011307716369628906, + "step": 209020 + }, + { + "epoch": 29.67068843151171, + "grad_norm": 0.02259664051234722, + "learning_rate": 7.034151880766502e-05, + "loss": 0.08723182678222656, + "step": 209030 + }, + { + "epoch": 29.67210787792761, + "grad_norm": 0.03584439679980278, + "learning_rate": 7.034009936124911e-05, + "loss": 0.018898254632949828, + "step": 209040 + }, + { + "epoch": 29.673527324343507, + "grad_norm": 0.02606879360973835, + "learning_rate": 7.033867991483322e-05, + "loss": 0.018964344263076784, + "step": 209050 + }, + { + "epoch": 29.674946770759405, + "grad_norm": 0.0389786958694458, + "learning_rate": 7.033726046841732e-05, + "loss": 0.029776120185852052, + "step": 209060 + }, + { + "epoch": 
29.676366217175303, + "grad_norm": 0.5237786769866943, + "learning_rate": 7.033584102200143e-05, + "loss": 0.006915730983018875, + "step": 209070 + }, + { + "epoch": 29.677785663591198, + "grad_norm": 0.055597785860300064, + "learning_rate": 7.033442157558552e-05, + "loss": 0.037065541744232176, + "step": 209080 + }, + { + "epoch": 29.679205110007096, + "grad_norm": 0.028811389580368996, + "learning_rate": 7.033300212916962e-05, + "loss": 0.004376491531729698, + "step": 209090 + }, + { + "epoch": 29.680624556422995, + "grad_norm": 0.41480177640914917, + "learning_rate": 7.033158268275373e-05, + "loss": 0.013488754630088806, + "step": 209100 + }, + { + "epoch": 29.682044002838893, + "grad_norm": 0.14620953798294067, + "learning_rate": 7.033016323633783e-05, + "loss": 0.021088628470897673, + "step": 209110 + }, + { + "epoch": 29.68346344925479, + "grad_norm": 15.157882690429688, + "learning_rate": 7.032874378992194e-05, + "loss": 0.017085982859134673, + "step": 209120 + }, + { + "epoch": 29.68488289567069, + "grad_norm": 0.139865905046463, + "learning_rate": 7.032732434350604e-05, + "loss": 0.022445803880691527, + "step": 209130 + }, + { + "epoch": 29.686302342086588, + "grad_norm": 18.478282928466797, + "learning_rate": 7.032590489709014e-05, + "loss": 0.022239850461483003, + "step": 209140 + }, + { + "epoch": 29.687721788502483, + "grad_norm": 6.108360767364502, + "learning_rate": 7.032448545067423e-05, + "loss": 0.01110762283205986, + "step": 209150 + }, + { + "epoch": 29.68914123491838, + "grad_norm": 0.72826087474823, + "learning_rate": 7.032306600425834e-05, + "loss": 0.02307276725769043, + "step": 209160 + }, + { + "epoch": 29.69056068133428, + "grad_norm": 0.00913691520690918, + "learning_rate": 7.032164655784244e-05, + "loss": 0.014743681252002715, + "step": 209170 + }, + { + "epoch": 29.691980127750178, + "grad_norm": 1.5320665836334229, + "learning_rate": 7.032022711142655e-05, + "loss": 0.04224913418292999, + "step": 209180 + }, + { + "epoch": 
29.693399574166076, + "grad_norm": 1.6394563913345337, + "learning_rate": 7.031880766501065e-05, + "loss": 0.007460614293813705, + "step": 209190 + }, + { + "epoch": 29.694819020581974, + "grad_norm": 13.387170791625977, + "learning_rate": 7.031738821859475e-05, + "loss": 0.023605000972747803, + "step": 209200 + }, + { + "epoch": 29.696238466997873, + "grad_norm": 0.05924532562494278, + "learning_rate": 7.031596877217886e-05, + "loss": 0.022049921751022338, + "step": 209210 + }, + { + "epoch": 29.697657913413767, + "grad_norm": 1.3570935726165771, + "learning_rate": 7.031454932576296e-05, + "loss": 0.029002752900123597, + "step": 209220 + }, + { + "epoch": 29.699077359829666, + "grad_norm": 0.37804803252220154, + "learning_rate": 7.031312987934707e-05, + "loss": 0.03318764269351959, + "step": 209230 + }, + { + "epoch": 29.700496806245564, + "grad_norm": 0.11271392554044724, + "learning_rate": 7.031171043293115e-05, + "loss": 0.0469030499458313, + "step": 209240 + }, + { + "epoch": 29.701916252661462, + "grad_norm": 0.40114831924438477, + "learning_rate": 7.031029098651526e-05, + "loss": 0.027224627137184144, + "step": 209250 + }, + { + "epoch": 29.70333569907736, + "grad_norm": 0.9028560519218445, + "learning_rate": 7.030887154009936e-05, + "loss": 0.007577848434448242, + "step": 209260 + }, + { + "epoch": 29.70475514549326, + "grad_norm": 0.010067733004689217, + "learning_rate": 7.030745209368347e-05, + "loss": 0.0008014257997274399, + "step": 209270 + }, + { + "epoch": 29.706174591909157, + "grad_norm": 0.03343568742275238, + "learning_rate": 7.030603264726757e-05, + "loss": 0.003070628270506859, + "step": 209280 + }, + { + "epoch": 29.707594038325052, + "grad_norm": 0.1290713995695114, + "learning_rate": 7.030461320085168e-05, + "loss": 0.01804938167333603, + "step": 209290 + }, + { + "epoch": 29.70901348474095, + "grad_norm": 0.022898826748132706, + "learning_rate": 7.030319375443578e-05, + "loss": 0.0018119256943464278, + "step": 209300 + }, + { + "epoch": 
29.71043293115685, + "grad_norm": 1.0513118505477905, + "learning_rate": 7.030191625266147e-05, + "loss": 0.019848327338695525, + "step": 209310 + }, + { + "epoch": 29.711852377572747, + "grad_norm": 0.026186294853687286, + "learning_rate": 7.030049680624556e-05, + "loss": 0.01719692200422287, + "step": 209320 + }, + { + "epoch": 29.713271823988645, + "grad_norm": 0.13418754935264587, + "learning_rate": 7.029907735982967e-05, + "loss": 0.005875326693058014, + "step": 209330 + }, + { + "epoch": 29.714691270404543, + "grad_norm": 0.04535046964883804, + "learning_rate": 7.029765791341377e-05, + "loss": 0.002035566791892052, + "step": 209340 + }, + { + "epoch": 29.71611071682044, + "grad_norm": 8.090560913085938, + "learning_rate": 7.029623846699788e-05, + "loss": 0.00617947019636631, + "step": 209350 + }, + { + "epoch": 29.717530163236336, + "grad_norm": 0.009255850687623024, + "learning_rate": 7.029481902058199e-05, + "loss": 0.010502047836780548, + "step": 209360 + }, + { + "epoch": 29.718949609652235, + "grad_norm": 1.5361385345458984, + "learning_rate": 7.029339957416607e-05, + "loss": 0.01832001507282257, + "step": 209370 + }, + { + "epoch": 29.720369056068133, + "grad_norm": 0.021829338744282722, + "learning_rate": 7.029198012775018e-05, + "loss": 0.01078387051820755, + "step": 209380 + }, + { + "epoch": 29.72178850248403, + "grad_norm": 6.5356268882751465, + "learning_rate": 7.029056068133428e-05, + "loss": 0.06797637939453124, + "step": 209390 + }, + { + "epoch": 29.72320794889993, + "grad_norm": 0.25435853004455566, + "learning_rate": 7.028914123491839e-05, + "loss": 0.018374067544937134, + "step": 209400 + }, + { + "epoch": 29.724627395315828, + "grad_norm": 0.11314039677381516, + "learning_rate": 7.028772178850249e-05, + "loss": 0.01821415573358536, + "step": 209410 + }, + { + "epoch": 29.726046841731726, + "grad_norm": 0.005699900910258293, + "learning_rate": 7.028630234208659e-05, + "loss": 0.002424689009785652, + "step": 209420 + }, + { + "epoch": 
29.72746628814762, + "grad_norm": 0.023874491453170776, + "learning_rate": 7.028488289567068e-05, + "loss": 0.014740371704101562, + "step": 209430 + }, + { + "epoch": 29.72888573456352, + "grad_norm": 0.04667259380221367, + "learning_rate": 7.02834634492548e-05, + "loss": 0.002538125216960907, + "step": 209440 + }, + { + "epoch": 29.730305180979418, + "grad_norm": 0.058522436767816544, + "learning_rate": 7.02820440028389e-05, + "loss": 0.013267439603805543, + "step": 209450 + }, + { + "epoch": 29.731724627395316, + "grad_norm": 12.251399993896484, + "learning_rate": 7.0280624556423e-05, + "loss": 0.027759110927581786, + "step": 209460 + }, + { + "epoch": 29.733144073811214, + "grad_norm": 9.439790725708008, + "learning_rate": 7.02792051100071e-05, + "loss": 0.03325506448745728, + "step": 209470 + }, + { + "epoch": 29.734563520227113, + "grad_norm": 0.28134945034980774, + "learning_rate": 7.02777856635912e-05, + "loss": 0.047185423970222476, + "step": 209480 + }, + { + "epoch": 29.73598296664301, + "grad_norm": 0.15302136540412903, + "learning_rate": 7.027636621717531e-05, + "loss": 0.03507803976535797, + "step": 209490 + }, + { + "epoch": 29.737402413058906, + "grad_norm": 0.14830857515335083, + "learning_rate": 7.02749467707594e-05, + "loss": 0.009745923429727554, + "step": 209500 + }, + { + "epoch": 29.737402413058906, + "eval_accuracy": 0.9881096203980416, + "eval_loss": 0.048819221556186676, + "eval_runtime": 32.8342, + "eval_samples_per_second": 478.983, + "eval_steps_per_second": 14.984, + "step": 209500 + }, + { + "epoch": 29.738821859474804, + "grad_norm": 0.01499196793884039, + "learning_rate": 7.027352732434352e-05, + "loss": 0.001153871789574623, + "step": 209510 + }, + { + "epoch": 29.740241305890702, + "grad_norm": 13.944330215454102, + "learning_rate": 7.02721078779276e-05, + "loss": 0.024008971452713013, + "step": 209520 + }, + { + "epoch": 29.7416607523066, + "grad_norm": 0.05285196378827095, + "learning_rate": 7.027068843151171e-05, + "loss": 
0.004849657788872719, + "step": 209530 + }, + { + "epoch": 29.7430801987225, + "grad_norm": 4.824704647064209, + "learning_rate": 7.026926898509582e-05, + "loss": 0.016922979056835173, + "step": 209540 + }, + { + "epoch": 29.744499645138397, + "grad_norm": 0.8816559314727783, + "learning_rate": 7.026784953867992e-05, + "loss": 0.010996547341346741, + "step": 209550 + }, + { + "epoch": 29.745919091554295, + "grad_norm": 0.7314534783363342, + "learning_rate": 7.026643009226403e-05, + "loss": 0.0030949696898460387, + "step": 209560 + }, + { + "epoch": 29.74733853797019, + "grad_norm": 0.008747121319174767, + "learning_rate": 7.026501064584811e-05, + "loss": 0.01813964992761612, + "step": 209570 + }, + { + "epoch": 29.74875798438609, + "grad_norm": 0.08595044165849686, + "learning_rate": 7.026359119943222e-05, + "loss": 0.0013654917478561402, + "step": 209580 + }, + { + "epoch": 29.750177430801987, + "grad_norm": 0.6860477328300476, + "learning_rate": 7.026217175301632e-05, + "loss": 0.012960536777973175, + "step": 209590 + }, + { + "epoch": 29.751596877217885, + "grad_norm": 0.018768014386296272, + "learning_rate": 7.026075230660043e-05, + "loss": 0.02058204710483551, + "step": 209600 + }, + { + "epoch": 29.753016323633783, + "grad_norm": 0.09180847555398941, + "learning_rate": 7.025933286018453e-05, + "loss": 0.009937920421361924, + "step": 209610 + }, + { + "epoch": 29.75443577004968, + "grad_norm": 12.400583267211914, + "learning_rate": 7.025791341376864e-05, + "loss": 0.04334622025489807, + "step": 209620 + }, + { + "epoch": 29.75585521646558, + "grad_norm": 0.22353364527225494, + "learning_rate": 7.025649396735274e-05, + "loss": 0.01860593557357788, + "step": 209630 + }, + { + "epoch": 29.757274662881475, + "grad_norm": 0.018336806446313858, + "learning_rate": 7.025507452093684e-05, + "loss": 0.02058257758617401, + "step": 209640 + }, + { + "epoch": 29.758694109297373, + "grad_norm": 0.857136607170105, + "learning_rate": 7.025365507452095e-05, + "loss": 
0.03863702118396759, + "step": 209650 + }, + { + "epoch": 29.76011355571327, + "grad_norm": 0.015166526660323143, + "learning_rate": 7.025223562810504e-05, + "loss": 0.00626649558544159, + "step": 209660 + }, + { + "epoch": 29.76153300212917, + "grad_norm": 0.23573365807533264, + "learning_rate": 7.025081618168916e-05, + "loss": 0.0022231899201869965, + "step": 209670 + }, + { + "epoch": 29.762952448545068, + "grad_norm": 15.121125221252441, + "learning_rate": 7.024939673527324e-05, + "loss": 0.012397938966751098, + "step": 209680 + }, + { + "epoch": 29.764371894960966, + "grad_norm": 0.05725543946027756, + "learning_rate": 7.024797728885735e-05, + "loss": 0.00311901792883873, + "step": 209690 + }, + { + "epoch": 29.765791341376865, + "grad_norm": 0.07817237824201584, + "learning_rate": 7.024655784244145e-05, + "loss": 0.0019193414598703384, + "step": 209700 + }, + { + "epoch": 29.76721078779276, + "grad_norm": 0.12894479930400848, + "learning_rate": 7.024513839602556e-05, + "loss": 0.0042214527726173404, + "step": 209710 + }, + { + "epoch": 29.768630234208658, + "grad_norm": 3.6294195652008057, + "learning_rate": 7.024371894960966e-05, + "loss": 0.027300333976745604, + "step": 209720 + }, + { + "epoch": 29.770049680624556, + "grad_norm": 0.29750198125839233, + "learning_rate": 7.024229950319375e-05, + "loss": 0.010976825654506684, + "step": 209730 + }, + { + "epoch": 29.771469127040454, + "grad_norm": 0.6299542784690857, + "learning_rate": 7.024088005677786e-05, + "loss": 0.0029639042913913727, + "step": 209740 + }, + { + "epoch": 29.772888573456353, + "grad_norm": 0.05707570165395737, + "learning_rate": 7.023946061036196e-05, + "loss": 0.020954130589962004, + "step": 209750 + }, + { + "epoch": 29.77430801987225, + "grad_norm": 0.007832161150872707, + "learning_rate": 7.023804116394607e-05, + "loss": 0.04112120270729065, + "step": 209760 + }, + { + "epoch": 29.77572746628815, + "grad_norm": 0.7540813684463501, + "learning_rate": 7.023662171753017e-05, + "loss": 
0.003725597634911537, + "step": 209770 + }, + { + "epoch": 29.777146912704044, + "grad_norm": 17.479290008544922, + "learning_rate": 7.023520227111427e-05, + "loss": 0.03490169644355774, + "step": 209780 + }, + { + "epoch": 29.778566359119942, + "grad_norm": 0.04591252654790878, + "learning_rate": 7.023378282469836e-05, + "loss": 0.0011666052043437958, + "step": 209790 + }, + { + "epoch": 29.77998580553584, + "grad_norm": 0.029737213626503944, + "learning_rate": 7.023236337828248e-05, + "loss": 0.015971672534942628, + "step": 209800 + }, + { + "epoch": 29.78140525195174, + "grad_norm": 0.48643508553504944, + "learning_rate": 7.023094393186657e-05, + "loss": 0.011987714469432831, + "step": 209810 + }, + { + "epoch": 29.782824698367637, + "grad_norm": 0.004228704608976841, + "learning_rate": 7.022952448545068e-05, + "loss": 0.008068503439426422, + "step": 209820 + }, + { + "epoch": 29.784244144783536, + "grad_norm": 3.1509816646575928, + "learning_rate": 7.022810503903478e-05, + "loss": 0.0035766955465078355, + "step": 209830 + }, + { + "epoch": 29.785663591199434, + "grad_norm": 11.602933883666992, + "learning_rate": 7.022668559261888e-05, + "loss": 0.012034596502780914, + "step": 209840 + }, + { + "epoch": 29.78708303761533, + "grad_norm": 0.009348217397928238, + "learning_rate": 7.022526614620299e-05, + "loss": 0.007089111208915711, + "step": 209850 + }, + { + "epoch": 29.788502484031227, + "grad_norm": 0.22918376326560974, + "learning_rate": 7.022384669978709e-05, + "loss": 0.0009151618927717209, + "step": 209860 + }, + { + "epoch": 29.789921930447125, + "grad_norm": 0.03347579017281532, + "learning_rate": 7.02224272533712e-05, + "loss": 0.006451761722564698, + "step": 209870 + }, + { + "epoch": 29.791341376863024, + "grad_norm": 0.0015337757067754865, + "learning_rate": 7.022100780695528e-05, + "loss": 0.0018684174865484238, + "step": 209880 + }, + { + "epoch": 29.792760823278922, + "grad_norm": 17.030925750732422, + "learning_rate": 7.021958836053939e-05, + 
"loss": 0.01206776350736618, + "step": 209890 + }, + { + "epoch": 29.79418026969482, + "grad_norm": 6.822809219360352, + "learning_rate": 7.021816891412349e-05, + "loss": 0.007399451732635498, + "step": 209900 + }, + { + "epoch": 29.79559971611072, + "grad_norm": 0.0420171320438385, + "learning_rate": 7.02167494677076e-05, + "loss": 0.00505472905933857, + "step": 209910 + }, + { + "epoch": 29.797019162526613, + "grad_norm": 0.02209319919347763, + "learning_rate": 7.02153300212917e-05, + "loss": 0.003628060221672058, + "step": 209920 + }, + { + "epoch": 29.79843860894251, + "grad_norm": 4.751023292541504, + "learning_rate": 7.02139105748758e-05, + "loss": 0.07658085227012634, + "step": 209930 + }, + { + "epoch": 29.79985805535841, + "grad_norm": 0.012931781820952892, + "learning_rate": 7.02124911284599e-05, + "loss": 0.008856560289859771, + "step": 209940 + }, + { + "epoch": 29.801277501774308, + "grad_norm": 0.015526315197348595, + "learning_rate": 7.0211071682044e-05, + "loss": 0.004002826288342476, + "step": 209950 + }, + { + "epoch": 29.802696948190206, + "grad_norm": 4.246027946472168, + "learning_rate": 7.020965223562811e-05, + "loss": 0.024775618314743043, + "step": 209960 + }, + { + "epoch": 29.804116394606105, + "grad_norm": 0.24059826135635376, + "learning_rate": 7.020823278921221e-05, + "loss": 0.003307870402932167, + "step": 209970 + }, + { + "epoch": 29.805535841022003, + "grad_norm": 0.15848951041698456, + "learning_rate": 7.020681334279632e-05, + "loss": 0.00776301771402359, + "step": 209980 + }, + { + "epoch": 29.806955287437898, + "grad_norm": 0.7452932596206665, + "learning_rate": 7.020539389638041e-05, + "loss": 0.013262240588665009, + "step": 209990 + }, + { + "epoch": 29.808374733853796, + "grad_norm": 0.03255649656057358, + "learning_rate": 7.020397444996452e-05, + "loss": 0.00800723060965538, + "step": 210000 + }, + { + "epoch": 29.808374733853796, + "eval_accuracy": 0.9888090544922744, + "eval_loss": 0.04274590685963631, + "eval_runtime": 
31.8483, + "eval_samples_per_second": 493.809, + "eval_steps_per_second": 15.448, + "step": 210000 + }, + { + "epoch": 29.809794180269694, + "grad_norm": 0.036069273948669434, + "learning_rate": 7.020255500354862e-05, + "loss": 0.02205509543418884, + "step": 210010 + }, + { + "epoch": 29.811213626685593, + "grad_norm": 0.03993762657046318, + "learning_rate": 7.020113555713273e-05, + "loss": 0.0005988061428070068, + "step": 210020 + }, + { + "epoch": 29.81263307310149, + "grad_norm": 0.016143597662448883, + "learning_rate": 7.019971611071682e-05, + "loss": 0.004531471058726311, + "step": 210030 + }, + { + "epoch": 29.81405251951739, + "grad_norm": 0.04161229357123375, + "learning_rate": 7.019829666430092e-05, + "loss": 0.011912484467029572, + "step": 210040 + }, + { + "epoch": 29.815471965933288, + "grad_norm": 0.10627970844507217, + "learning_rate": 7.019687721788503e-05, + "loss": 0.035160872340202334, + "step": 210050 + }, + { + "epoch": 29.816891412349182, + "grad_norm": 10.340616226196289, + "learning_rate": 7.019545777146913e-05, + "loss": 0.009840410947799683, + "step": 210060 + }, + { + "epoch": 29.81831085876508, + "grad_norm": 0.012918769381940365, + "learning_rate": 7.019403832505324e-05, + "loss": 0.007581527531147003, + "step": 210070 + }, + { + "epoch": 29.81973030518098, + "grad_norm": 0.006601301487535238, + "learning_rate": 7.019261887863734e-05, + "loss": 0.01808585226535797, + "step": 210080 + }, + { + "epoch": 29.821149751596877, + "grad_norm": 0.12041378766298294, + "learning_rate": 7.019119943222143e-05, + "loss": 0.0019443638622760773, + "step": 210090 + }, + { + "epoch": 29.822569198012776, + "grad_norm": 1.9848415851593018, + "learning_rate": 7.018977998580553e-05, + "loss": 0.015048198401927948, + "step": 210100 + }, + { + "epoch": 29.823988644428674, + "grad_norm": 1.9600070714950562, + "learning_rate": 7.018836053938964e-05, + "loss": 0.01587709188461304, + "step": 210110 + }, + { + "epoch": 29.825408090844572, + "grad_norm": 
0.27479860186576843, + "learning_rate": 7.018694109297374e-05, + "loss": 0.03019278347492218, + "step": 210120 + }, + { + "epoch": 29.826827537260467, + "grad_norm": 0.04216597229242325, + "learning_rate": 7.018552164655785e-05, + "loss": 0.019566655158996582, + "step": 210130 + }, + { + "epoch": 29.828246983676365, + "grad_norm": 0.04467777907848358, + "learning_rate": 7.018410220014195e-05, + "loss": 0.021921955049037933, + "step": 210140 + }, + { + "epoch": 29.829666430092264, + "grad_norm": 2.1350622177124023, + "learning_rate": 7.018268275372605e-05, + "loss": 0.004408906400203705, + "step": 210150 + }, + { + "epoch": 29.831085876508162, + "grad_norm": 0.4451933801174164, + "learning_rate": 7.018126330731016e-05, + "loss": 0.0017247773706912995, + "step": 210160 + }, + { + "epoch": 29.83250532292406, + "grad_norm": 3.126490592956543, + "learning_rate": 7.017984386089425e-05, + "loss": 0.006161994487047196, + "step": 210170 + }, + { + "epoch": 29.83392476933996, + "grad_norm": 0.04792619124054909, + "learning_rate": 7.017842441447837e-05, + "loss": 0.014666743576526642, + "step": 210180 + }, + { + "epoch": 29.835344215755857, + "grad_norm": 6.306613445281982, + "learning_rate": 7.017700496806245e-05, + "loss": 0.010163724422454834, + "step": 210190 + }, + { + "epoch": 29.83676366217175, + "grad_norm": 0.22338517010211945, + "learning_rate": 7.017558552164656e-05, + "loss": 0.0038127053529024126, + "step": 210200 + }, + { + "epoch": 29.83818310858765, + "grad_norm": 2.0981943607330322, + "learning_rate": 7.017416607523066e-05, + "loss": 0.0022517468780279158, + "step": 210210 + }, + { + "epoch": 29.839602555003548, + "grad_norm": 0.26583585143089294, + "learning_rate": 7.017274662881477e-05, + "loss": 0.0022166892886161804, + "step": 210220 + }, + { + "epoch": 29.841022001419446, + "grad_norm": 0.11152207106351852, + "learning_rate": 7.017132718239887e-05, + "loss": 0.012229574471712112, + "step": 210230 + }, + { + "epoch": 29.842441447835345, + "grad_norm": 
3.677936315536499, + "learning_rate": 7.016990773598296e-05, + "loss": 0.05997421145439148, + "step": 210240 + }, + { + "epoch": 29.843860894251243, + "grad_norm": 0.2121368646621704, + "learning_rate": 7.016848828956707e-05, + "loss": 0.013307876884937286, + "step": 210250 + }, + { + "epoch": 29.84528034066714, + "grad_norm": 0.06833631545305252, + "learning_rate": 7.016706884315117e-05, + "loss": 0.01831166446208954, + "step": 210260 + }, + { + "epoch": 29.846699787083036, + "grad_norm": 1.9441441297531128, + "learning_rate": 7.016564939673528e-05, + "loss": 0.032204282283782956, + "step": 210270 + }, + { + "epoch": 29.848119233498934, + "grad_norm": 0.21742039918899536, + "learning_rate": 7.016422995031938e-05, + "loss": 0.02830483615398407, + "step": 210280 + }, + { + "epoch": 29.849538679914833, + "grad_norm": 0.03376418352127075, + "learning_rate": 7.016281050390348e-05, + "loss": 0.02388675808906555, + "step": 210290 + }, + { + "epoch": 29.85095812633073, + "grad_norm": 3.8426804542541504, + "learning_rate": 7.016139105748757e-05, + "loss": 0.022425413131713867, + "step": 210300 + }, + { + "epoch": 29.85237757274663, + "grad_norm": 0.10748086869716644, + "learning_rate": 7.015997161107169e-05, + "loss": 0.005413120985031128, + "step": 210310 + }, + { + "epoch": 29.853797019162528, + "grad_norm": 0.010420465841889381, + "learning_rate": 7.015855216465578e-05, + "loss": 0.004019345343112946, + "step": 210320 + }, + { + "epoch": 29.855216465578426, + "grad_norm": 0.03834371641278267, + "learning_rate": 7.01571327182399e-05, + "loss": 0.04053741693496704, + "step": 210330 + }, + { + "epoch": 29.85663591199432, + "grad_norm": 7.8643999099731445, + "learning_rate": 7.015571327182399e-05, + "loss": 0.016335429251194, + "step": 210340 + }, + { + "epoch": 29.85805535841022, + "grad_norm": 0.08243205398321152, + "learning_rate": 7.015429382540809e-05, + "loss": 0.013536286354064942, + "step": 210350 + }, + { + "epoch": 29.859474804826117, + "grad_norm": 
0.22149892151355743, + "learning_rate": 7.01528743789922e-05, + "loss": 0.025066664814949034, + "step": 210360 + }, + { + "epoch": 29.860894251242016, + "grad_norm": 0.029212048277258873, + "learning_rate": 7.01514549325763e-05, + "loss": 0.02091346085071564, + "step": 210370 + }, + { + "epoch": 29.862313697657914, + "grad_norm": 0.3328849971294403, + "learning_rate": 7.015003548616041e-05, + "loss": 0.0025015164166688917, + "step": 210380 + }, + { + "epoch": 29.863733144073812, + "grad_norm": 0.0118649210780859, + "learning_rate": 7.01486160397445e-05, + "loss": 0.0020175855606794357, + "step": 210390 + }, + { + "epoch": 29.86515259048971, + "grad_norm": 2.4015092849731445, + "learning_rate": 7.01471965933286e-05, + "loss": 0.022922374308109283, + "step": 210400 + }, + { + "epoch": 29.866572036905605, + "grad_norm": 0.7641366124153137, + "learning_rate": 7.01457771469127e-05, + "loss": 0.005733419209718704, + "step": 210410 + }, + { + "epoch": 29.867991483321504, + "grad_norm": 0.21228757500648499, + "learning_rate": 7.014435770049681e-05, + "loss": 0.01528138667345047, + "step": 210420 + }, + { + "epoch": 29.869410929737402, + "grad_norm": 0.02544327639043331, + "learning_rate": 7.014293825408091e-05, + "loss": 0.0327951967716217, + "step": 210430 + }, + { + "epoch": 29.8708303761533, + "grad_norm": 0.13670480251312256, + "learning_rate": 7.014151880766502e-05, + "loss": 0.001551944762468338, + "step": 210440 + }, + { + "epoch": 29.8722498225692, + "grad_norm": 4.851408958435059, + "learning_rate": 7.014009936124912e-05, + "loss": 0.018519604206085206, + "step": 210450 + }, + { + "epoch": 29.873669268985097, + "grad_norm": 1.3289653062820435, + "learning_rate": 7.013867991483321e-05, + "loss": 0.015623869001865387, + "step": 210460 + }, + { + "epoch": 29.875088715400995, + "grad_norm": 0.8028246760368347, + "learning_rate": 7.013726046841732e-05, + "loss": 0.00957406759262085, + "step": 210470 + }, + { + "epoch": 29.87650816181689, + "grad_norm": 
0.7676525712013245, + "learning_rate": 7.013584102200142e-05, + "loss": 0.016458290815353393, + "step": 210480 + }, + { + "epoch": 29.87792760823279, + "grad_norm": 0.016941837966442108, + "learning_rate": 7.013442157558553e-05, + "loss": 0.0018132548779249192, + "step": 210490 + }, + { + "epoch": 29.879347054648687, + "grad_norm": 0.05174301192164421, + "learning_rate": 7.013300212916962e-05, + "loss": 0.02605198323726654, + "step": 210500 + }, + { + "epoch": 29.879347054648687, + "eval_accuracy": 0.9872830164684937, + "eval_loss": 0.0519292876124382, + "eval_runtime": 32.3543, + "eval_samples_per_second": 486.087, + "eval_steps_per_second": 15.207, + "step": 210500 + }, + { + "epoch": 29.880766501064585, + "grad_norm": 0.5584043264389038, + "learning_rate": 7.013158268275373e-05, + "loss": 0.01894974857568741, + "step": 210510 + }, + { + "epoch": 29.882185947480483, + "grad_norm": 0.19691333174705505, + "learning_rate": 7.013016323633783e-05, + "loss": 0.011624640226364136, + "step": 210520 + }, + { + "epoch": 29.88360539389638, + "grad_norm": 0.015322535298764706, + "learning_rate": 7.012874378992194e-05, + "loss": 0.05853897929191589, + "step": 210530 + }, + { + "epoch": 29.88502484031228, + "grad_norm": 0.03605230152606964, + "learning_rate": 7.012732434350603e-05, + "loss": 0.024759840965270997, + "step": 210540 + }, + { + "epoch": 29.886444286728175, + "grad_norm": 28.4610652923584, + "learning_rate": 7.012590489709013e-05, + "loss": 0.07793487310409546, + "step": 210550 + }, + { + "epoch": 29.887863733144073, + "grad_norm": 0.016418175771832466, + "learning_rate": 7.012448545067424e-05, + "loss": 0.004225868359208107, + "step": 210560 + }, + { + "epoch": 29.88928317955997, + "grad_norm": 7.285479545593262, + "learning_rate": 7.012306600425834e-05, + "loss": 0.0037365615367889403, + "step": 210570 + }, + { + "epoch": 29.89070262597587, + "grad_norm": 0.1448362022638321, + "learning_rate": 7.012164655784245e-05, + "loss": 0.010446234047412873, + "step": 
210580 + }, + { + "epoch": 29.892122072391768, + "grad_norm": 0.021222852170467377, + "learning_rate": 7.012022711142655e-05, + "loss": 0.04382705986499787, + "step": 210590 + }, + { + "epoch": 29.893541518807666, + "grad_norm": 1.1870070695877075, + "learning_rate": 7.011880766501065e-05, + "loss": 0.019762569665908815, + "step": 210600 + }, + { + "epoch": 29.894960965223564, + "grad_norm": 1.2411420345306396, + "learning_rate": 7.011738821859474e-05, + "loss": 0.0013447798788547515, + "step": 210610 + }, + { + "epoch": 29.89638041163946, + "grad_norm": 18.725135803222656, + "learning_rate": 7.011596877217885e-05, + "loss": 0.009711290150880814, + "step": 210620 + }, + { + "epoch": 29.897799858055357, + "grad_norm": 3.1602108478546143, + "learning_rate": 7.011454932576295e-05, + "loss": 0.002853131666779518, + "step": 210630 + }, + { + "epoch": 29.899219304471256, + "grad_norm": 2.287039041519165, + "learning_rate": 7.011312987934706e-05, + "loss": 0.0026075053960084916, + "step": 210640 + }, + { + "epoch": 29.900638750887154, + "grad_norm": 0.33860427141189575, + "learning_rate": 7.011171043293116e-05, + "loss": 0.04841657876968384, + "step": 210650 + }, + { + "epoch": 29.902058197303052, + "grad_norm": 4.314185619354248, + "learning_rate": 7.011029098651526e-05, + "loss": 0.043196958303451535, + "step": 210660 + }, + { + "epoch": 29.90347764371895, + "grad_norm": 3.717047691345215, + "learning_rate": 7.010887154009937e-05, + "loss": 0.0049698606133461, + "step": 210670 + }, + { + "epoch": 29.90489709013485, + "grad_norm": 0.7943224906921387, + "learning_rate": 7.010745209368346e-05, + "loss": 0.020778357982635498, + "step": 210680 + }, + { + "epoch": 29.906316536550744, + "grad_norm": 14.602803230285645, + "learning_rate": 7.010603264726758e-05, + "loss": 0.011139304935932159, + "step": 210690 + }, + { + "epoch": 29.907735982966642, + "grad_norm": 0.44264012575149536, + "learning_rate": 7.010461320085167e-05, + "loss": 0.01831185817718506, + "step": 210700 + }, 
+ { + "epoch": 29.90915542938254, + "grad_norm": 3.011425256729126, + "learning_rate": 7.010319375443577e-05, + "loss": 0.02365773618221283, + "step": 210710 + }, + { + "epoch": 29.91057487579844, + "grad_norm": 0.5699248909950256, + "learning_rate": 7.010177430801987e-05, + "loss": 0.04038099646568298, + "step": 210720 + }, + { + "epoch": 29.911994322214337, + "grad_norm": 1.3826227188110352, + "learning_rate": 7.010035486160398e-05, + "loss": 0.0011871442198753356, + "step": 210730 + }, + { + "epoch": 29.913413768630235, + "grad_norm": 0.0123749403283, + "learning_rate": 7.009893541518808e-05, + "loss": 0.010754004120826721, + "step": 210740 + }, + { + "epoch": 29.914833215046134, + "grad_norm": 1.1084388494491577, + "learning_rate": 7.009751596877219e-05, + "loss": 0.0030169848352670668, + "step": 210750 + }, + { + "epoch": 29.91625266146203, + "grad_norm": 2.928938388824463, + "learning_rate": 7.009609652235628e-05, + "loss": 0.023078998923301695, + "step": 210760 + }, + { + "epoch": 29.917672107877927, + "grad_norm": 2.1470890045166016, + "learning_rate": 7.009467707594038e-05, + "loss": 0.010352196544408799, + "step": 210770 + }, + { + "epoch": 29.919091554293825, + "grad_norm": 3.785104513168335, + "learning_rate": 7.009325762952449e-05, + "loss": 0.04870032966136932, + "step": 210780 + }, + { + "epoch": 29.920511000709723, + "grad_norm": 0.17893315851688385, + "learning_rate": 7.009183818310859e-05, + "loss": 0.011190450936555862, + "step": 210790 + }, + { + "epoch": 29.92193044712562, + "grad_norm": 0.09943350404500961, + "learning_rate": 7.00904187366927e-05, + "loss": 0.0491945743560791, + "step": 210800 + }, + { + "epoch": 29.92334989354152, + "grad_norm": 17.256948471069336, + "learning_rate": 7.008899929027678e-05, + "loss": 0.031239277124404906, + "step": 210810 + }, + { + "epoch": 29.924769339957418, + "grad_norm": 7.590287208557129, + "learning_rate": 7.00875798438609e-05, + "loss": 0.013414359092712403, + "step": 210820 + }, + { + "epoch": 
29.926188786373313, + "grad_norm": 0.03780009597539902, + "learning_rate": 7.008616039744499e-05, + "loss": 0.01039394736289978, + "step": 210830 + }, + { + "epoch": 29.92760823278921, + "grad_norm": 9.73901081085205, + "learning_rate": 7.00847409510291e-05, + "loss": 0.02032313197851181, + "step": 210840 + }, + { + "epoch": 29.92902767920511, + "grad_norm": 0.6268885731697083, + "learning_rate": 7.008332150461322e-05, + "loss": 0.041335922479629514, + "step": 210850 + }, + { + "epoch": 29.930447125621008, + "grad_norm": 0.10813816636800766, + "learning_rate": 7.00819020581973e-05, + "loss": 0.012819239497184753, + "step": 210860 + }, + { + "epoch": 29.931866572036906, + "grad_norm": 0.009968667291104794, + "learning_rate": 7.008048261178141e-05, + "loss": 0.011910492181777954, + "step": 210870 + }, + { + "epoch": 29.933286018452804, + "grad_norm": 0.3656076490879059, + "learning_rate": 7.007906316536551e-05, + "loss": 0.0050042420625686646, + "step": 210880 + }, + { + "epoch": 29.934705464868703, + "grad_norm": 0.07637067139148712, + "learning_rate": 7.007764371894962e-05, + "loss": 0.014927402138710022, + "step": 210890 + }, + { + "epoch": 29.936124911284598, + "grad_norm": 0.2737957239151001, + "learning_rate": 7.007622427253372e-05, + "loss": 0.007909691333770752, + "step": 210900 + }, + { + "epoch": 29.937544357700496, + "grad_norm": 0.13002285361289978, + "learning_rate": 7.007480482611781e-05, + "loss": 0.011070364713668823, + "step": 210910 + }, + { + "epoch": 29.938963804116394, + "grad_norm": 11.745224952697754, + "learning_rate": 7.007338537970191e-05, + "loss": 0.023775207996368408, + "step": 210920 + }, + { + "epoch": 29.940383250532292, + "grad_norm": 0.09761713445186615, + "learning_rate": 7.007196593328602e-05, + "loss": 0.0008118517696857452, + "step": 210930 + }, + { + "epoch": 29.94180269694819, + "grad_norm": 0.8777351975440979, + "learning_rate": 7.007054648687013e-05, + "loss": 0.013869833946228028, + "step": 210940 + }, + { + "epoch": 
29.94322214336409, + "grad_norm": 0.008516182191669941, + "learning_rate": 7.006912704045423e-05, + "loss": 0.008169782161712647, + "step": 210950 + }, + { + "epoch": 29.944641589779987, + "grad_norm": 2.615320920944214, + "learning_rate": 7.006770759403833e-05, + "loss": 0.005178046971559524, + "step": 210960 + }, + { + "epoch": 29.946061036195882, + "grad_norm": 0.8279242515563965, + "learning_rate": 7.006628814762242e-05, + "loss": 0.008460301160812377, + "step": 210970 + }, + { + "epoch": 29.94748048261178, + "grad_norm": 0.025956755504012108, + "learning_rate": 7.006486870120654e-05, + "loss": 0.027733737230300905, + "step": 210980 + }, + { + "epoch": 29.94889992902768, + "grad_norm": 0.05930059403181076, + "learning_rate": 7.006344925479063e-05, + "loss": 0.006725509464740753, + "step": 210990 + }, + { + "epoch": 29.950319375443577, + "grad_norm": 1.0148698091506958, + "learning_rate": 7.006202980837474e-05, + "loss": 0.005101503431797027, + "step": 211000 + }, + { + "epoch": 29.950319375443577, + "eval_accuracy": 0.9893813187511922, + "eval_loss": 0.04631072282791138, + "eval_runtime": 33.025, + "eval_samples_per_second": 476.215, + "eval_steps_per_second": 14.898, + "step": 211000 + }, + { + "epoch": 29.951738821859475, + "grad_norm": 0.042548034340143204, + "learning_rate": 7.006061036195883e-05, + "loss": 0.014736038446426392, + "step": 211010 + }, + { + "epoch": 29.953158268275374, + "grad_norm": 0.2442946434020996, + "learning_rate": 7.005919091554294e-05, + "loss": 0.0064274862408638, + "step": 211020 + }, + { + "epoch": 29.954577714691272, + "grad_norm": 3.058619737625122, + "learning_rate": 7.005777146912705e-05, + "loss": 0.02532339096069336, + "step": 211030 + }, + { + "epoch": 29.955997161107167, + "grad_norm": 0.04528282210230827, + "learning_rate": 7.005635202271115e-05, + "loss": 0.012376990169286728, + "step": 211040 + }, + { + "epoch": 29.957416607523065, + "grad_norm": 0.007500608451664448, + "learning_rate": 7.005493257629526e-05, + "loss": 
0.01364167034626007, + "step": 211050 + }, + { + "epoch": 29.958836053938963, + "grad_norm": 0.5466422438621521, + "learning_rate": 7.005351312987935e-05, + "loss": 0.010111391544342041, + "step": 211060 + }, + { + "epoch": 29.96025550035486, + "grad_norm": 0.016730457544326782, + "learning_rate": 7.005209368346345e-05, + "loss": 0.002797259762883186, + "step": 211070 + }, + { + "epoch": 29.96167494677076, + "grad_norm": 0.010776524432003498, + "learning_rate": 7.005067423704755e-05, + "loss": 0.0034115824848413466, + "step": 211080 + }, + { + "epoch": 29.96309439318666, + "grad_norm": 0.08483441919088364, + "learning_rate": 7.004925479063166e-05, + "loss": 0.011999818682670593, + "step": 211090 + }, + { + "epoch": 29.964513839602557, + "grad_norm": 2.3872272968292236, + "learning_rate": 7.004783534421576e-05, + "loss": 0.0038280710577964783, + "step": 211100 + }, + { + "epoch": 29.96593328601845, + "grad_norm": 0.1357547789812088, + "learning_rate": 7.004641589779987e-05, + "loss": 0.007810837030410767, + "step": 211110 + }, + { + "epoch": 29.96735273243435, + "grad_norm": 0.09707599878311157, + "learning_rate": 7.004499645138397e-05, + "loss": 0.006424635648727417, + "step": 211120 + }, + { + "epoch": 29.968772178850248, + "grad_norm": 0.051539842039346695, + "learning_rate": 7.004357700496806e-05, + "loss": 0.002042248845100403, + "step": 211130 + }, + { + "epoch": 29.970191625266146, + "grad_norm": 0.2518443167209625, + "learning_rate": 7.004215755855217e-05, + "loss": 0.011627249419689178, + "step": 211140 + }, + { + "epoch": 29.971611071682045, + "grad_norm": 0.6993831396102905, + "learning_rate": 7.004073811213627e-05, + "loss": 0.012432660907506943, + "step": 211150 + }, + { + "epoch": 29.973030518097943, + "grad_norm": 0.1353403478860855, + "learning_rate": 7.003931866572038e-05, + "loss": 0.01787296384572983, + "step": 211160 + }, + { + "epoch": 29.97444996451384, + "grad_norm": 0.20163406431674957, + "learning_rate": 7.003789921930447e-05, + "loss": 
0.007647266238927841, + "step": 211170 + }, + { + "epoch": 29.975869410929736, + "grad_norm": 0.0409359335899353, + "learning_rate": 7.003647977288858e-05, + "loss": 0.030155873298645018, + "step": 211180 + }, + { + "epoch": 29.977288857345634, + "grad_norm": 4.942883014678955, + "learning_rate": 7.003506032647267e-05, + "loss": 0.005973444879055023, + "step": 211190 + }, + { + "epoch": 29.978708303761533, + "grad_norm": 1.335252046585083, + "learning_rate": 7.003364088005679e-05, + "loss": 0.016622060537338258, + "step": 211200 + }, + { + "epoch": 29.98012775017743, + "grad_norm": 0.004320142790675163, + "learning_rate": 7.003222143364088e-05, + "loss": 0.023348397016525267, + "step": 211210 + }, + { + "epoch": 29.98154719659333, + "grad_norm": 0.1693977266550064, + "learning_rate": 7.003080198722498e-05, + "loss": 0.012502394616603851, + "step": 211220 + }, + { + "epoch": 29.982966643009227, + "grad_norm": 0.6542554497718811, + "learning_rate": 7.002938254080909e-05, + "loss": 0.009727237373590469, + "step": 211230 + }, + { + "epoch": 29.984386089425126, + "grad_norm": 0.61367267370224, + "learning_rate": 7.002796309439319e-05, + "loss": 0.0014375995844602585, + "step": 211240 + }, + { + "epoch": 29.98580553584102, + "grad_norm": 0.12728919088840485, + "learning_rate": 7.00265436479773e-05, + "loss": 0.012868376076221466, + "step": 211250 + }, + { + "epoch": 29.98722498225692, + "grad_norm": 0.008273114450275898, + "learning_rate": 7.00251242015614e-05, + "loss": 0.02186914086341858, + "step": 211260 + }, + { + "epoch": 29.988644428672817, + "grad_norm": 0.007098309695720673, + "learning_rate": 7.00237047551455e-05, + "loss": 0.004061097651720047, + "step": 211270 + }, + { + "epoch": 29.990063875088715, + "grad_norm": 0.4283337891101837, + "learning_rate": 7.002228530872959e-05, + "loss": 0.018356017768383026, + "step": 211280 + }, + { + "epoch": 29.991483321504614, + "grad_norm": 3.7067079544067383, + "learning_rate": 7.00208658623137e-05, + "loss": 
0.003904668614268303, + "step": 211290 + }, + { + "epoch": 29.992902767920512, + "grad_norm": 0.11674164235591888, + "learning_rate": 7.00194464158978e-05, + "loss": 0.006020195037126541, + "step": 211300 + }, + { + "epoch": 29.99432221433641, + "grad_norm": 0.2138572782278061, + "learning_rate": 7.001802696948191e-05, + "loss": 0.02794843018054962, + "step": 211310 + }, + { + "epoch": 29.995741660752305, + "grad_norm": 7.6637725830078125, + "learning_rate": 7.001660752306601e-05, + "loss": 0.04382042586803436, + "step": 211320 + }, + { + "epoch": 29.997161107168203, + "grad_norm": 0.0178285650908947, + "learning_rate": 7.00151880766501e-05, + "loss": 0.007542849332094192, + "step": 211330 + }, + { + "epoch": 29.9985805535841, + "grad_norm": 0.09237750619649887, + "learning_rate": 7.001376863023422e-05, + "loss": 0.0012067213654518127, + "step": 211340 + }, + { + "epoch": 30.0, + "grad_norm": 0.08866535872220993, + "learning_rate": 7.001234918381831e-05, + "loss": 0.009071370959281922, + "step": 211350 + }, + { + "epoch": 30.0014194464159, + "grad_norm": 0.021719258278608322, + "learning_rate": 7.001092973740243e-05, + "loss": 0.02918194532394409, + "step": 211360 + }, + { + "epoch": 30.002838892831797, + "grad_norm": 10.265488624572754, + "learning_rate": 7.000951029098652e-05, + "loss": 0.019150686264038087, + "step": 211370 + }, + { + "epoch": 30.004258339247695, + "grad_norm": 8.328301429748535, + "learning_rate": 7.000809084457062e-05, + "loss": 0.05575352907180786, + "step": 211380 + }, + { + "epoch": 30.00567778566359, + "grad_norm": 0.4792425036430359, + "learning_rate": 7.000667139815472e-05, + "loss": 0.027524644136428834, + "step": 211390 + }, + { + "epoch": 30.007097232079488, + "grad_norm": 0.3047794997692108, + "learning_rate": 7.000525195173883e-05, + "loss": 0.021050310134887694, + "step": 211400 + }, + { + "epoch": 30.008516678495386, + "grad_norm": 0.9992392063140869, + "learning_rate": 7.000383250532293e-05, + "loss": 0.002852541580796242, + 
"step": 211410 + }, + { + "epoch": 30.009936124911285, + "grad_norm": 0.011997714638710022, + "learning_rate": 7.000241305890704e-05, + "loss": 0.004365823417901993, + "step": 211420 + }, + { + "epoch": 30.011355571327183, + "grad_norm": 5.450412750244141, + "learning_rate": 7.000099361249113e-05, + "loss": 0.006674918532371521, + "step": 211430 + }, + { + "epoch": 30.01277501774308, + "grad_norm": 0.019416842609643936, + "learning_rate": 6.999957416607523e-05, + "loss": 0.001654459908604622, + "step": 211440 + }, + { + "epoch": 30.01419446415898, + "grad_norm": 0.0044103688560426235, + "learning_rate": 6.999815471965934e-05, + "loss": 0.0025305885821580885, + "step": 211450 + }, + { + "epoch": 30.015613910574874, + "grad_norm": 0.007769922260195017, + "learning_rate": 6.999673527324344e-05, + "loss": 0.028684568405151368, + "step": 211460 + }, + { + "epoch": 30.017033356990773, + "grad_norm": 7.382498264312744, + "learning_rate": 6.999531582682755e-05, + "loss": 0.009305721521377564, + "step": 211470 + }, + { + "epoch": 30.01845280340667, + "grad_norm": 1.5825101137161255, + "learning_rate": 6.999389638041163e-05, + "loss": 0.010973513126373291, + "step": 211480 + }, + { + "epoch": 30.01987224982257, + "grad_norm": 0.12606993317604065, + "learning_rate": 6.999247693399575e-05, + "loss": 0.023143777251243593, + "step": 211490 + }, + { + "epoch": 30.021291696238467, + "grad_norm": 0.2321770042181015, + "learning_rate": 6.999105748757984e-05, + "loss": 0.030882963538169862, + "step": 211500 + }, + { + "epoch": 30.021291696238467, + "eval_accuracy": 0.9844852800915623, + "eval_loss": 0.06150616705417633, + "eval_runtime": 31.4493, + "eval_samples_per_second": 500.074, + "eval_steps_per_second": 15.644, + "step": 211500 + }, + { + "epoch": 30.022711142654366, + "grad_norm": 0.013512508943676949, + "learning_rate": 6.998963804116395e-05, + "loss": 0.0038036204874515533, + "step": 211510 + }, + { + "epoch": 30.024130589070264, + "grad_norm": 2.0810863971710205, + 
"learning_rate": 6.998821859474805e-05, + "loss": 0.007263979315757752, + "step": 211520 + }, + { + "epoch": 30.02555003548616, + "grad_norm": 0.03146110102534294, + "learning_rate": 6.998679914833215e-05, + "loss": 0.0029671624302864074, + "step": 211530 + }, + { + "epoch": 30.026969481902057, + "grad_norm": 0.08032510429620743, + "learning_rate": 6.998537970191626e-05, + "loss": 0.0008687902241945267, + "step": 211540 + }, + { + "epoch": 30.028388928317955, + "grad_norm": 0.01906907744705677, + "learning_rate": 6.998396025550036e-05, + "loss": 0.028126344084739685, + "step": 211550 + }, + { + "epoch": 30.029808374733854, + "grad_norm": 1.5431476831436157, + "learning_rate": 6.998254080908447e-05, + "loss": 0.010562259703874588, + "step": 211560 + }, + { + "epoch": 30.031227821149752, + "grad_norm": 0.01194123923778534, + "learning_rate": 6.998112136266856e-05, + "loss": 0.020265528559684755, + "step": 211570 + }, + { + "epoch": 30.03264726756565, + "grad_norm": 1.8185986280441284, + "learning_rate": 6.997970191625266e-05, + "loss": 0.009165041893720628, + "step": 211580 + }, + { + "epoch": 30.03406671398155, + "grad_norm": 1.457209587097168, + "learning_rate": 6.997828246983676e-05, + "loss": 0.011584511399269104, + "step": 211590 + }, + { + "epoch": 30.035486160397443, + "grad_norm": 1.9136199951171875, + "learning_rate": 6.997686302342087e-05, + "loss": 0.01792135536670685, + "step": 211600 + }, + { + "epoch": 30.03690560681334, + "grad_norm": 0.30868199467658997, + "learning_rate": 6.997544357700497e-05, + "loss": 0.013226291537284851, + "step": 211610 + }, + { + "epoch": 30.03832505322924, + "grad_norm": 0.4339883327484131, + "learning_rate": 6.997402413058908e-05, + "loss": 0.008551794290542602, + "step": 211620 + }, + { + "epoch": 30.03974449964514, + "grad_norm": 0.30592116713523865, + "learning_rate": 6.997260468417318e-05, + "loss": 0.009321023523807526, + "step": 211630 + }, + { + "epoch": 30.041163946061037, + "grad_norm": 0.04794823005795479, + 
"learning_rate": 6.997118523775727e-05, + "loss": 0.01687488555908203, + "step": 211640 + }, + { + "epoch": 30.042583392476935, + "grad_norm": 0.06836681813001633, + "learning_rate": 6.996976579134138e-05, + "loss": 0.013798435032367707, + "step": 211650 + }, + { + "epoch": 30.044002838892833, + "grad_norm": 0.005391431972384453, + "learning_rate": 6.996834634492548e-05, + "loss": 0.023706623911857606, + "step": 211660 + }, + { + "epoch": 30.045422285308728, + "grad_norm": 0.030010733753442764, + "learning_rate": 6.996692689850959e-05, + "loss": 0.004481830820441246, + "step": 211670 + }, + { + "epoch": 30.046841731724626, + "grad_norm": 0.04397927224636078, + "learning_rate": 6.996550745209368e-05, + "loss": 0.024973911046981812, + "step": 211680 + }, + { + "epoch": 30.048261178140525, + "grad_norm": 1.0651755332946777, + "learning_rate": 6.996408800567779e-05, + "loss": 0.011841523647308349, + "step": 211690 + }, + { + "epoch": 30.049680624556423, + "grad_norm": 0.011277709156274796, + "learning_rate": 6.996266855926188e-05, + "loss": 0.004130860790610313, + "step": 211700 + }, + { + "epoch": 30.05110007097232, + "grad_norm": 11.453584671020508, + "learning_rate": 6.9961249112846e-05, + "loss": 0.01691017150878906, + "step": 211710 + }, + { + "epoch": 30.05251951738822, + "grad_norm": 1.1528111696243286, + "learning_rate": 6.99598296664301e-05, + "loss": 0.0023319311439990997, + "step": 211720 + }, + { + "epoch": 30.053938963804118, + "grad_norm": 16.629817962646484, + "learning_rate": 6.99584102200142e-05, + "loss": 0.02117021232843399, + "step": 211730 + }, + { + "epoch": 30.055358410220013, + "grad_norm": 0.3019838333129883, + "learning_rate": 6.99569907735983e-05, + "loss": 0.0038490056991577148, + "step": 211740 + }, + { + "epoch": 30.05677785663591, + "grad_norm": 0.03861602023243904, + "learning_rate": 6.99555713271824e-05, + "loss": 0.06846773624420166, + "step": 211750 + }, + { + "epoch": 30.05819730305181, + "grad_norm": 0.05614195764064789, + 
"learning_rate": 6.995415188076651e-05, + "loss": 0.007803746312856674, + "step": 211760 + }, + { + "epoch": 30.059616749467708, + "grad_norm": 8.09411334991455, + "learning_rate": 6.995273243435061e-05, + "loss": 0.03796707093715668, + "step": 211770 + }, + { + "epoch": 30.061036195883606, + "grad_norm": 9.873359680175781, + "learning_rate": 6.995131298793472e-05, + "loss": 0.005271016061306, + "step": 211780 + }, + { + "epoch": 30.062455642299504, + "grad_norm": 1.2882072925567627, + "learning_rate": 6.99500354861604e-05, + "loss": 0.023053589463233947, + "step": 211790 + }, + { + "epoch": 30.063875088715402, + "grad_norm": 0.5354165434837341, + "learning_rate": 6.994861603974451e-05, + "loss": 0.007648727297782898, + "step": 211800 + }, + { + "epoch": 30.065294535131297, + "grad_norm": 8.144124031066895, + "learning_rate": 6.99471965933286e-05, + "loss": 0.01458071917295456, + "step": 211810 + }, + { + "epoch": 30.066713981547196, + "grad_norm": 0.026305319741368294, + "learning_rate": 6.994577714691271e-05, + "loss": 0.006798344850540161, + "step": 211820 + }, + { + "epoch": 30.068133427963094, + "grad_norm": 1.933854103088379, + "learning_rate": 6.99443577004968e-05, + "loss": 0.0061323467642068865, + "step": 211830 + }, + { + "epoch": 30.069552874378992, + "grad_norm": 1.4600192308425903, + "learning_rate": 6.994293825408092e-05, + "loss": 0.02843245565891266, + "step": 211840 + }, + { + "epoch": 30.07097232079489, + "grad_norm": 0.16366243362426758, + "learning_rate": 6.994151880766501e-05, + "loss": 0.0163414865732193, + "step": 211850 + }, + { + "epoch": 30.07239176721079, + "grad_norm": 0.0026283650659024715, + "learning_rate": 6.994009936124911e-05, + "loss": 0.0391406238079071, + "step": 211860 + }, + { + "epoch": 30.073811213626687, + "grad_norm": 7.28436279296875, + "learning_rate": 6.993867991483322e-05, + "loss": 0.0025311820209026337, + "step": 211870 + }, + { + "epoch": 30.075230660042582, + "grad_norm": 7.104146957397461, + "learning_rate": 
6.993726046841732e-05, + "loss": 0.05019637942314148, + "step": 211880 + }, + { + "epoch": 30.07665010645848, + "grad_norm": 0.3594396710395813, + "learning_rate": 6.993584102200143e-05, + "loss": 0.033248919248580935, + "step": 211890 + }, + { + "epoch": 30.07806955287438, + "grad_norm": 0.017716702073812485, + "learning_rate": 6.993442157558553e-05, + "loss": 0.0012816138565540313, + "step": 211900 + }, + { + "epoch": 30.079488999290277, + "grad_norm": 0.06887849420309067, + "learning_rate": 6.993300212916963e-05, + "loss": 0.009827596694231033, + "step": 211910 + }, + { + "epoch": 30.080908445706175, + "grad_norm": 0.18302486836910248, + "learning_rate": 6.993158268275372e-05, + "loss": 0.004198795184493065, + "step": 211920 + }, + { + "epoch": 30.082327892122073, + "grad_norm": 0.00330292503349483, + "learning_rate": 6.993016323633783e-05, + "loss": 0.0010709919035434723, + "step": 211930 + }, + { + "epoch": 30.08374733853797, + "grad_norm": 0.20266108214855194, + "learning_rate": 6.992874378992193e-05, + "loss": 0.0008147977292537689, + "step": 211940 + }, + { + "epoch": 30.085166784953866, + "grad_norm": 0.004657120443880558, + "learning_rate": 6.992732434350604e-05, + "loss": 0.0015729721635580063, + "step": 211950 + }, + { + "epoch": 30.086586231369765, + "grad_norm": 0.13762293756008148, + "learning_rate": 6.992590489709014e-05, + "loss": 0.005975906178355217, + "step": 211960 + }, + { + "epoch": 30.088005677785663, + "grad_norm": 0.2619955837726593, + "learning_rate": 6.992448545067424e-05, + "loss": 0.0012866102159023284, + "step": 211970 + }, + { + "epoch": 30.08942512420156, + "grad_norm": 0.012761157006025314, + "learning_rate": 6.992306600425835e-05, + "loss": 0.002117026224732399, + "step": 211980 + }, + { + "epoch": 30.09084457061746, + "grad_norm": 0.016301140189170837, + "learning_rate": 6.992164655784245e-05, + "loss": 0.015670649707317352, + "step": 211990 + }, + { + "epoch": 30.092264017033358, + "grad_norm": 0.050057001411914825, + 
"learning_rate": 6.992022711142656e-05, + "loss": 0.01325473040342331, + "step": 212000 + }, + { + "epoch": 30.092264017033358, + "eval_accuracy": 0.9879824505627265, + "eval_loss": 0.04827112331986427, + "eval_runtime": 34.7227, + "eval_samples_per_second": 452.931, + "eval_steps_per_second": 14.169, + "step": 212000 + }, + { + "epoch": 30.093683463449256, + "grad_norm": 0.6575603485107422, + "learning_rate": 6.991880766501064e-05, + "loss": 0.010229889303445816, + "step": 212010 + }, + { + "epoch": 30.09510290986515, + "grad_norm": 0.02255658060312271, + "learning_rate": 6.991738821859475e-05, + "loss": 0.015713316202163697, + "step": 212020 + }, + { + "epoch": 30.09652235628105, + "grad_norm": 0.3856049180030823, + "learning_rate": 6.991596877217885e-05, + "loss": 0.008151683211326598, + "step": 212030 + }, + { + "epoch": 30.097941802696948, + "grad_norm": 0.032382071018218994, + "learning_rate": 6.991454932576296e-05, + "loss": 0.05020936131477356, + "step": 212040 + }, + { + "epoch": 30.099361249112846, + "grad_norm": 0.07424971461296082, + "learning_rate": 6.991312987934706e-05, + "loss": 0.017887437343597413, + "step": 212050 + }, + { + "epoch": 30.100780695528744, + "grad_norm": 0.40596747398376465, + "learning_rate": 6.991171043293117e-05, + "loss": 0.02576586604118347, + "step": 212060 + }, + { + "epoch": 30.102200141944643, + "grad_norm": 0.013728444464504719, + "learning_rate": 6.991029098651527e-05, + "loss": 0.027828171849250793, + "step": 212070 + }, + { + "epoch": 30.10361958836054, + "grad_norm": 3.182605266571045, + "learning_rate": 6.990887154009936e-05, + "loss": 0.018579551577568056, + "step": 212080 + }, + { + "epoch": 30.105039034776436, + "grad_norm": 0.03567405790090561, + "learning_rate": 6.990745209368347e-05, + "loss": 0.011724074929952621, + "step": 212090 + }, + { + "epoch": 30.106458481192334, + "grad_norm": 0.026548059657216072, + "learning_rate": 6.990603264726757e-05, + "loss": 0.010010054707527161, + "step": 212100 + }, + { + 
"epoch": 30.107877927608232, + "grad_norm": 0.5232923626899719, + "learning_rate": 6.990461320085168e-05, + "loss": 0.06171758770942688, + "step": 212110 + }, + { + "epoch": 30.10929737402413, + "grad_norm": 0.6364029049873352, + "learning_rate": 6.990319375443577e-05, + "loss": 0.03967137634754181, + "step": 212120 + }, + { + "epoch": 30.11071682044003, + "grad_norm": 0.022541826590895653, + "learning_rate": 6.990177430801988e-05, + "loss": 0.024051125347614288, + "step": 212130 + }, + { + "epoch": 30.112136266855927, + "grad_norm": 0.7877983450889587, + "learning_rate": 6.990035486160397e-05, + "loss": 0.007774632424116135, + "step": 212140 + }, + { + "epoch": 30.113555713271825, + "grad_norm": 4.348184108734131, + "learning_rate": 6.989893541518808e-05, + "loss": 0.008421513438224792, + "step": 212150 + }, + { + "epoch": 30.11497515968772, + "grad_norm": 4.1026763916015625, + "learning_rate": 6.989751596877218e-05, + "loss": 0.004843123257160187, + "step": 212160 + }, + { + "epoch": 30.11639460610362, + "grad_norm": 3.9769046306610107, + "learning_rate": 6.989609652235628e-05, + "loss": 0.015046152472496032, + "step": 212170 + }, + { + "epoch": 30.117814052519517, + "grad_norm": 9.623827934265137, + "learning_rate": 6.989467707594039e-05, + "loss": 0.02101084440946579, + "step": 212180 + }, + { + "epoch": 30.119233498935415, + "grad_norm": 0.011805993504822254, + "learning_rate": 6.989325762952449e-05, + "loss": 0.004447016492486, + "step": 212190 + }, + { + "epoch": 30.120652945351313, + "grad_norm": 3.4689929485321045, + "learning_rate": 6.98918381831086e-05, + "loss": 0.013919220864772796, + "step": 212200 + }, + { + "epoch": 30.12207239176721, + "grad_norm": 0.008309482596814632, + "learning_rate": 6.98904187366927e-05, + "loss": 0.006451842188835144, + "step": 212210 + }, + { + "epoch": 30.12349183818311, + "grad_norm": 0.01608753576874733, + "learning_rate": 6.98889992902768e-05, + "loss": 0.009026254713535308, + "step": 212220 + }, + { + "epoch": 
30.124911284599005, + "grad_norm": 0.0062293424271047115, + "learning_rate": 6.988757984386089e-05, + "loss": 0.02639628350734711, + "step": 212230 + }, + { + "epoch": 30.126330731014903, + "grad_norm": 0.010195920243859291, + "learning_rate": 6.9886160397445e-05, + "loss": 0.006513465940952301, + "step": 212240 + }, + { + "epoch": 30.1277501774308, + "grad_norm": 0.010919681750237942, + "learning_rate": 6.98847409510291e-05, + "loss": 0.0009154029190540314, + "step": 212250 + }, + { + "epoch": 30.1291696238467, + "grad_norm": 0.16457945108413696, + "learning_rate": 6.988332150461321e-05, + "loss": 0.011978773772716523, + "step": 212260 + }, + { + "epoch": 30.130589070262598, + "grad_norm": 0.09812165051698685, + "learning_rate": 6.988190205819731e-05, + "loss": 0.018270018696784972, + "step": 212270 + }, + { + "epoch": 30.132008516678496, + "grad_norm": 0.05046382546424866, + "learning_rate": 6.98804826117814e-05, + "loss": 0.0011176992207765578, + "step": 212280 + }, + { + "epoch": 30.133427963094395, + "grad_norm": 0.03142794221639633, + "learning_rate": 6.987906316536552e-05, + "loss": 0.006418798863887787, + "step": 212290 + }, + { + "epoch": 30.13484740951029, + "grad_norm": 0.010454786010086536, + "learning_rate": 6.987764371894961e-05, + "loss": 0.003117601573467255, + "step": 212300 + }, + { + "epoch": 30.136266855926188, + "grad_norm": 19.044401168823242, + "learning_rate": 6.987622427253372e-05, + "loss": 0.029964715242385864, + "step": 212310 + }, + { + "epoch": 30.137686302342086, + "grad_norm": 0.004592748358845711, + "learning_rate": 6.987480482611781e-05, + "loss": 0.01951422095298767, + "step": 212320 + }, + { + "epoch": 30.139105748757984, + "grad_norm": 0.9357092976570129, + "learning_rate": 6.987338537970192e-05, + "loss": 0.0042485356330871586, + "step": 212330 + }, + { + "epoch": 30.140525195173883, + "grad_norm": 15.977811813354492, + "learning_rate": 6.987196593328602e-05, + "loss": 0.024503645300865174, + "step": 212340 + }, + { + "epoch": 
30.14194464158978, + "grad_norm": 0.6749070286750793, + "learning_rate": 6.987054648687013e-05, + "loss": 0.08136647939682007, + "step": 212350 + }, + { + "epoch": 30.14336408800568, + "grad_norm": 0.018691346049308777, + "learning_rate": 6.986912704045422e-05, + "loss": 0.002069659158587456, + "step": 212360 + }, + { + "epoch": 30.144783534421574, + "grad_norm": 0.10765863209962845, + "learning_rate": 6.986770759403832e-05, + "loss": 0.022393032908439636, + "step": 212370 + }, + { + "epoch": 30.146202980837472, + "grad_norm": 0.012182637117803097, + "learning_rate": 6.986628814762243e-05, + "loss": 0.01871950477361679, + "step": 212380 + }, + { + "epoch": 30.14762242725337, + "grad_norm": 1.137426495552063, + "learning_rate": 6.986486870120653e-05, + "loss": 0.0052015773952007295, + "step": 212390 + }, + { + "epoch": 30.14904187366927, + "grad_norm": 0.0146013293415308, + "learning_rate": 6.986344925479064e-05, + "loss": 0.0022560857236385346, + "step": 212400 + }, + { + "epoch": 30.150461320085167, + "grad_norm": 3.933382749557495, + "learning_rate": 6.986202980837474e-05, + "loss": 0.006025727465748787, + "step": 212410 + }, + { + "epoch": 30.151880766501066, + "grad_norm": 0.6074274778366089, + "learning_rate": 6.986061036195885e-05, + "loss": 0.0017040010541677475, + "step": 212420 + }, + { + "epoch": 30.153300212916964, + "grad_norm": 0.019787278026342392, + "learning_rate": 6.985919091554293e-05, + "loss": 0.006288781762123108, + "step": 212430 + }, + { + "epoch": 30.15471965933286, + "grad_norm": 0.5478348135948181, + "learning_rate": 6.985777146912704e-05, + "loss": 0.012265089154243469, + "step": 212440 + }, + { + "epoch": 30.156139105748757, + "grad_norm": 0.03398847579956055, + "learning_rate": 6.985635202271114e-05, + "loss": 0.0038840211927890778, + "step": 212450 + }, + { + "epoch": 30.157558552164655, + "grad_norm": 0.11739201098680496, + "learning_rate": 6.985493257629525e-05, + "loss": 0.01054050400853157, + "step": 212460 + }, + { + "epoch": 
30.158977998580554, + "grad_norm": 0.05124296620488167, + "learning_rate": 6.985351312987935e-05, + "loss": 0.0009090609848499299, + "step": 212470 + }, + { + "epoch": 30.160397444996452, + "grad_norm": 17.000961303710938, + "learning_rate": 6.985209368346345e-05, + "loss": 0.027625414729118346, + "step": 212480 + }, + { + "epoch": 30.16181689141235, + "grad_norm": 0.17123429477214813, + "learning_rate": 6.985067423704756e-05, + "loss": 0.037226781249046326, + "step": 212490 + }, + { + "epoch": 30.16323633782825, + "grad_norm": 0.019969090819358826, + "learning_rate": 6.984925479063166e-05, + "loss": 0.00677279531955719, + "step": 212500 + }, + { + "epoch": 30.16323633782825, + "eval_accuracy": 0.9863292427036306, + "eval_loss": 0.0557560957968235, + "eval_runtime": 33.7879, + "eval_samples_per_second": 465.463, + "eval_steps_per_second": 14.561, + "step": 212500 + }, + { + "epoch": 30.164655784244143, + "grad_norm": 0.13461044430732727, + "learning_rate": 6.984783534421577e-05, + "loss": 0.02603411376476288, + "step": 212510 + }, + { + "epoch": 30.16607523066004, + "grad_norm": 0.037575919181108475, + "learning_rate": 6.984641589779986e-05, + "loss": 0.012272270023822784, + "step": 212520 + }, + { + "epoch": 30.16749467707594, + "grad_norm": 0.39904189109802246, + "learning_rate": 6.984499645138396e-05, + "loss": 0.004850456491112709, + "step": 212530 + }, + { + "epoch": 30.168914123491838, + "grad_norm": 0.06679762154817581, + "learning_rate": 6.984357700496806e-05, + "loss": 0.009850251674652099, + "step": 212540 + }, + { + "epoch": 30.170333569907736, + "grad_norm": 0.16961944103240967, + "learning_rate": 6.984215755855217e-05, + "loss": 0.019363664090633392, + "step": 212550 + }, + { + "epoch": 30.171753016323635, + "grad_norm": 0.012400633655488491, + "learning_rate": 6.984073811213627e-05, + "loss": 0.004565772414207458, + "step": 212560 + }, + { + "epoch": 30.173172462739533, + "grad_norm": 0.014820380136370659, + "learning_rate": 6.983931866572038e-05, + 
"loss": 0.004516617953777313, + "step": 212570 + }, + { + "epoch": 30.174591909155428, + "grad_norm": 3.2193093299865723, + "learning_rate": 6.983789921930448e-05, + "loss": 0.03558715581893921, + "step": 212580 + }, + { + "epoch": 30.176011355571326, + "grad_norm": 0.051418185234069824, + "learning_rate": 6.983647977288857e-05, + "loss": 0.0032874356955289842, + "step": 212590 + }, + { + "epoch": 30.177430801987224, + "grad_norm": 7.97539758682251, + "learning_rate": 6.983506032647268e-05, + "loss": 0.08396758437156678, + "step": 212600 + }, + { + "epoch": 30.178850248403123, + "grad_norm": 0.15083737671375275, + "learning_rate": 6.983364088005678e-05, + "loss": 0.0012353863567113876, + "step": 212610 + }, + { + "epoch": 30.18026969481902, + "grad_norm": 0.6988218426704407, + "learning_rate": 6.983222143364089e-05, + "loss": 0.02218914031982422, + "step": 212620 + }, + { + "epoch": 30.18168914123492, + "grad_norm": 0.13522954285144806, + "learning_rate": 6.983080198722498e-05, + "loss": 0.008670365810394287, + "step": 212630 + }, + { + "epoch": 30.183108587650818, + "grad_norm": 0.18490323424339294, + "learning_rate": 6.982938254080909e-05, + "loss": 0.010605181753635406, + "step": 212640 + }, + { + "epoch": 30.184528034066712, + "grad_norm": 0.11108189821243286, + "learning_rate": 6.982796309439318e-05, + "loss": 0.0024842709302902223, + "step": 212650 + }, + { + "epoch": 30.18594748048261, + "grad_norm": 0.10506170243024826, + "learning_rate": 6.98265436479773e-05, + "loss": 0.006322260946035385, + "step": 212660 + }, + { + "epoch": 30.18736692689851, + "grad_norm": 0.1574009656906128, + "learning_rate": 6.982512420156139e-05, + "loss": 0.003542044758796692, + "step": 212670 + }, + { + "epoch": 30.188786373314407, + "grad_norm": 10.687641143798828, + "learning_rate": 6.982370475514549e-05, + "loss": 0.013673466444015504, + "step": 212680 + }, + { + "epoch": 30.190205819730306, + "grad_norm": 0.1877956986427307, + "learning_rate": 6.98222853087296e-05, + "loss": 
0.008156492561101913, + "step": 212690 + }, + { + "epoch": 30.191625266146204, + "grad_norm": 0.06984951347112656, + "learning_rate": 6.98208658623137e-05, + "loss": 0.0017861265689134598, + "step": 212700 + }, + { + "epoch": 30.193044712562102, + "grad_norm": 6.140615940093994, + "learning_rate": 6.981944641589781e-05, + "loss": 0.0100922629237175, + "step": 212710 + }, + { + "epoch": 30.194464158977997, + "grad_norm": 0.12135222554206848, + "learning_rate": 6.98180269694819e-05, + "loss": 0.010702241957187653, + "step": 212720 + }, + { + "epoch": 30.195883605393895, + "grad_norm": 5.046215057373047, + "learning_rate": 6.9816607523066e-05, + "loss": 0.009819479286670684, + "step": 212730 + }, + { + "epoch": 30.197303051809794, + "grad_norm": 7.089147090911865, + "learning_rate": 6.98151880766501e-05, + "loss": 0.019428007304668427, + "step": 212740 + }, + { + "epoch": 30.198722498225692, + "grad_norm": 12.242188453674316, + "learning_rate": 6.981376863023421e-05, + "loss": 0.048758164048194885, + "step": 212750 + }, + { + "epoch": 30.20014194464159, + "grad_norm": 0.26790592074394226, + "learning_rate": 6.981234918381831e-05, + "loss": 0.017485521733760834, + "step": 212760 + }, + { + "epoch": 30.20156139105749, + "grad_norm": 0.46164220571517944, + "learning_rate": 6.981092973740242e-05, + "loss": 0.03755452632904053, + "step": 212770 + }, + { + "epoch": 30.202980837473387, + "grad_norm": 0.04061897099018097, + "learning_rate": 6.980951029098652e-05, + "loss": 0.027882614731788637, + "step": 212780 + }, + { + "epoch": 30.20440028388928, + "grad_norm": 12.371613502502441, + "learning_rate": 6.980809084457061e-05, + "loss": 0.0403554379940033, + "step": 212790 + }, + { + "epoch": 30.20581973030518, + "grad_norm": 0.004459378309547901, + "learning_rate": 6.980667139815473e-05, + "loss": 0.016533708572387694, + "step": 212800 + }, + { + "epoch": 30.207239176721078, + "grad_norm": 0.38951870799064636, + "learning_rate": 6.980525195173882e-05, + "loss": 
0.00806300938129425, + "step": 212810 + }, + { + "epoch": 30.208658623136976, + "grad_norm": 0.31601911783218384, + "learning_rate": 6.980383250532293e-05, + "loss": 0.005649219825863838, + "step": 212820 + }, + { + "epoch": 30.210078069552875, + "grad_norm": 3.317620038986206, + "learning_rate": 6.980241305890703e-05, + "loss": 0.005392247438430786, + "step": 212830 + }, + { + "epoch": 30.211497515968773, + "grad_norm": 13.841819763183594, + "learning_rate": 6.980099361249113e-05, + "loss": 0.026733142137527467, + "step": 212840 + }, + { + "epoch": 30.21291696238467, + "grad_norm": 0.006392532028257847, + "learning_rate": 6.979957416607523e-05, + "loss": 0.036316481232643125, + "step": 212850 + }, + { + "epoch": 30.214336408800566, + "grad_norm": 0.03351481258869171, + "learning_rate": 6.979815471965934e-05, + "loss": 0.009548093378543853, + "step": 212860 + }, + { + "epoch": 30.215755855216464, + "grad_norm": 0.2757713794708252, + "learning_rate": 6.979673527324343e-05, + "loss": 0.018198777735233308, + "step": 212870 + }, + { + "epoch": 30.217175301632363, + "grad_norm": 0.15068532526493073, + "learning_rate": 6.979531582682755e-05, + "loss": 0.0017869275063276291, + "step": 212880 + }, + { + "epoch": 30.21859474804826, + "grad_norm": 11.237641334533691, + "learning_rate": 6.979389638041164e-05, + "loss": 0.017587298154830934, + "step": 212890 + }, + { + "epoch": 30.22001419446416, + "grad_norm": 0.027217620983719826, + "learning_rate": 6.979247693399574e-05, + "loss": 0.0024331603199243546, + "step": 212900 + }, + { + "epoch": 30.221433640880058, + "grad_norm": 0.13464994728565216, + "learning_rate": 6.979105748757985e-05, + "loss": 0.019517666101455687, + "step": 212910 + }, + { + "epoch": 30.222853087295956, + "grad_norm": 8.667631149291992, + "learning_rate": 6.978963804116395e-05, + "loss": 0.015613992512226105, + "step": 212920 + }, + { + "epoch": 30.22427253371185, + "grad_norm": 0.009872407652437687, + "learning_rate": 6.978821859474806e-05, + "loss": 
0.009352077543735505, + "step": 212930 + }, + { + "epoch": 30.22569198012775, + "grad_norm": 0.14811746776103973, + "learning_rate": 6.978679914833214e-05, + "loss": 0.014467805624008179, + "step": 212940 + }, + { + "epoch": 30.227111426543647, + "grad_norm": 0.01120639406144619, + "learning_rate": 6.978537970191625e-05, + "loss": 0.010067746043205261, + "step": 212950 + }, + { + "epoch": 30.228530872959546, + "grad_norm": 0.3893008828163147, + "learning_rate": 6.978396025550035e-05, + "loss": 0.0037411943078041075, + "step": 212960 + }, + { + "epoch": 30.229950319375444, + "grad_norm": 8.767455101013184, + "learning_rate": 6.978254080908446e-05, + "loss": 0.012560251355171203, + "step": 212970 + }, + { + "epoch": 30.231369765791342, + "grad_norm": 0.3839244246482849, + "learning_rate": 6.978112136266856e-05, + "loss": 0.004644034802913666, + "step": 212980 + }, + { + "epoch": 30.23278921220724, + "grad_norm": 0.4331493079662323, + "learning_rate": 6.977970191625266e-05, + "loss": 0.012580749392509461, + "step": 212990 + }, + { + "epoch": 30.234208658623135, + "grad_norm": 18.985525131225586, + "learning_rate": 6.977828246983677e-05, + "loss": 0.01643640398979187, + "step": 213000 + }, + { + "epoch": 30.234208658623135, + "eval_accuracy": 0.9913524511985757, + "eval_loss": 0.032575417309999466, + "eval_runtime": 33.9614, + "eval_samples_per_second": 463.085, + "eval_steps_per_second": 14.487, + "step": 213000 + }, + { + "epoch": 30.235628105039034, + "grad_norm": 0.04492988809943199, + "learning_rate": 6.977686302342087e-05, + "loss": 0.03335049450397491, + "step": 213010 + }, + { + "epoch": 30.237047551454932, + "grad_norm": 0.07767122238874435, + "learning_rate": 6.977544357700498e-05, + "loss": 0.0022516295313835142, + "step": 213020 + }, + { + "epoch": 30.23846699787083, + "grad_norm": 0.005648863967508078, + "learning_rate": 6.977402413058907e-05, + "loss": 0.013629768788814545, + "step": 213030 + }, + { + "epoch": 30.23988644428673, + "grad_norm": 
0.011128929443657398, + "learning_rate": 6.977260468417317e-05, + "loss": 0.0400622546672821, + "step": 213040 + }, + { + "epoch": 30.241305890702627, + "grad_norm": 0.04289038106799126, + "learning_rate": 6.977118523775727e-05, + "loss": 0.021360859274864197, + "step": 213050 + }, + { + "epoch": 30.242725337118525, + "grad_norm": 12.087090492248535, + "learning_rate": 6.976976579134138e-05, + "loss": 0.03273816704750061, + "step": 213060 + }, + { + "epoch": 30.24414478353442, + "grad_norm": 0.024653321132063866, + "learning_rate": 6.976834634492548e-05, + "loss": 0.001738555356860161, + "step": 213070 + }, + { + "epoch": 30.24556422995032, + "grad_norm": 0.9525904059410095, + "learning_rate": 6.976692689850959e-05, + "loss": 0.006759804487228393, + "step": 213080 + }, + { + "epoch": 30.246983676366217, + "grad_norm": 0.0700807273387909, + "learning_rate": 6.976550745209369e-05, + "loss": 0.012352063506841659, + "step": 213090 + }, + { + "epoch": 30.248403122782115, + "grad_norm": 5.092954635620117, + "learning_rate": 6.976408800567778e-05, + "loss": 0.005207516252994537, + "step": 213100 + }, + { + "epoch": 30.249822569198013, + "grad_norm": 0.08607988059520721, + "learning_rate": 6.97626685592619e-05, + "loss": 0.0017956510186195374, + "step": 213110 + }, + { + "epoch": 30.25124201561391, + "grad_norm": 0.15446226298809052, + "learning_rate": 6.976124911284599e-05, + "loss": 0.027001652121543884, + "step": 213120 + }, + { + "epoch": 30.25266146202981, + "grad_norm": 0.020780598744750023, + "learning_rate": 6.97598296664301e-05, + "loss": 0.04367642402648926, + "step": 213130 + }, + { + "epoch": 30.254080908445705, + "grad_norm": 15.364978790283203, + "learning_rate": 6.97584102200142e-05, + "loss": 0.03239233195781708, + "step": 213140 + }, + { + "epoch": 30.255500354861603, + "grad_norm": 0.011596398428082466, + "learning_rate": 6.97569907735983e-05, + "loss": 0.012232889235019685, + "step": 213150 + }, + { + "epoch": 30.2569198012775, + "grad_norm": 
0.01667173020541668, + "learning_rate": 6.97555713271824e-05, + "loss": 0.0029441840946674348, + "step": 213160 + }, + { + "epoch": 30.2583392476934, + "grad_norm": 0.817295253276825, + "learning_rate": 6.97541518807665e-05, + "loss": 0.0017556596547365188, + "step": 213170 + }, + { + "epoch": 30.259758694109298, + "grad_norm": 0.015979450196027756, + "learning_rate": 6.975273243435062e-05, + "loss": 0.002313631400465965, + "step": 213180 + }, + { + "epoch": 30.261178140525196, + "grad_norm": 0.6255717873573303, + "learning_rate": 6.975131298793471e-05, + "loss": 0.0016085240989923478, + "step": 213190 + }, + { + "epoch": 30.262597586941094, + "grad_norm": 0.026597611606121063, + "learning_rate": 6.974989354151881e-05, + "loss": 0.03051452338695526, + "step": 213200 + }, + { + "epoch": 30.26401703335699, + "grad_norm": 0.01691882312297821, + "learning_rate": 6.974847409510291e-05, + "loss": 0.0039089653640985485, + "step": 213210 + }, + { + "epoch": 30.265436479772887, + "grad_norm": 0.06692077964544296, + "learning_rate": 6.974705464868702e-05, + "loss": 0.0008791320025920868, + "step": 213220 + }, + { + "epoch": 30.266855926188786, + "grad_norm": 0.029035652056336403, + "learning_rate": 6.974563520227112e-05, + "loss": 0.0005528673529624939, + "step": 213230 + }, + { + "epoch": 30.268275372604684, + "grad_norm": 0.016593236476182938, + "learning_rate": 6.974421575585523e-05, + "loss": 0.0016620464622974396, + "step": 213240 + }, + { + "epoch": 30.269694819020582, + "grad_norm": 0.03592121973633766, + "learning_rate": 6.974279630943931e-05, + "loss": 0.0016379941254854203, + "step": 213250 + }, + { + "epoch": 30.27111426543648, + "grad_norm": 0.17036710679531097, + "learning_rate": 6.974137686302342e-05, + "loss": 0.002627789229154587, + "step": 213260 + }, + { + "epoch": 30.27253371185238, + "grad_norm": 0.12463587522506714, + "learning_rate": 6.973995741660753e-05, + "loss": 0.0074273556470870975, + "step": 213270 + }, + { + "epoch": 30.273953158268274, + 
"grad_norm": 0.31811007857322693, + "learning_rate": 6.973853797019163e-05, + "loss": 0.008890306949615479, + "step": 213280 + }, + { + "epoch": 30.275372604684172, + "grad_norm": 0.05848356708884239, + "learning_rate": 6.973711852377574e-05, + "loss": 0.006590832024812698, + "step": 213290 + }, + { + "epoch": 30.27679205110007, + "grad_norm": 0.0015730512095615268, + "learning_rate": 6.973569907735983e-05, + "loss": 0.06595694422721862, + "step": 213300 + }, + { + "epoch": 30.27821149751597, + "grad_norm": 0.045138657093048096, + "learning_rate": 6.973427963094394e-05, + "loss": 0.003913930803537369, + "step": 213310 + }, + { + "epoch": 30.279630943931867, + "grad_norm": 0.10191139578819275, + "learning_rate": 6.973286018452803e-05, + "loss": 0.0030664507299661635, + "step": 213320 + }, + { + "epoch": 30.281050390347765, + "grad_norm": 0.024374453350901604, + "learning_rate": 6.973144073811214e-05, + "loss": 0.01478181779384613, + "step": 213330 + }, + { + "epoch": 30.282469836763664, + "grad_norm": 1.2249189615249634, + "learning_rate": 6.973002129169624e-05, + "loss": 0.0034690357744693754, + "step": 213340 + }, + { + "epoch": 30.28388928317956, + "grad_norm": 0.012856968678534031, + "learning_rate": 6.972860184528034e-05, + "loss": 0.0007944367825984955, + "step": 213350 + }, + { + "epoch": 30.285308729595457, + "grad_norm": 0.0037022901233285666, + "learning_rate": 6.972718239886445e-05, + "loss": 0.04025251269340515, + "step": 213360 + }, + { + "epoch": 30.286728176011355, + "grad_norm": 3.135465621948242, + "learning_rate": 6.972576295244855e-05, + "loss": 0.007756094634532929, + "step": 213370 + }, + { + "epoch": 30.288147622427253, + "grad_norm": 0.05779300257563591, + "learning_rate": 6.972434350603266e-05, + "loss": 0.004590510949492454, + "step": 213380 + }, + { + "epoch": 30.28956706884315, + "grad_norm": 0.0036681017372757196, + "learning_rate": 6.972292405961676e-05, + "loss": 0.0061123076826334, + "step": 213390 + }, + { + "epoch": 
30.29098651525905, + "grad_norm": 0.12398575246334076, + "learning_rate": 6.972150461320085e-05, + "loss": 0.013778039813041687, + "step": 213400 + }, + { + "epoch": 30.292405961674948, + "grad_norm": 0.05198562890291214, + "learning_rate": 6.972008516678495e-05, + "loss": 0.008058224618434907, + "step": 213410 + }, + { + "epoch": 30.293825408090843, + "grad_norm": 0.04202358424663544, + "learning_rate": 6.971866572036906e-05, + "loss": 0.016183950006961823, + "step": 213420 + }, + { + "epoch": 30.29524485450674, + "grad_norm": 0.1661132127046585, + "learning_rate": 6.971724627395316e-05, + "loss": 0.006205129623413086, + "step": 213430 + }, + { + "epoch": 30.29666430092264, + "grad_norm": 3.2114932537078857, + "learning_rate": 6.971582682753727e-05, + "loss": 0.034755590558052066, + "step": 213440 + }, + { + "epoch": 30.298083747338538, + "grad_norm": 0.30723172426223755, + "learning_rate": 6.971440738112137e-05, + "loss": 0.02019895315170288, + "step": 213450 + }, + { + "epoch": 30.299503193754436, + "grad_norm": 18.12069320678711, + "learning_rate": 6.971298793470546e-05, + "loss": 0.016213835775852205, + "step": 213460 + }, + { + "epoch": 30.300922640170334, + "grad_norm": 0.07105506211519241, + "learning_rate": 6.971156848828958e-05, + "loss": 0.02666441202163696, + "step": 213470 + }, + { + "epoch": 30.302342086586233, + "grad_norm": 0.20825739204883575, + "learning_rate": 6.971014904187367e-05, + "loss": 0.0026362910866737366, + "step": 213480 + }, + { + "epoch": 30.303761533002127, + "grad_norm": 8.89615249633789, + "learning_rate": 6.970872959545778e-05, + "loss": 0.041865947842597964, + "step": 213490 + }, + { + "epoch": 30.305180979418026, + "grad_norm": 0.184341162443161, + "learning_rate": 6.970731014904188e-05, + "loss": 0.0039998859167099, + "step": 213500 + }, + { + "epoch": 30.305180979418026, + "eval_accuracy": 0.9902715075983977, + "eval_loss": 0.0452544279396534, + "eval_runtime": 34.2556, + "eval_samples_per_second": 459.108, + 
"eval_steps_per_second": 14.363, + "step": 213500 + }, + { + "epoch": 30.306600425833924, + "grad_norm": 3.171359062194824, + "learning_rate": 6.970589070262598e-05, + "loss": 0.010481182485818863, + "step": 213510 + }, + { + "epoch": 30.308019872249822, + "grad_norm": 0.004430424887686968, + "learning_rate": 6.970447125621008e-05, + "loss": 0.006259272247552872, + "step": 213520 + }, + { + "epoch": 30.30943931866572, + "grad_norm": 0.6004817485809326, + "learning_rate": 6.970305180979419e-05, + "loss": 0.027704459428787232, + "step": 213530 + }, + { + "epoch": 30.31085876508162, + "grad_norm": 0.6627089977264404, + "learning_rate": 6.970163236337828e-05, + "loss": 0.0229725182056427, + "step": 213540 + }, + { + "epoch": 30.312278211497517, + "grad_norm": 0.015962794423103333, + "learning_rate": 6.97002129169624e-05, + "loss": 0.03487211167812347, + "step": 213550 + }, + { + "epoch": 30.313697657913412, + "grad_norm": 1.5800961256027222, + "learning_rate": 6.969879347054649e-05, + "loss": 0.004393167048692703, + "step": 213560 + }, + { + "epoch": 30.31511710432931, + "grad_norm": 0.05584333464503288, + "learning_rate": 6.969737402413059e-05, + "loss": 0.0021387040615081787, + "step": 213570 + }, + { + "epoch": 30.31653655074521, + "grad_norm": 14.075117111206055, + "learning_rate": 6.96959545777147e-05, + "loss": 0.12181761264801025, + "step": 213580 + }, + { + "epoch": 30.317955997161107, + "grad_norm": 0.02297227457165718, + "learning_rate": 6.96945351312988e-05, + "loss": 0.003320970386266708, + "step": 213590 + }, + { + "epoch": 30.319375443577005, + "grad_norm": 0.05748753994703293, + "learning_rate": 6.969311568488291e-05, + "loss": 0.018818378448486328, + "step": 213600 + }, + { + "epoch": 30.320794889992904, + "grad_norm": 0.010294465348124504, + "learning_rate": 6.969169623846699e-05, + "loss": 0.006062963604927063, + "step": 213610 + }, + { + "epoch": 30.322214336408802, + "grad_norm": 0.2300993651151657, + "learning_rate": 6.96902767920511e-05, + "loss": 
0.03580346703529358, + "step": 213620 + }, + { + "epoch": 30.323633782824697, + "grad_norm": 15.318633079528809, + "learning_rate": 6.96888573456352e-05, + "loss": 0.02738127112388611, + "step": 213630 + }, + { + "epoch": 30.325053229240595, + "grad_norm": 0.23434357345104218, + "learning_rate": 6.968743789921931e-05, + "loss": 0.0058558288961648945, + "step": 213640 + }, + { + "epoch": 30.326472675656493, + "grad_norm": 0.06053111329674721, + "learning_rate": 6.968601845280341e-05, + "loss": 0.030677196383476258, + "step": 213650 + }, + { + "epoch": 30.32789212207239, + "grad_norm": 0.3836609423160553, + "learning_rate": 6.968459900638751e-05, + "loss": 0.0005654800683259964, + "step": 213660 + }, + { + "epoch": 30.32931156848829, + "grad_norm": 6.640696048736572, + "learning_rate": 6.968317955997162e-05, + "loss": 0.02615102529525757, + "step": 213670 + }, + { + "epoch": 30.330731014904188, + "grad_norm": 0.09219910949468613, + "learning_rate": 6.968176011355572e-05, + "loss": 0.0047644350677728655, + "step": 213680 + }, + { + "epoch": 30.332150461320087, + "grad_norm": 0.7548179030418396, + "learning_rate": 6.968034066713983e-05, + "loss": 0.003709470480680466, + "step": 213690 + }, + { + "epoch": 30.33356990773598, + "grad_norm": 0.5750740766525269, + "learning_rate": 6.967892122072392e-05, + "loss": 0.003945963457226753, + "step": 213700 + }, + { + "epoch": 30.33498935415188, + "grad_norm": 8.594654083251953, + "learning_rate": 6.967750177430802e-05, + "loss": 0.013190490007400513, + "step": 213710 + }, + { + "epoch": 30.336408800567778, + "grad_norm": 0.4322192370891571, + "learning_rate": 6.967608232789212e-05, + "loss": 0.019464053213596344, + "step": 213720 + }, + { + "epoch": 30.337828246983676, + "grad_norm": 2.104672431945801, + "learning_rate": 6.967466288147623e-05, + "loss": 0.031652092933654785, + "step": 213730 + }, + { + "epoch": 30.339247693399575, + "grad_norm": 3.6902406215667725, + "learning_rate": 6.967324343506033e-05, + "loss": 
0.04853520393371582, + "step": 213740 + }, + { + "epoch": 30.340667139815473, + "grad_norm": 10.758426666259766, + "learning_rate": 6.967182398864444e-05, + "loss": 0.030333533883094788, + "step": 213750 + }, + { + "epoch": 30.34208658623137, + "grad_norm": 0.07540660351514816, + "learning_rate": 6.967040454222853e-05, + "loss": 0.006315629184246063, + "step": 213760 + }, + { + "epoch": 30.343506032647266, + "grad_norm": 0.06291403621435165, + "learning_rate": 6.966898509581263e-05, + "loss": 0.017659446597099303, + "step": 213770 + }, + { + "epoch": 30.344925479063164, + "grad_norm": 0.3075029253959656, + "learning_rate": 6.966756564939674e-05, + "loss": 0.0180808886885643, + "step": 213780 + }, + { + "epoch": 30.346344925479062, + "grad_norm": 0.02922029420733452, + "learning_rate": 6.966614620298084e-05, + "loss": 0.005203073471784591, + "step": 213790 + }, + { + "epoch": 30.34776437189496, + "grad_norm": 0.015898626297712326, + "learning_rate": 6.966472675656495e-05, + "loss": 0.0033422503620386124, + "step": 213800 + }, + { + "epoch": 30.34918381831086, + "grad_norm": 0.9257256388664246, + "learning_rate": 6.966330731014905e-05, + "loss": 0.011642952263355256, + "step": 213810 + }, + { + "epoch": 30.350603264726757, + "grad_norm": 11.116868019104004, + "learning_rate": 6.966188786373315e-05, + "loss": 0.031056010723114015, + "step": 213820 + }, + { + "epoch": 30.352022711142656, + "grad_norm": 20.03546714782715, + "learning_rate": 6.966046841731724e-05, + "loss": 0.01895372271537781, + "step": 213830 + }, + { + "epoch": 30.35344215755855, + "grad_norm": 7.977284908294678, + "learning_rate": 6.965904897090135e-05, + "loss": 0.017136585712432862, + "step": 213840 + }, + { + "epoch": 30.35486160397445, + "grad_norm": 0.15353576838970184, + "learning_rate": 6.965762952448545e-05, + "loss": 0.009045113623142243, + "step": 213850 + }, + { + "epoch": 30.356281050390347, + "grad_norm": 0.006386205554008484, + "learning_rate": 6.965621007806956e-05, + "loss": 
0.031707587838172915, + "step": 213860 + }, + { + "epoch": 30.357700496806245, + "grad_norm": 6.954352855682373, + "learning_rate": 6.965479063165366e-05, + "loss": 0.005451960116624832, + "step": 213870 + }, + { + "epoch": 30.359119943222144, + "grad_norm": 0.038362372666597366, + "learning_rate": 6.965337118523776e-05, + "loss": 0.010467012971639633, + "step": 213880 + }, + { + "epoch": 30.360539389638042, + "grad_norm": 0.2409149408340454, + "learning_rate": 6.965195173882187e-05, + "loss": 0.002701243758201599, + "step": 213890 + }, + { + "epoch": 30.36195883605394, + "grad_norm": 0.04085102677345276, + "learning_rate": 6.965053229240597e-05, + "loss": 0.0018392093479633331, + "step": 213900 + }, + { + "epoch": 30.363378282469835, + "grad_norm": 11.485808372497559, + "learning_rate": 6.964911284599008e-05, + "loss": 0.020595794916152953, + "step": 213910 + }, + { + "epoch": 30.364797728885733, + "grad_norm": 0.1509091556072235, + "learning_rate": 6.964769339957416e-05, + "loss": 0.004381385073065758, + "step": 213920 + }, + { + "epoch": 30.36621717530163, + "grad_norm": 0.008203708566725254, + "learning_rate": 6.964627395315827e-05, + "loss": 0.008702069520950317, + "step": 213930 + }, + { + "epoch": 30.36763662171753, + "grad_norm": 1.3214647769927979, + "learning_rate": 6.964485450674237e-05, + "loss": 0.0152978777885437, + "step": 213940 + }, + { + "epoch": 30.36905606813343, + "grad_norm": 4.879977226257324, + "learning_rate": 6.964343506032648e-05, + "loss": 0.046046674251556396, + "step": 213950 + }, + { + "epoch": 30.370475514549327, + "grad_norm": 0.1308564841747284, + "learning_rate": 6.964201561391058e-05, + "loss": 0.004556400701403618, + "step": 213960 + }, + { + "epoch": 30.371894960965225, + "grad_norm": 0.07993537932634354, + "learning_rate": 6.964059616749467e-05, + "loss": 0.017645484209060668, + "step": 213970 + }, + { + "epoch": 30.37331440738112, + "grad_norm": 0.07570703327655792, + "learning_rate": 6.963917672107879e-05, + "loss": 
0.019048064947128296, + "step": 213980 + }, + { + "epoch": 30.374733853797018, + "grad_norm": 0.0054298629984259605, + "learning_rate": 6.963775727466288e-05, + "loss": 0.0018761500716209412, + "step": 213990 + }, + { + "epoch": 30.376153300212916, + "grad_norm": 0.5063519477844238, + "learning_rate": 6.9636337828247e-05, + "loss": 0.010863712430000306, + "step": 214000 + }, + { + "epoch": 30.376153300212916, + "eval_accuracy": 0.9896356584218223, + "eval_loss": 0.039334967732429504, + "eval_runtime": 33.6846, + "eval_samples_per_second": 466.89, + "eval_steps_per_second": 14.606, + "step": 214000 + }, + { + "epoch": 30.377572746628815, + "grad_norm": 2.5204625129699707, + "learning_rate": 6.963491838183109e-05, + "loss": 0.0024833127856254576, + "step": 214010 + }, + { + "epoch": 30.378992193044713, + "grad_norm": 0.1107277199625969, + "learning_rate": 6.963349893541519e-05, + "loss": 0.0012096840888261794, + "step": 214020 + }, + { + "epoch": 30.38041163946061, + "grad_norm": 0.14221343398094177, + "learning_rate": 6.963207948899929e-05, + "loss": 0.02440156787633896, + "step": 214030 + }, + { + "epoch": 30.38183108587651, + "grad_norm": 0.014424390159547329, + "learning_rate": 6.96306600425834e-05, + "loss": 0.004415082931518555, + "step": 214040 + }, + { + "epoch": 30.383250532292404, + "grad_norm": 10.323341369628906, + "learning_rate": 6.96292405961675e-05, + "loss": 0.03881877660751343, + "step": 214050 + }, + { + "epoch": 30.384669978708303, + "grad_norm": 0.8739731311798096, + "learning_rate": 6.96278211497516e-05, + "loss": 0.010016964375972747, + "step": 214060 + }, + { + "epoch": 30.3860894251242, + "grad_norm": 0.0692279189825058, + "learning_rate": 6.96264017033357e-05, + "loss": 0.03913286030292511, + "step": 214070 + }, + { + "epoch": 30.3875088715401, + "grad_norm": 0.05090710148215294, + "learning_rate": 6.96249822569198e-05, + "loss": 0.0200044646859169, + "step": 214080 + }, + { + "epoch": 30.388928317955997, + "grad_norm": 
0.0076075647957623005, + "learning_rate": 6.962356281050391e-05, + "loss": 0.020325055718421935, + "step": 214090 + }, + { + "epoch": 30.390347764371896, + "grad_norm": 0.07389630377292633, + "learning_rate": 6.962214336408801e-05, + "loss": 0.0016705218702554702, + "step": 214100 + }, + { + "epoch": 30.391767210787794, + "grad_norm": 0.7510613799095154, + "learning_rate": 6.962072391767212e-05, + "loss": 0.022483274340629578, + "step": 214110 + }, + { + "epoch": 30.39318665720369, + "grad_norm": 7.05588436126709, + "learning_rate": 6.96194464158978e-05, + "loss": 0.018426933884620668, + "step": 214120 + }, + { + "epoch": 30.394606103619587, + "grad_norm": 0.023326314985752106, + "learning_rate": 6.961802696948191e-05, + "loss": 0.0011625137180089951, + "step": 214130 + }, + { + "epoch": 30.396025550035485, + "grad_norm": 0.07159880548715591, + "learning_rate": 6.961660752306601e-05, + "loss": 0.01350502371788025, + "step": 214140 + }, + { + "epoch": 30.397444996451384, + "grad_norm": 0.05867705121636391, + "learning_rate": 6.961518807665011e-05, + "loss": 0.003032075986266136, + "step": 214150 + }, + { + "epoch": 30.398864442867282, + "grad_norm": 0.3322239816188812, + "learning_rate": 6.961376863023421e-05, + "loss": 0.01077951192855835, + "step": 214160 + }, + { + "epoch": 30.40028388928318, + "grad_norm": 2.5156867504119873, + "learning_rate": 6.961234918381832e-05, + "loss": 0.026361608505249025, + "step": 214170 + }, + { + "epoch": 30.40170333569908, + "grad_norm": 0.21635474264621735, + "learning_rate": 6.961092973740242e-05, + "loss": 0.027087679505348204, + "step": 214180 + }, + { + "epoch": 30.403122782114973, + "grad_norm": 0.9410694241523743, + "learning_rate": 6.960951029098653e-05, + "loss": 0.011853384226560593, + "step": 214190 + }, + { + "epoch": 30.40454222853087, + "grad_norm": 0.3173334002494812, + "learning_rate": 6.960809084457062e-05, + "loss": 0.018872012197971345, + "step": 214200 + }, + { + "epoch": 30.40596167494677, + "grad_norm": 
0.022918779402971268, + "learning_rate": 6.960667139815472e-05, + "loss": 0.002634742483496666, + "step": 214210 + }, + { + "epoch": 30.40738112136267, + "grad_norm": 0.09162935614585876, + "learning_rate": 6.960525195173883e-05, + "loss": 0.025371426343917848, + "step": 214220 + }, + { + "epoch": 30.408800567778567, + "grad_norm": 0.1775810569524765, + "learning_rate": 6.960383250532293e-05, + "loss": 0.012070011347532272, + "step": 214230 + }, + { + "epoch": 30.410220014194465, + "grad_norm": 7.228017807006836, + "learning_rate": 6.960241305890704e-05, + "loss": 0.025345572829246522, + "step": 214240 + }, + { + "epoch": 30.411639460610363, + "grad_norm": 0.2933479845523834, + "learning_rate": 6.960099361249112e-05, + "loss": 0.037689656019210815, + "step": 214250 + }, + { + "epoch": 30.413058907026258, + "grad_norm": 0.041505537927150726, + "learning_rate": 6.959957416607524e-05, + "loss": 0.022548122704029082, + "step": 214260 + }, + { + "epoch": 30.414478353442156, + "grad_norm": 0.7912554144859314, + "learning_rate": 6.959815471965933e-05, + "loss": 0.010299193114042282, + "step": 214270 + }, + { + "epoch": 30.415897799858055, + "grad_norm": 0.13897977769374847, + "learning_rate": 6.959673527324344e-05, + "loss": 0.015503749251365662, + "step": 214280 + }, + { + "epoch": 30.417317246273953, + "grad_norm": 0.8370093703269958, + "learning_rate": 6.959531582682754e-05, + "loss": 0.007623147219419479, + "step": 214290 + }, + { + "epoch": 30.41873669268985, + "grad_norm": 0.21673697233200073, + "learning_rate": 6.959389638041164e-05, + "loss": 0.02917708456516266, + "step": 214300 + }, + { + "epoch": 30.42015613910575, + "grad_norm": 0.05493646860122681, + "learning_rate": 6.959247693399575e-05, + "loss": 0.004624433070421219, + "step": 214310 + }, + { + "epoch": 30.421575585521648, + "grad_norm": 22.156347274780273, + "learning_rate": 6.959105748757985e-05, + "loss": 0.011143902689218521, + "step": 214320 + }, + { + "epoch": 30.422995031937543, + "grad_norm": 
9.066140174865723, + "learning_rate": 6.958963804116396e-05, + "loss": 0.007907848805189133, + "step": 214330 + }, + { + "epoch": 30.42441447835344, + "grad_norm": 0.33706438541412354, + "learning_rate": 6.958821859474805e-05, + "loss": 0.032618346810340884, + "step": 214340 + }, + { + "epoch": 30.42583392476934, + "grad_norm": 15.068604469299316, + "learning_rate": 6.958679914833215e-05, + "loss": 0.01799548715353012, + "step": 214350 + }, + { + "epoch": 30.427253371185238, + "grad_norm": 4.066751003265381, + "learning_rate": 6.958537970191625e-05, + "loss": 0.011215402185916901, + "step": 214360 + }, + { + "epoch": 30.428672817601136, + "grad_norm": 0.5836976766586304, + "learning_rate": 6.958396025550036e-05, + "loss": 0.0027244996279478075, + "step": 214370 + }, + { + "epoch": 30.430092264017034, + "grad_norm": 0.024863647297024727, + "learning_rate": 6.958254080908446e-05, + "loss": 0.0022091135382652284, + "step": 214380 + }, + { + "epoch": 30.431511710432932, + "grad_norm": 0.013804375194013119, + "learning_rate": 6.958112136266857e-05, + "loss": 0.006755609810352325, + "step": 214390 + }, + { + "epoch": 30.432931156848827, + "grad_norm": 0.004607176408171654, + "learning_rate": 6.957970191625267e-05, + "loss": 0.03358984887599945, + "step": 214400 + }, + { + "epoch": 30.434350603264726, + "grad_norm": 0.036380231380462646, + "learning_rate": 6.957828246983676e-05, + "loss": 0.00778810828924179, + "step": 214410 + }, + { + "epoch": 30.435770049680624, + "grad_norm": 11.742752075195312, + "learning_rate": 6.957686302342087e-05, + "loss": 0.01929762065410614, + "step": 214420 + }, + { + "epoch": 30.437189496096522, + "grad_norm": 0.04998306185007095, + "learning_rate": 6.957544357700497e-05, + "loss": 0.008706947416067123, + "step": 214430 + }, + { + "epoch": 30.43860894251242, + "grad_norm": 0.019992351531982422, + "learning_rate": 6.957402413058908e-05, + "loss": 0.00816059485077858, + "step": 214440 + }, + { + "epoch": 30.44002838892832, + "grad_norm": 
8.047316551208496, + "learning_rate": 6.957260468417317e-05, + "loss": 0.026600125432014465, + "step": 214450 + }, + { + "epoch": 30.441447835344217, + "grad_norm": 2.228032350540161, + "learning_rate": 6.957118523775728e-05, + "loss": 0.0117687925696373, + "step": 214460 + }, + { + "epoch": 30.442867281760112, + "grad_norm": 7.269008159637451, + "learning_rate": 6.956976579134137e-05, + "loss": 0.02306679040193558, + "step": 214470 + }, + { + "epoch": 30.44428672817601, + "grad_norm": 0.8149417042732239, + "learning_rate": 6.956834634492549e-05, + "loss": 0.017473000288009643, + "step": 214480 + }, + { + "epoch": 30.44570617459191, + "grad_norm": 0.11564302444458008, + "learning_rate": 6.956692689850958e-05, + "loss": 0.03778222501277924, + "step": 214490 + }, + { + "epoch": 30.447125621007807, + "grad_norm": 1.6095525026321411, + "learning_rate": 6.95655074520937e-05, + "loss": 0.0492600291967392, + "step": 214500 + }, + { + "epoch": 30.447125621007807, + "eval_accuracy": 0.9845488650092198, + "eval_loss": 0.0612000934779644, + "eval_runtime": 34.0208, + "eval_samples_per_second": 462.276, + "eval_steps_per_second": 14.462, + "step": 214500 + }, + { + "epoch": 30.448545067423705, + "grad_norm": 6.4104814529418945, + "learning_rate": 6.956408800567779e-05, + "loss": 0.011859998852014542, + "step": 214510 + }, + { + "epoch": 30.449964513839603, + "grad_norm": 0.6298443675041199, + "learning_rate": 6.956266855926189e-05, + "loss": 0.027115833759307862, + "step": 214520 + }, + { + "epoch": 30.4513839602555, + "grad_norm": 13.909955978393555, + "learning_rate": 6.9561249112846e-05, + "loss": 0.018700408935546874, + "step": 214530 + }, + { + "epoch": 30.4528034066714, + "grad_norm": 5.5668230056762695, + "learning_rate": 6.95598296664301e-05, + "loss": 0.011013130843639373, + "step": 214540 + }, + { + "epoch": 30.454222853087295, + "grad_norm": 0.015791604295372963, + "learning_rate": 6.955841022001421e-05, + "loss": 0.04098716974258423, + "step": 214550 + }, + { + 
"epoch": 30.455642299503193, + "grad_norm": 0.6011185050010681, + "learning_rate": 6.955699077359829e-05, + "loss": 0.014686641097068787, + "step": 214560 + }, + { + "epoch": 30.45706174591909, + "grad_norm": 1.1975884437561035, + "learning_rate": 6.95555713271824e-05, + "loss": 0.0140543133020401, + "step": 214570 + }, + { + "epoch": 30.45848119233499, + "grad_norm": 0.3401302695274353, + "learning_rate": 6.95541518807665e-05, + "loss": 0.018630969524383544, + "step": 214580 + }, + { + "epoch": 30.459900638750888, + "grad_norm": 0.006957430392503738, + "learning_rate": 6.955273243435061e-05, + "loss": 0.013431251049041748, + "step": 214590 + }, + { + "epoch": 30.461320085166786, + "grad_norm": 0.03871683403849602, + "learning_rate": 6.955131298793471e-05, + "loss": 0.013655215501785278, + "step": 214600 + }, + { + "epoch": 30.462739531582685, + "grad_norm": 0.6758370399475098, + "learning_rate": 6.95498935415188e-05, + "loss": 0.011157501488924026, + "step": 214610 + }, + { + "epoch": 30.46415897799858, + "grad_norm": 10.228954315185547, + "learning_rate": 6.954847409510292e-05, + "loss": 0.00819041281938553, + "step": 214620 + }, + { + "epoch": 30.465578424414478, + "grad_norm": 5.065550327301025, + "learning_rate": 6.954705464868701e-05, + "loss": 0.0050915654748678206, + "step": 214630 + }, + { + "epoch": 30.466997870830376, + "grad_norm": 0.9099671840667725, + "learning_rate": 6.954563520227113e-05, + "loss": 0.0013467881828546524, + "step": 214640 + }, + { + "epoch": 30.468417317246274, + "grad_norm": 0.7846859097480774, + "learning_rate": 6.954421575585522e-05, + "loss": 0.022486016154289246, + "step": 214650 + }, + { + "epoch": 30.469836763662173, + "grad_norm": 0.07558999210596085, + "learning_rate": 6.954279630943932e-05, + "loss": 0.02465900033712387, + "step": 214660 + }, + { + "epoch": 30.47125621007807, + "grad_norm": 2.947685480117798, + "learning_rate": 6.954137686302342e-05, + "loss": 0.007743225246667862, + "step": 214670 + }, + { + "epoch": 
30.47267565649397, + "grad_norm": 0.2624059021472931, + "learning_rate": 6.953995741660753e-05, + "loss": 0.00431131087243557, + "step": 214680 + }, + { + "epoch": 30.474095102909864, + "grad_norm": 0.04996887594461441, + "learning_rate": 6.953853797019163e-05, + "loss": 0.001460055634379387, + "step": 214690 + }, + { + "epoch": 30.475514549325762, + "grad_norm": 0.09691139310598373, + "learning_rate": 6.953711852377574e-05, + "loss": 0.04105641841888428, + "step": 214700 + }, + { + "epoch": 30.47693399574166, + "grad_norm": 0.18984052538871765, + "learning_rate": 6.953569907735983e-05, + "loss": 0.00867714211344719, + "step": 214710 + }, + { + "epoch": 30.47835344215756, + "grad_norm": 15.392823219299316, + "learning_rate": 6.953427963094393e-05, + "loss": 0.01060575246810913, + "step": 214720 + }, + { + "epoch": 30.479772888573457, + "grad_norm": 1.4904887676239014, + "learning_rate": 6.953286018452804e-05, + "loss": 0.017408435046672822, + "step": 214730 + }, + { + "epoch": 30.481192334989355, + "grad_norm": 1.9215072393417358, + "learning_rate": 6.953144073811214e-05, + "loss": 0.0051240168511867525, + "step": 214740 + }, + { + "epoch": 30.482611781405254, + "grad_norm": 0.04816721752285957, + "learning_rate": 6.953002129169625e-05, + "loss": 0.006261418759822846, + "step": 214750 + }, + { + "epoch": 30.48403122782115, + "grad_norm": 0.05224836990237236, + "learning_rate": 6.952860184528033e-05, + "loss": 0.03134201169013977, + "step": 214760 + }, + { + "epoch": 30.485450674237047, + "grad_norm": 0.03349020332098007, + "learning_rate": 6.952718239886445e-05, + "loss": 0.02433360517024994, + "step": 214770 + }, + { + "epoch": 30.486870120652945, + "grad_norm": 2.002140760421753, + "learning_rate": 6.952576295244854e-05, + "loss": 0.03227897882461548, + "step": 214780 + }, + { + "epoch": 30.488289567068843, + "grad_norm": 5.991812705993652, + "learning_rate": 6.952434350603265e-05, + "loss": 0.023759786784648896, + "step": 214790 + }, + { + "epoch": 
30.48970901348474, + "grad_norm": 0.14116746187210083, + "learning_rate": 6.952292405961675e-05, + "loss": 0.005670697614550591, + "step": 214800 + }, + { + "epoch": 30.49112845990064, + "grad_norm": 0.11713194847106934, + "learning_rate": 6.952150461320085e-05, + "loss": 0.019591839611530305, + "step": 214810 + }, + { + "epoch": 30.49254790631654, + "grad_norm": 0.027771558612585068, + "learning_rate": 6.952008516678496e-05, + "loss": 0.026435551047325135, + "step": 214820 + }, + { + "epoch": 30.493967352732433, + "grad_norm": 0.785439670085907, + "learning_rate": 6.951866572036906e-05, + "loss": 0.010965496301651001, + "step": 214830 + }, + { + "epoch": 30.49538679914833, + "grad_norm": 0.010200303979218006, + "learning_rate": 6.951724627395317e-05, + "loss": 0.005812918767333031, + "step": 214840 + }, + { + "epoch": 30.49680624556423, + "grad_norm": 17.71927261352539, + "learning_rate": 6.951582682753726e-05, + "loss": 0.02433340549468994, + "step": 214850 + }, + { + "epoch": 30.498225691980128, + "grad_norm": 0.04332356154918671, + "learning_rate": 6.951440738112138e-05, + "loss": 0.002726107090711594, + "step": 214860 + }, + { + "epoch": 30.499645138396026, + "grad_norm": 8.889148712158203, + "learning_rate": 6.951298793470546e-05, + "loss": 0.03503900170326233, + "step": 214870 + }, + { + "epoch": 30.501064584811925, + "grad_norm": 0.0611199215054512, + "learning_rate": 6.951156848828957e-05, + "loss": 0.015433949232101441, + "step": 214880 + }, + { + "epoch": 30.502484031227823, + "grad_norm": 0.08437636494636536, + "learning_rate": 6.951014904187367e-05, + "loss": 0.0017985548824071885, + "step": 214890 + }, + { + "epoch": 30.503903477643718, + "grad_norm": 6.381248950958252, + "learning_rate": 6.950872959545778e-05, + "loss": 0.01994054913520813, + "step": 214900 + }, + { + "epoch": 30.505322924059616, + "grad_norm": 0.04598463326692581, + "learning_rate": 6.950731014904188e-05, + "loss": 0.015186133980751037, + "step": 214910 + }, + { + "epoch": 
30.506742370475514, + "grad_norm": 0.07534061372280121, + "learning_rate": 6.950589070262597e-05, + "loss": 0.036519107222557065, + "step": 214920 + }, + { + "epoch": 30.508161816891413, + "grad_norm": 0.0095033198595047, + "learning_rate": 6.950447125621008e-05, + "loss": 0.007881461083889008, + "step": 214930 + }, + { + "epoch": 30.50958126330731, + "grad_norm": 0.23439475893974304, + "learning_rate": 6.950305180979418e-05, + "loss": 0.0062052026391029354, + "step": 214940 + }, + { + "epoch": 30.51100070972321, + "grad_norm": 0.6313900947570801, + "learning_rate": 6.950163236337829e-05, + "loss": 0.0045454870909452435, + "step": 214950 + }, + { + "epoch": 30.512420156139108, + "grad_norm": 0.5481003522872925, + "learning_rate": 6.950021291696239e-05, + "loss": 0.00956513062119484, + "step": 214960 + }, + { + "epoch": 30.513839602555002, + "grad_norm": 0.01543770357966423, + "learning_rate": 6.949879347054649e-05, + "loss": 0.059159159660339355, + "step": 214970 + }, + { + "epoch": 30.5152590489709, + "grad_norm": 0.12125207483768463, + "learning_rate": 6.949737402413058e-05, + "loss": 0.007341686636209488, + "step": 214980 + }, + { + "epoch": 30.5166784953868, + "grad_norm": 0.007960792630910873, + "learning_rate": 6.94959545777147e-05, + "loss": 0.003928522393107414, + "step": 214990 + }, + { + "epoch": 30.518097941802697, + "grad_norm": 16.601856231689453, + "learning_rate": 6.949453513129879e-05, + "loss": 0.01142132431268692, + "step": 215000 + }, + { + "epoch": 30.518097941802697, + "eval_accuracy": 0.986837922044891, + "eval_loss": 0.0508023276925087, + "eval_runtime": 33.5861, + "eval_samples_per_second": 468.259, + "eval_steps_per_second": 14.649, + "step": 215000 + }, + { + "epoch": 30.519517388218595, + "grad_norm": 0.04449477046728134, + "learning_rate": 6.94931156848829e-05, + "loss": 0.025666582584381103, + "step": 215010 + }, + { + "epoch": 30.520936834634494, + "grad_norm": 13.680166244506836, + "learning_rate": 6.9491696238467e-05, + "loss": 
0.024465903639793396, + "step": 215020 + }, + { + "epoch": 30.522356281050392, + "grad_norm": 0.10861390084028244, + "learning_rate": 6.94902767920511e-05, + "loss": 0.018058516085147858, + "step": 215030 + }, + { + "epoch": 30.523775727466287, + "grad_norm": 0.259863018989563, + "learning_rate": 6.948885734563521e-05, + "loss": 0.009704206883907319, + "step": 215040 + }, + { + "epoch": 30.525195173882185, + "grad_norm": 0.028500951826572418, + "learning_rate": 6.948743789921931e-05, + "loss": 0.0046527616679668425, + "step": 215050 + }, + { + "epoch": 30.526614620298083, + "grad_norm": 0.05175289884209633, + "learning_rate": 6.948601845280342e-05, + "loss": 0.011684497445821762, + "step": 215060 + }, + { + "epoch": 30.528034066713982, + "grad_norm": 0.03419538214802742, + "learning_rate": 6.94845990063875e-05, + "loss": 0.008192062377929688, + "step": 215070 + }, + { + "epoch": 30.52945351312988, + "grad_norm": 0.04043741151690483, + "learning_rate": 6.948317955997161e-05, + "loss": 0.0008646048605442047, + "step": 215080 + }, + { + "epoch": 30.53087295954578, + "grad_norm": 0.33776646852493286, + "learning_rate": 6.948176011355571e-05, + "loss": 0.001237635686993599, + "step": 215090 + }, + { + "epoch": 30.532292405961677, + "grad_norm": 0.012484145350754261, + "learning_rate": 6.948034066713982e-05, + "loss": 0.0037085186690092088, + "step": 215100 + }, + { + "epoch": 30.53371185237757, + "grad_norm": 0.8905991911888123, + "learning_rate": 6.947892122072392e-05, + "loss": 0.009193854779005051, + "step": 215110 + }, + { + "epoch": 30.53513129879347, + "grad_norm": 0.03541100025177002, + "learning_rate": 6.947750177430802e-05, + "loss": 0.014093232154846192, + "step": 215120 + }, + { + "epoch": 30.536550745209368, + "grad_norm": 6.721506595611572, + "learning_rate": 6.947608232789213e-05, + "loss": 0.012281638383865357, + "step": 215130 + }, + { + "epoch": 30.537970191625266, + "grad_norm": 0.09717497229576111, + "learning_rate": 6.947466288147622e-05, + "loss": 
0.004601963981986046, + "step": 215140 + }, + { + "epoch": 30.539389638041165, + "grad_norm": 0.012417415156960487, + "learning_rate": 6.947324343506034e-05, + "loss": 0.01614934206008911, + "step": 215150 + }, + { + "epoch": 30.540809084457063, + "grad_norm": 5.768606662750244, + "learning_rate": 6.947182398864443e-05, + "loss": 0.010061257332563401, + "step": 215160 + }, + { + "epoch": 30.54222853087296, + "grad_norm": 6.349483013153076, + "learning_rate": 6.947040454222853e-05, + "loss": 0.025248020887374878, + "step": 215170 + }, + { + "epoch": 30.543647977288856, + "grad_norm": 12.8591890335083, + "learning_rate": 6.946898509581263e-05, + "loss": 0.012160910665988922, + "step": 215180 + }, + { + "epoch": 30.545067423704754, + "grad_norm": 2.1533761024475098, + "learning_rate": 6.946756564939674e-05, + "loss": 0.0056957196444273, + "step": 215190 + }, + { + "epoch": 30.546486870120653, + "grad_norm": 0.0659998431801796, + "learning_rate": 6.946614620298084e-05, + "loss": 0.001416967436671257, + "step": 215200 + }, + { + "epoch": 30.54790631653655, + "grad_norm": 0.8730418086051941, + "learning_rate": 6.946472675656495e-05, + "loss": 0.0048255927860736845, + "step": 215210 + }, + { + "epoch": 30.54932576295245, + "grad_norm": 0.19772323966026306, + "learning_rate": 6.946330731014904e-05, + "loss": 0.000832396000623703, + "step": 215220 + }, + { + "epoch": 30.550745209368348, + "grad_norm": 9.544881820678711, + "learning_rate": 6.946188786373314e-05, + "loss": 0.05045414566993713, + "step": 215230 + }, + { + "epoch": 30.552164655784246, + "grad_norm": 7.817386627197266, + "learning_rate": 6.946046841731725e-05, + "loss": 0.007778728008270263, + "step": 215240 + }, + { + "epoch": 30.55358410220014, + "grad_norm": 0.10844562202692032, + "learning_rate": 6.945904897090135e-05, + "loss": 0.008617138862609864, + "step": 215250 + }, + { + "epoch": 30.55500354861604, + "grad_norm": 0.2658809423446655, + "learning_rate": 6.945762952448546e-05, + "loss": 
0.011942073702812195, + "step": 215260 + }, + { + "epoch": 30.556422995031937, + "grad_norm": 0.03642828017473221, + "learning_rate": 6.945621007806956e-05, + "loss": 0.007899542897939682, + "step": 215270 + }, + { + "epoch": 30.557842441447836, + "grad_norm": 0.01671171747148037, + "learning_rate": 6.945479063165366e-05, + "loss": 0.002441328763961792, + "step": 215280 + }, + { + "epoch": 30.559261887863734, + "grad_norm": 0.05424909666180611, + "learning_rate": 6.945337118523775e-05, + "loss": 0.0029842182993888855, + "step": 215290 + }, + { + "epoch": 30.560681334279632, + "grad_norm": 0.018333595246076584, + "learning_rate": 6.945195173882186e-05, + "loss": 0.008060573041439057, + "step": 215300 + }, + { + "epoch": 30.56210078069553, + "grad_norm": 0.010020492598414421, + "learning_rate": 6.945053229240596e-05, + "loss": 0.02215612232685089, + "step": 215310 + }, + { + "epoch": 30.563520227111425, + "grad_norm": 0.20167812705039978, + "learning_rate": 6.944911284599007e-05, + "loss": 0.006655065715312958, + "step": 215320 + }, + { + "epoch": 30.564939673527324, + "grad_norm": 13.17483901977539, + "learning_rate": 6.944769339957417e-05, + "loss": 0.008239691704511642, + "step": 215330 + }, + { + "epoch": 30.566359119943222, + "grad_norm": 0.03308369591832161, + "learning_rate": 6.944627395315827e-05, + "loss": 0.007047249376773835, + "step": 215340 + }, + { + "epoch": 30.56777856635912, + "grad_norm": 1.5259130001068115, + "learning_rate": 6.944485450674238e-05, + "loss": 0.03426990509033203, + "step": 215350 + }, + { + "epoch": 30.56919801277502, + "grad_norm": 0.024667872115969658, + "learning_rate": 6.944343506032647e-05, + "loss": 0.003893521428108215, + "step": 215360 + }, + { + "epoch": 30.570617459190917, + "grad_norm": 0.011464103125035763, + "learning_rate": 6.944201561391059e-05, + "loss": 0.020162414014339446, + "step": 215370 + }, + { + "epoch": 30.572036905606815, + "grad_norm": 0.10198130458593369, + "learning_rate": 6.944059616749467e-05, + 
"loss": 0.022886180877685548, + "step": 215380 + }, + { + "epoch": 30.57345635202271, + "grad_norm": 2.5585315227508545, + "learning_rate": 6.943917672107878e-05, + "loss": 0.01082737147808075, + "step": 215390 + }, + { + "epoch": 30.574875798438608, + "grad_norm": 0.6638144254684448, + "learning_rate": 6.943775727466288e-05, + "loss": 0.014424093067646027, + "step": 215400 + }, + { + "epoch": 30.576295244854506, + "grad_norm": 0.03662014752626419, + "learning_rate": 6.943633782824699e-05, + "loss": 0.003103240579366684, + "step": 215410 + }, + { + "epoch": 30.577714691270405, + "grad_norm": 0.5032156705856323, + "learning_rate": 6.94349183818311e-05, + "loss": 0.02952774167060852, + "step": 215420 + }, + { + "epoch": 30.579134137686303, + "grad_norm": 0.2974824011325836, + "learning_rate": 6.943349893541518e-05, + "loss": 0.009482115507125854, + "step": 215430 + }, + { + "epoch": 30.5805535841022, + "grad_norm": 0.028143223375082016, + "learning_rate": 6.94320794889993e-05, + "loss": 0.004156490042805672, + "step": 215440 + }, + { + "epoch": 30.5819730305181, + "grad_norm": 0.04918726906180382, + "learning_rate": 6.943066004258339e-05, + "loss": 0.001059601828455925, + "step": 215450 + }, + { + "epoch": 30.583392476933994, + "grad_norm": 0.7781400084495544, + "learning_rate": 6.94292405961675e-05, + "loss": 0.019697165489196776, + "step": 215460 + }, + { + "epoch": 30.584811923349893, + "grad_norm": 0.016945699229836464, + "learning_rate": 6.94278211497516e-05, + "loss": 0.0068174049258232115, + "step": 215470 + }, + { + "epoch": 30.58623136976579, + "grad_norm": 0.002212497405707836, + "learning_rate": 6.94264017033357e-05, + "loss": 0.0016288112848997117, + "step": 215480 + }, + { + "epoch": 30.58765081618169, + "grad_norm": 2.186424970626831, + "learning_rate": 6.94249822569198e-05, + "loss": 0.003649449348449707, + "step": 215490 + }, + { + "epoch": 30.589070262597588, + "grad_norm": 0.3171271085739136, + "learning_rate": 6.94235628105039e-05, + "loss": 
0.014193849265575409, + "step": 215500 + }, + { + "epoch": 30.589070262597588, + "eval_accuracy": 0.9848032046798499, + "eval_loss": 0.06065487861633301, + "eval_runtime": 33.7056, + "eval_samples_per_second": 466.598, + "eval_steps_per_second": 14.597, + "step": 215500 + }, + { + "epoch": 30.590489709013486, + "grad_norm": 5.931843280792236, + "learning_rate": 6.942214336408802e-05, + "loss": 0.05329040288925171, + "step": 215510 + }, + { + "epoch": 30.591909155429384, + "grad_norm": 3.0509066581726074, + "learning_rate": 6.942072391767211e-05, + "loss": 0.018586663901805876, + "step": 215520 + }, + { + "epoch": 30.59332860184528, + "grad_norm": 0.11098844558000565, + "learning_rate": 6.941930447125621e-05, + "loss": 0.015715691447257995, + "step": 215530 + }, + { + "epoch": 30.594748048261177, + "grad_norm": 0.08572791516780853, + "learning_rate": 6.941788502484031e-05, + "loss": 0.023678602278232576, + "step": 215540 + }, + { + "epoch": 30.596167494677076, + "grad_norm": 2.566216230392456, + "learning_rate": 6.941646557842442e-05, + "loss": 0.01806049793958664, + "step": 215550 + }, + { + "epoch": 30.597586941092974, + "grad_norm": 12.6263427734375, + "learning_rate": 6.941504613200852e-05, + "loss": 0.02070556432008743, + "step": 215560 + }, + { + "epoch": 30.599006387508872, + "grad_norm": 1.3289250135421753, + "learning_rate": 6.941362668559263e-05, + "loss": 0.01684604287147522, + "step": 215570 + }, + { + "epoch": 30.60042583392477, + "grad_norm": 2.7052648067474365, + "learning_rate": 6.941220723917673e-05, + "loss": 0.02493312507867813, + "step": 215580 + }, + { + "epoch": 30.60184528034067, + "grad_norm": 1.17116117477417, + "learning_rate": 6.941078779276082e-05, + "loss": 0.042052140831947325, + "step": 215590 + }, + { + "epoch": 30.603264726756564, + "grad_norm": 0.22484518587589264, + "learning_rate": 6.940936834634493e-05, + "loss": 0.011887797713279724, + "step": 215600 + }, + { + "epoch": 30.604684173172462, + "grad_norm": 0.018579009920358658, + 
"learning_rate": 6.940794889992903e-05, + "loss": 0.023867869377136232, + "step": 215610 + }, + { + "epoch": 30.60610361958836, + "grad_norm": 0.867328405380249, + "learning_rate": 6.940652945351314e-05, + "loss": 0.03296869397163391, + "step": 215620 + }, + { + "epoch": 30.60752306600426, + "grad_norm": 0.07785506546497345, + "learning_rate": 6.940511000709724e-05, + "loss": 0.010638514161109924, + "step": 215630 + }, + { + "epoch": 30.608942512420157, + "grad_norm": 1.653104305267334, + "learning_rate": 6.940369056068134e-05, + "loss": 0.008836449682712555, + "step": 215640 + }, + { + "epoch": 30.610361958836055, + "grad_norm": 10.957831382751465, + "learning_rate": 6.940227111426543e-05, + "loss": 0.014841094613075256, + "step": 215650 + }, + { + "epoch": 30.611781405251953, + "grad_norm": 30.76253318786621, + "learning_rate": 6.940085166784955e-05, + "loss": 0.0590395450592041, + "step": 215660 + }, + { + "epoch": 30.613200851667848, + "grad_norm": 0.06368537992238998, + "learning_rate": 6.939943222143364e-05, + "loss": 0.026760390400886534, + "step": 215670 + }, + { + "epoch": 30.614620298083747, + "grad_norm": 0.024164579808712006, + "learning_rate": 6.939801277501775e-05, + "loss": 0.009311264008283615, + "step": 215680 + }, + { + "epoch": 30.616039744499645, + "grad_norm": 0.008622519671916962, + "learning_rate": 6.939659332860185e-05, + "loss": 0.005990379676222801, + "step": 215690 + }, + { + "epoch": 30.617459190915543, + "grad_norm": 1.38267982006073, + "learning_rate": 6.939517388218595e-05, + "loss": 0.015695010125637055, + "step": 215700 + }, + { + "epoch": 30.61887863733144, + "grad_norm": 0.24842244386672974, + "learning_rate": 6.939375443577006e-05, + "loss": 0.029893767833709717, + "step": 215710 + }, + { + "epoch": 30.62029808374734, + "grad_norm": 0.02505352906882763, + "learning_rate": 6.939233498935416e-05, + "loss": 0.005859995260834694, + "step": 215720 + }, + { + "epoch": 30.621717530163238, + "grad_norm": 0.01698969677090645, + 
"learning_rate": 6.939091554293827e-05, + "loss": 0.0036163754761219026, + "step": 215730 + }, + { + "epoch": 30.623136976579133, + "grad_norm": 6.881640911102295, + "learning_rate": 6.938949609652235e-05, + "loss": 0.004137242585420609, + "step": 215740 + }, + { + "epoch": 30.62455642299503, + "grad_norm": 0.03463999554514885, + "learning_rate": 6.938807665010646e-05, + "loss": 0.0029571570456027986, + "step": 215750 + }, + { + "epoch": 30.62597586941093, + "grad_norm": 0.027694126591086388, + "learning_rate": 6.938665720369056e-05, + "loss": 0.004595466703176498, + "step": 215760 + }, + { + "epoch": 30.627395315826828, + "grad_norm": 0.007551881484687328, + "learning_rate": 6.938523775727467e-05, + "loss": 0.03805612325668335, + "step": 215770 + }, + { + "epoch": 30.628814762242726, + "grad_norm": 1.4271334409713745, + "learning_rate": 6.938381831085877e-05, + "loss": 0.013677330315113067, + "step": 215780 + }, + { + "epoch": 30.630234208658624, + "grad_norm": 13.850367546081543, + "learning_rate": 6.938239886444287e-05, + "loss": 0.012633289396762847, + "step": 215790 + }, + { + "epoch": 30.631653655074523, + "grad_norm": 0.10316607356071472, + "learning_rate": 6.938097941802698e-05, + "loss": 0.0041380409151315686, + "step": 215800 + }, + { + "epoch": 30.633073101490417, + "grad_norm": 0.31673645973205566, + "learning_rate": 6.937955997161107e-05, + "loss": 0.002324211969971657, + "step": 215810 + }, + { + "epoch": 30.634492547906316, + "grad_norm": 0.45479199290275574, + "learning_rate": 6.937814052519518e-05, + "loss": 0.02166728675365448, + "step": 215820 + }, + { + "epoch": 30.635911994322214, + "grad_norm": 0.03127999231219292, + "learning_rate": 6.937672107877928e-05, + "loss": 0.0052260149270296095, + "step": 215830 + }, + { + "epoch": 30.637331440738112, + "grad_norm": 0.1631818264722824, + "learning_rate": 6.937530163236338e-05, + "loss": 0.004121894389390946, + "step": 215840 + }, + { + "epoch": 30.63875088715401, + "grad_norm": 0.013541055843234062, 
+ "learning_rate": 6.937388218594748e-05, + "loss": 0.022102929651737213, + "step": 215850 + }, + { + "epoch": 30.64017033356991, + "grad_norm": 0.1088729053735733, + "learning_rate": 6.937246273953159e-05, + "loss": 0.002510332316160202, + "step": 215860 + }, + { + "epoch": 30.641589779985807, + "grad_norm": 0.019219422712922096, + "learning_rate": 6.937104329311569e-05, + "loss": 0.01303250789642334, + "step": 215870 + }, + { + "epoch": 30.643009226401702, + "grad_norm": 0.07179081439971924, + "learning_rate": 6.93696238466998e-05, + "loss": 0.061884617805480956, + "step": 215880 + }, + { + "epoch": 30.6444286728176, + "grad_norm": 0.29962238669395447, + "learning_rate": 6.93682044002839e-05, + "loss": 0.0051975205540657045, + "step": 215890 + }, + { + "epoch": 30.6458481192335, + "grad_norm": 0.06874889880418777, + "learning_rate": 6.936678495386799e-05, + "loss": 0.03180159628391266, + "step": 215900 + }, + { + "epoch": 30.647267565649397, + "grad_norm": 0.3891178071498871, + "learning_rate": 6.93653655074521e-05, + "loss": 0.0029957223683595656, + "step": 215910 + }, + { + "epoch": 30.648687012065295, + "grad_norm": 0.00969015434384346, + "learning_rate": 6.93639460610362e-05, + "loss": 0.007266192883253098, + "step": 215920 + }, + { + "epoch": 30.650106458481194, + "grad_norm": 0.05472289025783539, + "learning_rate": 6.936252661462031e-05, + "loss": 0.0026956070214509964, + "step": 215930 + }, + { + "epoch": 30.651525904897092, + "grad_norm": 2.13490891456604, + "learning_rate": 6.936110716820441e-05, + "loss": 0.005337231233716011, + "step": 215940 + }, + { + "epoch": 30.652945351312987, + "grad_norm": 1.0783706903457642, + "learning_rate": 6.93596877217885e-05, + "loss": 0.001333685964345932, + "step": 215950 + }, + { + "epoch": 30.654364797728885, + "grad_norm": 8.17844295501709, + "learning_rate": 6.93582682753726e-05, + "loss": 0.013849309086799622, + "step": 215960 + }, + { + "epoch": 30.655784244144783, + "grad_norm": 0.3185134530067444, + 
"learning_rate": 6.935684882895671e-05, + "loss": 0.013034430146217347, + "step": 215970 + }, + { + "epoch": 30.65720369056068, + "grad_norm": 0.14608201384544373, + "learning_rate": 6.935542938254081e-05, + "loss": 0.0065985307097435, + "step": 215980 + }, + { + "epoch": 30.65862313697658, + "grad_norm": 0.08333662152290344, + "learning_rate": 6.935400993612492e-05, + "loss": 0.020336820185184477, + "step": 215990 + }, + { + "epoch": 30.660042583392478, + "grad_norm": 0.012593520805239677, + "learning_rate": 6.935259048970902e-05, + "loss": 0.0012210350483655929, + "step": 216000 + }, + { + "epoch": 30.660042583392478, + "eval_accuracy": 0.9891905639982196, + "eval_loss": 0.039019327610731125, + "eval_runtime": 33.515, + "eval_samples_per_second": 469.252, + "eval_steps_per_second": 14.68, + "step": 216000 + }, + { + "epoch": 30.661462029808376, + "grad_norm": 8.240349769592285, + "learning_rate": 6.935117104329312e-05, + "loss": 0.006092340871691704, + "step": 216010 + }, + { + "epoch": 30.66288147622427, + "grad_norm": 2.099916696548462, + "learning_rate": 6.934975159687723e-05, + "loss": 0.001627131551504135, + "step": 216020 + }, + { + "epoch": 30.66430092264017, + "grad_norm": 28.09660530090332, + "learning_rate": 6.934833215046132e-05, + "loss": 0.05124871134757995, + "step": 216030 + }, + { + "epoch": 30.665720369056068, + "grad_norm": 0.33609554171562195, + "learning_rate": 6.934691270404544e-05, + "loss": 0.02873336672782898, + "step": 216040 + }, + { + "epoch": 30.667139815471966, + "grad_norm": 7.807661056518555, + "learning_rate": 6.934549325762952e-05, + "loss": 0.015148022770881652, + "step": 216050 + }, + { + "epoch": 30.668559261887864, + "grad_norm": 0.11147323250770569, + "learning_rate": 6.934407381121363e-05, + "loss": 0.048483410477638246, + "step": 216060 + }, + { + "epoch": 30.669978708303763, + "grad_norm": 0.42620691657066345, + "learning_rate": 6.934265436479773e-05, + "loss": 0.0029811743646860124, + "step": 216070 + }, + { + "epoch": 
30.67139815471966, + "grad_norm": 0.032398320734500885, + "learning_rate": 6.934123491838184e-05, + "loss": 0.002408459782600403, + "step": 216080 + }, + { + "epoch": 30.672817601135556, + "grad_norm": 0.1525917798280716, + "learning_rate": 6.933981547196594e-05, + "loss": 0.004783783107995987, + "step": 216090 + }, + { + "epoch": 30.674237047551454, + "grad_norm": 10.711019515991211, + "learning_rate": 6.933839602555003e-05, + "loss": 0.017226217687129973, + "step": 216100 + }, + { + "epoch": 30.675656493967352, + "grad_norm": 0.02031802199780941, + "learning_rate": 6.933697657913414e-05, + "loss": 0.023125173151493074, + "step": 216110 + }, + { + "epoch": 30.67707594038325, + "grad_norm": 0.06034954637289047, + "learning_rate": 6.933555713271824e-05, + "loss": 0.015974533557891846, + "step": 216120 + }, + { + "epoch": 30.67849538679915, + "grad_norm": 0.03435724601149559, + "learning_rate": 6.933413768630235e-05, + "loss": 0.028225204348564147, + "step": 216130 + }, + { + "epoch": 30.679914833215047, + "grad_norm": 0.09610234200954437, + "learning_rate": 6.933271823988645e-05, + "loss": 0.0034337185323238375, + "step": 216140 + }, + { + "epoch": 30.681334279630946, + "grad_norm": 0.021926160901784897, + "learning_rate": 6.933129879347055e-05, + "loss": 0.009717724472284316, + "step": 216150 + }, + { + "epoch": 30.68275372604684, + "grad_norm": 1.1303986310958862, + "learning_rate": 6.932987934705464e-05, + "loss": 0.00729091688990593, + "step": 216160 + }, + { + "epoch": 30.68417317246274, + "grad_norm": 0.007496795617043972, + "learning_rate": 6.932845990063876e-05, + "loss": 0.017910251021385194, + "step": 216170 + }, + { + "epoch": 30.685592618878637, + "grad_norm": 0.04213172569870949, + "learning_rate": 6.932704045422285e-05, + "loss": 0.014742036163806916, + "step": 216180 + }, + { + "epoch": 30.687012065294535, + "grad_norm": 0.01458937581628561, + "learning_rate": 6.932562100780696e-05, + "loss": 0.008649620413780212, + "step": 216190 + }, + { + "epoch": 
30.688431511710434, + "grad_norm": 2.117666244506836, + "learning_rate": 6.932420156139106e-05, + "loss": 0.00487353429198265, + "step": 216200 + }, + { + "epoch": 30.689850958126332, + "grad_norm": 0.3752743601799011, + "learning_rate": 6.932278211497516e-05, + "loss": 0.007956020534038544, + "step": 216210 + }, + { + "epoch": 30.69127040454223, + "grad_norm": 3.3058924674987793, + "learning_rate": 6.932136266855927e-05, + "loss": 0.016490721702575685, + "step": 216220 + }, + { + "epoch": 30.692689850958125, + "grad_norm": 0.21398192644119263, + "learning_rate": 6.931994322214337e-05, + "loss": 0.002921175956726074, + "step": 216230 + }, + { + "epoch": 30.694109297374023, + "grad_norm": 0.1840810328722, + "learning_rate": 6.931852377572748e-05, + "loss": 0.0006094459444284439, + "step": 216240 + }, + { + "epoch": 30.69552874378992, + "grad_norm": 1.2637214660644531, + "learning_rate": 6.931710432931156e-05, + "loss": 0.0043856360018253325, + "step": 216250 + }, + { + "epoch": 30.69694819020582, + "grad_norm": 0.04307027533650398, + "learning_rate": 6.931568488289567e-05, + "loss": 0.0014246657490730285, + "step": 216260 + }, + { + "epoch": 30.698367636621718, + "grad_norm": 0.1001417264342308, + "learning_rate": 6.931426543647977e-05, + "loss": 0.005052326247096062, + "step": 216270 + }, + { + "epoch": 30.699787083037616, + "grad_norm": 0.6640215516090393, + "learning_rate": 6.931284599006388e-05, + "loss": 0.0165830597281456, + "step": 216280 + }, + { + "epoch": 30.701206529453515, + "grad_norm": 0.0172658022493124, + "learning_rate": 6.931142654364798e-05, + "loss": 0.013405601680278777, + "step": 216290 + }, + { + "epoch": 30.70262597586941, + "grad_norm": 0.04023309424519539, + "learning_rate": 6.931000709723209e-05, + "loss": 0.012554731965065003, + "step": 216300 + }, + { + "epoch": 30.704045422285308, + "grad_norm": 10.528226852416992, + "learning_rate": 6.930858765081619e-05, + "loss": 0.022060540318489075, + "step": 216310 + }, + { + "epoch": 
30.705464868701206, + "grad_norm": 15.095396995544434, + "learning_rate": 6.930716820440028e-05, + "loss": 0.014101970195770263, + "step": 216320 + }, + { + "epoch": 30.706884315117104, + "grad_norm": 1.3869178295135498, + "learning_rate": 6.93057487579844e-05, + "loss": 0.0066359549760818485, + "step": 216330 + }, + { + "epoch": 30.708303761533003, + "grad_norm": 15.256940841674805, + "learning_rate": 6.930432931156849e-05, + "loss": 0.007709190249443054, + "step": 216340 + }, + { + "epoch": 30.7097232079489, + "grad_norm": 0.038836318999528885, + "learning_rate": 6.93029098651526e-05, + "loss": 0.012146452069282531, + "step": 216350 + }, + { + "epoch": 30.7111426543648, + "grad_norm": 0.009355288930237293, + "learning_rate": 6.930149041873669e-05, + "loss": 0.0020309340208768843, + "step": 216360 + }, + { + "epoch": 30.712562100780694, + "grad_norm": 6.154882431030273, + "learning_rate": 6.93000709723208e-05, + "loss": 0.037825199961662295, + "step": 216370 + }, + { + "epoch": 30.713981547196592, + "grad_norm": 6.378183364868164, + "learning_rate": 6.92986515259049e-05, + "loss": 0.010407356917858124, + "step": 216380 + }, + { + "epoch": 30.71540099361249, + "grad_norm": 0.18586666882038116, + "learning_rate": 6.9297232079489e-05, + "loss": 0.06719923615455628, + "step": 216390 + }, + { + "epoch": 30.71682044002839, + "grad_norm": 19.066070556640625, + "learning_rate": 6.92958126330731e-05, + "loss": 0.02076660990715027, + "step": 216400 + }, + { + "epoch": 30.718239886444287, + "grad_norm": 14.264588356018066, + "learning_rate": 6.92943931866572e-05, + "loss": 0.015005847811698914, + "step": 216410 + }, + { + "epoch": 30.719659332860186, + "grad_norm": 0.25340279936790466, + "learning_rate": 6.929297374024131e-05, + "loss": 0.01187555193901062, + "step": 216420 + }, + { + "epoch": 30.721078779276084, + "grad_norm": 24.277320861816406, + "learning_rate": 6.929155429382541e-05, + "loss": 0.08293134570121766, + "step": 216430 + }, + { + "epoch": 30.72249822569198, 
+ "grad_norm": 2.775089740753174, + "learning_rate": 6.929013484740952e-05, + "loss": 0.04840065240859985, + "step": 216440 + }, + { + "epoch": 30.723917672107877, + "grad_norm": 2.7158355712890625, + "learning_rate": 6.928871540099362e-05, + "loss": 0.012708361446857452, + "step": 216450 + }, + { + "epoch": 30.725337118523775, + "grad_norm": 3.868049144744873, + "learning_rate": 6.928729595457771e-05, + "loss": 0.025464147329330444, + "step": 216460 + }, + { + "epoch": 30.726756564939674, + "grad_norm": 0.16629473865032196, + "learning_rate": 6.928587650816181e-05, + "loss": 0.011410051584243774, + "step": 216470 + }, + { + "epoch": 30.728176011355572, + "grad_norm": 3.5598182678222656, + "learning_rate": 6.928445706174592e-05, + "loss": 0.017509828507900237, + "step": 216480 + }, + { + "epoch": 30.72959545777147, + "grad_norm": 0.04289602115750313, + "learning_rate": 6.928303761533002e-05, + "loss": 0.0024662092328071594, + "step": 216490 + }, + { + "epoch": 30.73101490418737, + "grad_norm": 3.0040252208709717, + "learning_rate": 6.928161816891413e-05, + "loss": 0.0057405784726142885, + "step": 216500 + }, + { + "epoch": 30.73101490418737, + "eval_accuracy": 0.9855026387740828, + "eval_loss": 0.05936433747410774, + "eval_runtime": 33.8053, + "eval_samples_per_second": 465.223, + "eval_steps_per_second": 14.554, + "step": 216500 + }, + { + "epoch": 30.732434350603263, + "grad_norm": 0.08877170830965042, + "learning_rate": 6.928019872249823e-05, + "loss": 0.0208505854010582, + "step": 216510 + }, + { + "epoch": 30.73385379701916, + "grad_norm": 0.025537380948662758, + "learning_rate": 6.927877927608233e-05, + "loss": 0.023119354248046876, + "step": 216520 + }, + { + "epoch": 30.73527324343506, + "grad_norm": 9.918169975280762, + "learning_rate": 6.927735982966644e-05, + "loss": 0.029215940833091737, + "step": 216530 + }, + { + "epoch": 30.73669268985096, + "grad_norm": 0.1362747848033905, + "learning_rate": 6.927594038325053e-05, + "loss": 0.02884758710861206, + 
"step": 216540 + }, + { + "epoch": 30.738112136266857, + "grad_norm": 13.027043342590332, + "learning_rate": 6.927452093683465e-05, + "loss": 0.01962668001651764, + "step": 216550 + }, + { + "epoch": 30.739531582682755, + "grad_norm": 0.005680895876139402, + "learning_rate": 6.927324343506033e-05, + "loss": 0.040744373202323915, + "step": 216560 + }, + { + "epoch": 30.740951029098653, + "grad_norm": 0.28513410687446594, + "learning_rate": 6.927182398864444e-05, + "loss": 0.04683162569999695, + "step": 216570 + }, + { + "epoch": 30.742370475514548, + "grad_norm": 0.38527828454971313, + "learning_rate": 6.927040454222854e-05, + "loss": 0.012680888175964355, + "step": 216580 + }, + { + "epoch": 30.743789921930446, + "grad_norm": 0.35934701561927795, + "learning_rate": 6.926898509581264e-05, + "loss": 0.004312145709991455, + "step": 216590 + }, + { + "epoch": 30.745209368346345, + "grad_norm": 0.24417778849601746, + "learning_rate": 6.926756564939673e-05, + "loss": 0.0400823712348938, + "step": 216600 + }, + { + "epoch": 30.746628814762243, + "grad_norm": 0.9623720049858093, + "learning_rate": 6.926614620298084e-05, + "loss": 0.0031246017664670945, + "step": 216610 + }, + { + "epoch": 30.74804826117814, + "grad_norm": 1.8042937517166138, + "learning_rate": 6.926472675656494e-05, + "loss": 0.029795533418655394, + "step": 216620 + }, + { + "epoch": 30.74946770759404, + "grad_norm": 0.07492884248495102, + "learning_rate": 6.926330731014905e-05, + "loss": 0.00539492666721344, + "step": 216630 + }, + { + "epoch": 30.750887154009938, + "grad_norm": 0.054716505110263824, + "learning_rate": 6.926188786373315e-05, + "loss": 0.0044132772833108905, + "step": 216640 + }, + { + "epoch": 30.752306600425833, + "grad_norm": 0.09364310652017593, + "learning_rate": 6.926046841731725e-05, + "loss": 0.007484359294176101, + "step": 216650 + }, + { + "epoch": 30.75372604684173, + "grad_norm": 0.03059927374124527, + "learning_rate": 6.925904897090136e-05, + "loss": 0.017677077651023866, + 
"step": 216660 + }, + { + "epoch": 30.75514549325763, + "grad_norm": 0.038364022970199585, + "learning_rate": 6.925762952448546e-05, + "loss": 0.01382559835910797, + "step": 216670 + }, + { + "epoch": 30.756564939673527, + "grad_norm": 0.4344567060470581, + "learning_rate": 6.925621007806957e-05, + "loss": 0.028923556208610535, + "step": 216680 + }, + { + "epoch": 30.757984386089426, + "grad_norm": 0.02379538118839264, + "learning_rate": 6.925479063165365e-05, + "loss": 0.0229655385017395, + "step": 216690 + }, + { + "epoch": 30.759403832505324, + "grad_norm": 5.606290817260742, + "learning_rate": 6.925337118523776e-05, + "loss": 0.033587095141410825, + "step": 216700 + }, + { + "epoch": 30.760823278921222, + "grad_norm": 10.127408027648926, + "learning_rate": 6.925195173882186e-05, + "loss": 0.052793759107589724, + "step": 216710 + }, + { + "epoch": 30.762242725337117, + "grad_norm": 0.6015443205833435, + "learning_rate": 6.925053229240597e-05, + "loss": 0.07081390619277954, + "step": 216720 + }, + { + "epoch": 30.763662171753015, + "grad_norm": 0.021507780998945236, + "learning_rate": 6.924911284599007e-05, + "loss": 0.015285371243953705, + "step": 216730 + }, + { + "epoch": 30.765081618168914, + "grad_norm": 1.8708486557006836, + "learning_rate": 6.924769339957416e-05, + "loss": 0.0023040007799863815, + "step": 216740 + }, + { + "epoch": 30.766501064584812, + "grad_norm": 0.7235044240951538, + "learning_rate": 6.924627395315828e-05, + "loss": 0.0028939370065927507, + "step": 216750 + }, + { + "epoch": 30.76792051100071, + "grad_norm": 0.01952683925628662, + "learning_rate": 6.924485450674237e-05, + "loss": 0.03180106282234192, + "step": 216760 + }, + { + "epoch": 30.76933995741661, + "grad_norm": 0.004404210485517979, + "learning_rate": 6.924343506032648e-05, + "loss": 0.0046692084521055225, + "step": 216770 + }, + { + "epoch": 30.770759403832507, + "grad_norm": 0.6594990491867065, + "learning_rate": 6.924201561391058e-05, + "loss": 0.003542621806263924, + 
"step": 216780 + }, + { + "epoch": 30.7721788502484, + "grad_norm": 0.7441993951797485, + "learning_rate": 6.924059616749468e-05, + "loss": 0.0019206307828426362, + "step": 216790 + }, + { + "epoch": 30.7735982966643, + "grad_norm": 2.006152629852295, + "learning_rate": 6.923917672107878e-05, + "loss": 0.012954635918140412, + "step": 216800 + }, + { + "epoch": 30.7750177430802, + "grad_norm": 0.04598817601799965, + "learning_rate": 6.923775727466289e-05, + "loss": 0.037043902277946475, + "step": 216810 + }, + { + "epoch": 30.776437189496097, + "grad_norm": 1.7148364782333374, + "learning_rate": 6.923633782824698e-05, + "loss": 0.004856040328741073, + "step": 216820 + }, + { + "epoch": 30.777856635911995, + "grad_norm": 0.153082475066185, + "learning_rate": 6.92349183818311e-05, + "loss": 0.0034114021807909014, + "step": 216830 + }, + { + "epoch": 30.779276082327893, + "grad_norm": 0.4799967110157013, + "learning_rate": 6.923349893541519e-05, + "loss": 0.013768278062343597, + "step": 216840 + }, + { + "epoch": 30.78069552874379, + "grad_norm": 4.994696617126465, + "learning_rate": 6.923207948899929e-05, + "loss": 0.013396379351615906, + "step": 216850 + }, + { + "epoch": 30.782114975159686, + "grad_norm": 0.020956043154001236, + "learning_rate": 6.92306600425834e-05, + "loss": 0.023523299396038054, + "step": 216860 + }, + { + "epoch": 30.783534421575585, + "grad_norm": 4.5208916664123535, + "learning_rate": 6.92292405961675e-05, + "loss": 0.050841504335403444, + "step": 216870 + }, + { + "epoch": 30.784953867991483, + "grad_norm": 2.2778985500335693, + "learning_rate": 6.922782114975161e-05, + "loss": 0.00206596739590168, + "step": 216880 + }, + { + "epoch": 30.78637331440738, + "grad_norm": 0.5142453908920288, + "learning_rate": 6.922640170333569e-05, + "loss": 0.008769367635250092, + "step": 216890 + }, + { + "epoch": 30.78779276082328, + "grad_norm": 0.09606867283582687, + "learning_rate": 6.92249822569198e-05, + "loss": 0.015101358294487, + "step": 216900 + }, + 
{ + "epoch": 30.789212207239178, + "grad_norm": 9.411075592041016, + "learning_rate": 6.92235628105039e-05, + "loss": 0.011006078124046326, + "step": 216910 + }, + { + "epoch": 30.790631653655076, + "grad_norm": 0.04286998137831688, + "learning_rate": 6.922214336408801e-05, + "loss": 0.008425452560186387, + "step": 216920 + }, + { + "epoch": 30.79205110007097, + "grad_norm": 2.2967097759246826, + "learning_rate": 6.922072391767211e-05, + "loss": 0.011250613629817963, + "step": 216930 + }, + { + "epoch": 30.79347054648687, + "grad_norm": 11.260876655578613, + "learning_rate": 6.921930447125622e-05, + "loss": 0.010646232962608337, + "step": 216940 + }, + { + "epoch": 30.794889992902768, + "grad_norm": 0.25787219405174255, + "learning_rate": 6.921788502484032e-05, + "loss": 0.017058825492858885, + "step": 216950 + }, + { + "epoch": 30.796309439318666, + "grad_norm": 0.032317787408828735, + "learning_rate": 6.921646557842442e-05, + "loss": 0.018944284319877623, + "step": 216960 + }, + { + "epoch": 30.797728885734564, + "grad_norm": 7.359813690185547, + "learning_rate": 6.921504613200853e-05, + "loss": 0.03968104124069214, + "step": 216970 + }, + { + "epoch": 30.799148332150462, + "grad_norm": 0.07706298679113388, + "learning_rate": 6.921362668559262e-05, + "loss": 0.0030966091901063917, + "step": 216980 + }, + { + "epoch": 30.80056777856636, + "grad_norm": 1.5708123445510864, + "learning_rate": 6.921220723917673e-05, + "loss": 0.011127613484859467, + "step": 216990 + }, + { + "epoch": 30.801987224982255, + "grad_norm": 1.7612652778625488, + "learning_rate": 6.921078779276082e-05, + "loss": 0.011665178835391999, + "step": 217000 + }, + { + "epoch": 30.801987224982255, + "eval_accuracy": 0.987918865645069, + "eval_loss": 0.047477006912231445, + "eval_runtime": 33.787, + "eval_samples_per_second": 465.475, + "eval_steps_per_second": 14.562, + "step": 217000 + }, + { + "epoch": 30.803406671398154, + "grad_norm": 0.3811379075050354, + "learning_rate": 6.920936834634493e-05, 
+ "loss": 0.0027996521443128587, + "step": 217010 + }, + { + "epoch": 30.804826117814052, + "grad_norm": 0.04013770818710327, + "learning_rate": 6.920794889992903e-05, + "loss": 0.003861067071557045, + "step": 217020 + }, + { + "epoch": 30.80624556422995, + "grad_norm": 1.0027867555618286, + "learning_rate": 6.920652945351314e-05, + "loss": 0.0332479864358902, + "step": 217030 + }, + { + "epoch": 30.80766501064585, + "grad_norm": 8.550732612609863, + "learning_rate": 6.920511000709723e-05, + "loss": 0.01847115159034729, + "step": 217040 + }, + { + "epoch": 30.809084457061747, + "grad_norm": 19.08379364013672, + "learning_rate": 6.920369056068133e-05, + "loss": 0.010451021790504455, + "step": 217050 + }, + { + "epoch": 30.810503903477645, + "grad_norm": 3.6798908710479736, + "learning_rate": 6.920227111426544e-05, + "loss": 0.011137820780277252, + "step": 217060 + }, + { + "epoch": 30.81192334989354, + "grad_norm": 0.2382068634033203, + "learning_rate": 6.920085166784954e-05, + "loss": 0.058163774013519284, + "step": 217070 + }, + { + "epoch": 30.81334279630944, + "grad_norm": 20.336130142211914, + "learning_rate": 6.919943222143365e-05, + "loss": 0.016638095676898956, + "step": 217080 + }, + { + "epoch": 30.814762242725337, + "grad_norm": 0.032831549644470215, + "learning_rate": 6.919801277501775e-05, + "loss": 0.043655797839164734, + "step": 217090 + }, + { + "epoch": 30.816181689141235, + "grad_norm": 1.9700372219085693, + "learning_rate": 6.919659332860185e-05, + "loss": 0.010304976999759675, + "step": 217100 + }, + { + "epoch": 30.817601135557133, + "grad_norm": 0.01124432310461998, + "learning_rate": 6.919517388218594e-05, + "loss": 0.001699674502015114, + "step": 217110 + }, + { + "epoch": 30.81902058197303, + "grad_norm": 0.7911727428436279, + "learning_rate": 6.919375443577005e-05, + "loss": 0.020158544182777405, + "step": 217120 + }, + { + "epoch": 30.82044002838893, + "grad_norm": 0.10986807942390442, + "learning_rate": 6.919233498935415e-05, + "loss": 
0.010963676869869233, + "step": 217130 + }, + { + "epoch": 30.821859474804825, + "grad_norm": 0.19271115958690643, + "learning_rate": 6.919091554293826e-05, + "loss": 0.0013597112149000168, + "step": 217140 + }, + { + "epoch": 30.823278921220723, + "grad_norm": 0.6980128884315491, + "learning_rate": 6.918949609652236e-05, + "loss": 0.0014142729341983795, + "step": 217150 + }, + { + "epoch": 30.82469836763662, + "grad_norm": 0.008373486809432507, + "learning_rate": 6.918807665010646e-05, + "loss": 0.005096885189414024, + "step": 217160 + }, + { + "epoch": 30.82611781405252, + "grad_norm": 0.016445277258753777, + "learning_rate": 6.918665720369057e-05, + "loss": 0.0009940221905708313, + "step": 217170 + }, + { + "epoch": 30.827537260468418, + "grad_norm": 1.1080684661865234, + "learning_rate": 6.918523775727467e-05, + "loss": 0.006811121106147766, + "step": 217180 + }, + { + "epoch": 30.828956706884316, + "grad_norm": 0.45521295070648193, + "learning_rate": 6.918381831085878e-05, + "loss": 0.013007096946239471, + "step": 217190 + }, + { + "epoch": 30.830376153300215, + "grad_norm": 0.23235481977462769, + "learning_rate": 6.918239886444286e-05, + "loss": 0.004354672506451607, + "step": 217200 + }, + { + "epoch": 30.83179559971611, + "grad_norm": 0.959635853767395, + "learning_rate": 6.918097941802697e-05, + "loss": 0.018985405564308167, + "step": 217210 + }, + { + "epoch": 30.833215046132008, + "grad_norm": 0.7916736006736755, + "learning_rate": 6.917955997161107e-05, + "loss": 0.0017949938774108887, + "step": 217220 + }, + { + "epoch": 30.834634492547906, + "grad_norm": 0.009999649599194527, + "learning_rate": 6.917814052519518e-05, + "loss": 0.0021543703973293305, + "step": 217230 + }, + { + "epoch": 30.836053938963804, + "grad_norm": 0.08258651196956635, + "learning_rate": 6.917672107877928e-05, + "loss": 0.012208770215511321, + "step": 217240 + }, + { + "epoch": 30.837473385379703, + "grad_norm": 0.14893148839473724, + "learning_rate": 6.917530163236337e-05, + 
"loss": 0.002097783237695694, + "step": 217250 + }, + { + "epoch": 30.8388928317956, + "grad_norm": 0.13882039487361908, + "learning_rate": 6.917388218594749e-05, + "loss": 0.01922561079263687, + "step": 217260 + }, + { + "epoch": 30.8403122782115, + "grad_norm": 0.891461193561554, + "learning_rate": 6.917246273953158e-05, + "loss": 0.010121002793312073, + "step": 217270 + }, + { + "epoch": 30.841731724627394, + "grad_norm": 0.1646980494260788, + "learning_rate": 6.91710432931157e-05, + "loss": 0.019752433896064757, + "step": 217280 + }, + { + "epoch": 30.843151171043292, + "grad_norm": 0.07862994819879532, + "learning_rate": 6.916962384669979e-05, + "loss": 0.007897826284170151, + "step": 217290 + }, + { + "epoch": 30.84457061745919, + "grad_norm": 0.21260082721710205, + "learning_rate": 6.91682044002839e-05, + "loss": 0.016543571650981904, + "step": 217300 + }, + { + "epoch": 30.84599006387509, + "grad_norm": 0.007372829131782055, + "learning_rate": 6.916678495386799e-05, + "loss": 0.007205098867416382, + "step": 217310 + }, + { + "epoch": 30.847409510290987, + "grad_norm": 3.7530322074890137, + "learning_rate": 6.91653655074521e-05, + "loss": 0.0033505745232105255, + "step": 217320 + }, + { + "epoch": 30.848828956706885, + "grad_norm": 0.006773591041564941, + "learning_rate": 6.91639460610362e-05, + "loss": 0.0009563442319631577, + "step": 217330 + }, + { + "epoch": 30.850248403122784, + "grad_norm": 0.3606153726577759, + "learning_rate": 6.91625266146203e-05, + "loss": 0.003295699879527092, + "step": 217340 + }, + { + "epoch": 30.85166784953868, + "grad_norm": 0.0381997786462307, + "learning_rate": 6.91611071682044e-05, + "loss": 0.002570457011461258, + "step": 217350 + }, + { + "epoch": 30.853087295954577, + "grad_norm": 0.0034206954296678305, + "learning_rate": 6.91596877217885e-05, + "loss": 0.01884379833936691, + "step": 217360 + }, + { + "epoch": 30.854506742370475, + "grad_norm": 0.34043461084365845, + "learning_rate": 6.915826827537261e-05, + "loss": 
0.011146102845668793, + "step": 217370 + }, + { + "epoch": 30.855926188786373, + "grad_norm": 0.07218156009912491, + "learning_rate": 6.915684882895671e-05, + "loss": 0.0024717412889003754, + "step": 217380 + }, + { + "epoch": 30.85734563520227, + "grad_norm": 0.017047379165887833, + "learning_rate": 6.915542938254082e-05, + "loss": 0.01209658831357956, + "step": 217390 + }, + { + "epoch": 30.85876508161817, + "grad_norm": 0.19148242473602295, + "learning_rate": 6.915400993612492e-05, + "loss": 0.005505566671490669, + "step": 217400 + }, + { + "epoch": 30.86018452803407, + "grad_norm": 0.20742875337600708, + "learning_rate": 6.915259048970901e-05, + "loss": 0.024375173449516296, + "step": 217410 + }, + { + "epoch": 30.861603974449963, + "grad_norm": 0.04557877033948898, + "learning_rate": 6.915117104329311e-05, + "loss": 0.003022715076804161, + "step": 217420 + }, + { + "epoch": 30.86302342086586, + "grad_norm": 1.5614538192749023, + "learning_rate": 6.914975159687722e-05, + "loss": 0.03371726274490357, + "step": 217430 + }, + { + "epoch": 30.86444286728176, + "grad_norm": 0.9068990349769592, + "learning_rate": 6.914833215046132e-05, + "loss": 0.007750953733921051, + "step": 217440 + }, + { + "epoch": 30.865862313697658, + "grad_norm": 4.842378616333008, + "learning_rate": 6.914691270404543e-05, + "loss": 0.01014859676361084, + "step": 217450 + }, + { + "epoch": 30.867281760113556, + "grad_norm": 1.1582311391830444, + "learning_rate": 6.914549325762953e-05, + "loss": 0.04296708405017853, + "step": 217460 + }, + { + "epoch": 30.868701206529455, + "grad_norm": 0.09473714232444763, + "learning_rate": 6.914407381121363e-05, + "loss": 0.025402244925498963, + "step": 217470 + }, + { + "epoch": 30.870120652945353, + "grad_norm": 0.09181398898363113, + "learning_rate": 6.914265436479774e-05, + "loss": 0.012095962464809418, + "step": 217480 + }, + { + "epoch": 30.871540099361248, + "grad_norm": 1.4037142992019653, + "learning_rate": 6.914123491838183e-05, + "loss": 
0.00936305895447731, + "step": 217490 + }, + { + "epoch": 30.872959545777146, + "grad_norm": 0.10783647000789642, + "learning_rate": 6.913981547196594e-05, + "loss": 0.007285481691360474, + "step": 217500 + }, + { + "epoch": 30.872959545777146, + "eval_accuracy": 0.990907356774973, + "eval_loss": 0.03200250491499901, + "eval_runtime": 36.0934, + "eval_samples_per_second": 435.73, + "eval_steps_per_second": 13.631, + "step": 217500 + }, + { + "epoch": 30.874378992193044, + "grad_norm": 0.014366721734404564, + "learning_rate": 6.913839602555003e-05, + "loss": 0.006139091774821281, + "step": 217510 + }, + { + "epoch": 30.875798438608943, + "grad_norm": 2.0012807846069336, + "learning_rate": 6.913697657913414e-05, + "loss": 0.0054504193365573885, + "step": 217520 + }, + { + "epoch": 30.87721788502484, + "grad_norm": 0.4351624846458435, + "learning_rate": 6.913555713271824e-05, + "loss": 0.005467503890395164, + "step": 217530 + }, + { + "epoch": 30.87863733144074, + "grad_norm": 6.750432014465332, + "learning_rate": 6.913413768630235e-05, + "loss": 0.006113621965050697, + "step": 217540 + }, + { + "epoch": 30.880056777856637, + "grad_norm": 0.020468052476644516, + "learning_rate": 6.913271823988644e-05, + "loss": 0.021286997199058532, + "step": 217550 + }, + { + "epoch": 30.881476224272532, + "grad_norm": 1.7274608612060547, + "learning_rate": 6.913129879347054e-05, + "loss": 0.019472105801105498, + "step": 217560 + }, + { + "epoch": 30.88289567068843, + "grad_norm": 0.21951191127300262, + "learning_rate": 6.912987934705465e-05, + "loss": 0.0025633390992879867, + "step": 217570 + }, + { + "epoch": 30.88431511710433, + "grad_norm": 5.749380111694336, + "learning_rate": 6.912845990063875e-05, + "loss": 0.005763163790106773, + "step": 217580 + }, + { + "epoch": 30.885734563520227, + "grad_norm": 2.3999741077423096, + "learning_rate": 6.912704045422286e-05, + "loss": 0.004577958956360817, + "step": 217590 + }, + { + "epoch": 30.887154009936125, + "grad_norm": 
0.49917006492614746, + "learning_rate": 6.912562100780696e-05, + "loss": 0.022000931203365326, + "step": 217600 + }, + { + "epoch": 30.888573456352024, + "grad_norm": 0.24413591623306274, + "learning_rate": 6.912420156139106e-05, + "loss": 0.002495564892888069, + "step": 217610 + }, + { + "epoch": 30.889992902767922, + "grad_norm": 0.004493415355682373, + "learning_rate": 6.912278211497515e-05, + "loss": 0.003189583122730255, + "step": 217620 + }, + { + "epoch": 30.891412349183817, + "grad_norm": 0.2687409222126007, + "learning_rate": 6.912136266855926e-05, + "loss": 0.03591532707214355, + "step": 217630 + }, + { + "epoch": 30.892831795599715, + "grad_norm": 0.7354759573936462, + "learning_rate": 6.911994322214336e-05, + "loss": 0.019000300765037538, + "step": 217640 + }, + { + "epoch": 30.894251242015613, + "grad_norm": 0.03477047011256218, + "learning_rate": 6.911852377572747e-05, + "loss": 0.018479101359844208, + "step": 217650 + }, + { + "epoch": 30.89567068843151, + "grad_norm": 0.02449078857898712, + "learning_rate": 6.911710432931158e-05, + "loss": 0.007242427766323089, + "step": 217660 + }, + { + "epoch": 30.89709013484741, + "grad_norm": 13.893617630004883, + "learning_rate": 6.911568488289567e-05, + "loss": 0.021191102266311646, + "step": 217670 + }, + { + "epoch": 30.89850958126331, + "grad_norm": 0.45346784591674805, + "learning_rate": 6.911426543647978e-05, + "loss": 0.03834270834922791, + "step": 217680 + }, + { + "epoch": 30.899929027679207, + "grad_norm": 1.0844413042068481, + "learning_rate": 6.911284599006388e-05, + "loss": 0.046379482746124266, + "step": 217690 + }, + { + "epoch": 30.9013484740951, + "grad_norm": 10.917219161987305, + "learning_rate": 6.911142654364799e-05, + "loss": 0.034733086824417114, + "step": 217700 + }, + { + "epoch": 30.902767920511, + "grad_norm": 4.919239044189453, + "learning_rate": 6.911000709723208e-05, + "loss": 0.04620937407016754, + "step": 217710 + }, + { + "epoch": 30.904187366926898, + "grad_norm": 
0.09379059821367264, + "learning_rate": 6.910858765081618e-05, + "loss": 0.016663245856761932, + "step": 217720 + }, + { + "epoch": 30.905606813342796, + "grad_norm": 0.053728941828012466, + "learning_rate": 6.910716820440028e-05, + "loss": 0.028030005097389222, + "step": 217730 + }, + { + "epoch": 30.907026259758695, + "grad_norm": 0.14238066971302032, + "learning_rate": 6.910574875798439e-05, + "loss": 0.004538526013493538, + "step": 217740 + }, + { + "epoch": 30.908445706174593, + "grad_norm": 0.010422386229038239, + "learning_rate": 6.91043293115685e-05, + "loss": 0.017392948269844055, + "step": 217750 + }, + { + "epoch": 30.90986515259049, + "grad_norm": 0.4561026692390442, + "learning_rate": 6.91029098651526e-05, + "loss": 0.0059754379093647, + "step": 217760 + }, + { + "epoch": 30.911284599006386, + "grad_norm": 23.796653747558594, + "learning_rate": 6.91014904187367e-05, + "loss": 0.036605936288833615, + "step": 217770 + }, + { + "epoch": 30.912704045422284, + "grad_norm": 0.025466978549957275, + "learning_rate": 6.910007097232079e-05, + "loss": 0.02931970953941345, + "step": 217780 + }, + { + "epoch": 30.914123491838183, + "grad_norm": 15.127531051635742, + "learning_rate": 6.90986515259049e-05, + "loss": 0.04790546596050262, + "step": 217790 + }, + { + "epoch": 30.91554293825408, + "grad_norm": 1.5919642448425293, + "learning_rate": 6.9097232079489e-05, + "loss": 0.032723727822303775, + "step": 217800 + }, + { + "epoch": 30.91696238466998, + "grad_norm": 11.93608283996582, + "learning_rate": 6.909581263307311e-05, + "loss": 0.014034612476825714, + "step": 217810 + }, + { + "epoch": 30.918381831085878, + "grad_norm": 0.03688386455178261, + "learning_rate": 6.90943931866572e-05, + "loss": 0.006539475917816162, + "step": 217820 + }, + { + "epoch": 30.919801277501776, + "grad_norm": 0.02572915144264698, + "learning_rate": 6.909297374024131e-05, + "loss": 0.011722830682992935, + "step": 217830 + }, + { + "epoch": 30.92122072391767, + "grad_norm": 
0.08823753893375397, + "learning_rate": 6.909155429382542e-05, + "loss": 0.005629526078701019, + "step": 217840 + }, + { + "epoch": 30.92264017033357, + "grad_norm": 0.34701651334762573, + "learning_rate": 6.909013484740952e-05, + "loss": 0.01592087745666504, + "step": 217850 + }, + { + "epoch": 30.924059616749467, + "grad_norm": 7.348698139190674, + "learning_rate": 6.908871540099363e-05, + "loss": 0.010005777329206466, + "step": 217860 + }, + { + "epoch": 30.925479063165366, + "grad_norm": 0.09392894804477692, + "learning_rate": 6.908729595457771e-05, + "loss": 0.014055365324020385, + "step": 217870 + }, + { + "epoch": 30.926898509581264, + "grad_norm": 0.2994018495082855, + "learning_rate": 6.908587650816182e-05, + "loss": 0.005690639093518257, + "step": 217880 + }, + { + "epoch": 30.928317955997162, + "grad_norm": 0.18303348124027252, + "learning_rate": 6.908445706174592e-05, + "loss": 0.012776882946491241, + "step": 217890 + }, + { + "epoch": 30.92973740241306, + "grad_norm": 1.680920124053955, + "learning_rate": 6.908303761533003e-05, + "loss": 0.016798380017280578, + "step": 217900 + }, + { + "epoch": 30.931156848828955, + "grad_norm": 5.970606327056885, + "learning_rate": 6.908161816891413e-05, + "loss": 0.05330547094345093, + "step": 217910 + }, + { + "epoch": 30.932576295244854, + "grad_norm": 7.25730562210083, + "learning_rate": 6.908019872249822e-05, + "loss": 0.03409905731678009, + "step": 217920 + }, + { + "epoch": 30.933995741660752, + "grad_norm": 5.636159896850586, + "learning_rate": 6.907877927608233e-05, + "loss": 0.00561646968126297, + "step": 217930 + }, + { + "epoch": 30.93541518807665, + "grad_norm": 0.10945527255535126, + "learning_rate": 6.907735982966643e-05, + "loss": 0.01034289300441742, + "step": 217940 + }, + { + "epoch": 30.93683463449255, + "grad_norm": 0.347632497549057, + "learning_rate": 6.907594038325054e-05, + "loss": 0.01606782674789429, + "step": 217950 + }, + { + "epoch": 30.938254080908447, + "grad_norm": 
0.06210615113377571, + "learning_rate": 6.907452093683464e-05, + "loss": 0.009167250990867615, + "step": 217960 + }, + { + "epoch": 30.939673527324345, + "grad_norm": 6.302150726318359, + "learning_rate": 6.907310149041874e-05, + "loss": 0.05166071057319641, + "step": 217970 + }, + { + "epoch": 30.94109297374024, + "grad_norm": 10.144251823425293, + "learning_rate": 6.907168204400284e-05, + "loss": 0.029691267013549804, + "step": 217980 + }, + { + "epoch": 30.942512420156138, + "grad_norm": 1.242156744003296, + "learning_rate": 6.907026259758695e-05, + "loss": 0.011773911118507386, + "step": 217990 + }, + { + "epoch": 30.943931866572036, + "grad_norm": 8.977030754089355, + "learning_rate": 6.906884315117104e-05, + "loss": 0.017176058888435364, + "step": 218000 + }, + { + "epoch": 30.943931866572036, + "eval_accuracy": 0.9847396197621924, + "eval_loss": 0.058002665638923645, + "eval_runtime": 34.1272, + "eval_samples_per_second": 460.835, + "eval_steps_per_second": 14.417, + "step": 218000 + }, + { + "epoch": 30.945351312987935, + "grad_norm": 16.315570831298828, + "learning_rate": 6.906742370475515e-05, + "loss": 0.04062665104866028, + "step": 218010 + }, + { + "epoch": 30.946770759403833, + "grad_norm": 4.111091613769531, + "learning_rate": 6.906600425833925e-05, + "loss": 0.012747302651405334, + "step": 218020 + }, + { + "epoch": 30.94819020581973, + "grad_norm": 6.622082710266113, + "learning_rate": 6.906458481192335e-05, + "loss": 0.03474915027618408, + "step": 218030 + }, + { + "epoch": 30.94960965223563, + "grad_norm": 0.066814124584198, + "learning_rate": 6.906316536550746e-05, + "loss": 0.03338159322738647, + "step": 218040 + }, + { + "epoch": 30.951029098651524, + "grad_norm": 0.154246523976326, + "learning_rate": 6.906174591909156e-05, + "loss": 0.011095117032527923, + "step": 218050 + }, + { + "epoch": 30.952448545067423, + "grad_norm": 13.655344009399414, + "learning_rate": 6.906032647267567e-05, + "loss": 0.03743317127227783, + "step": 218060 + }, + { 
+ "epoch": 30.95386799148332, + "grad_norm": 0.44552457332611084, + "learning_rate": 6.905890702625977e-05, + "loss": 0.019241410493850707, + "step": 218070 + }, + { + "epoch": 30.95528743789922, + "grad_norm": 0.27609264850616455, + "learning_rate": 6.905748757984386e-05, + "loss": 0.002217768132686615, + "step": 218080 + }, + { + "epoch": 30.956706884315118, + "grad_norm": 0.43670278787612915, + "learning_rate": 6.905606813342796e-05, + "loss": 0.033947646617889404, + "step": 218090 + }, + { + "epoch": 30.958126330731016, + "grad_norm": 0.03652486577630043, + "learning_rate": 6.905464868701207e-05, + "loss": 0.0048299930989742276, + "step": 218100 + }, + { + "epoch": 30.959545777146914, + "grad_norm": 0.9775866866111755, + "learning_rate": 6.905322924059617e-05, + "loss": 0.04307197630405426, + "step": 218110 + }, + { + "epoch": 30.96096522356281, + "grad_norm": 0.12286458909511566, + "learning_rate": 6.905180979418028e-05, + "loss": 0.02220318913459778, + "step": 218120 + }, + { + "epoch": 30.962384669978707, + "grad_norm": 0.051401522010564804, + "learning_rate": 6.905039034776438e-05, + "loss": 0.036055338382720944, + "step": 218130 + }, + { + "epoch": 30.963804116394606, + "grad_norm": 5.291470527648926, + "learning_rate": 6.904897090134847e-05, + "loss": 0.004890695586800576, + "step": 218140 + }, + { + "epoch": 30.965223562810504, + "grad_norm": 11.336991310119629, + "learning_rate": 6.904755145493259e-05, + "loss": 0.01107928603887558, + "step": 218150 + }, + { + "epoch": 30.966643009226402, + "grad_norm": 0.03263648599386215, + "learning_rate": 6.904613200851668e-05, + "loss": 0.008920245617628098, + "step": 218160 + }, + { + "epoch": 30.9680624556423, + "grad_norm": 15.199950218200684, + "learning_rate": 6.90447125621008e-05, + "loss": 0.018530185520648956, + "step": 218170 + }, + { + "epoch": 30.9694819020582, + "grad_norm": 0.008291157893836498, + "learning_rate": 6.904329311568488e-05, + "loss": 0.051237326860427854, + "step": 218180 + }, + { + 
"epoch": 30.970901348474094, + "grad_norm": 0.01058171782642603, + "learning_rate": 6.904187366926899e-05, + "loss": 0.001308365911245346, + "step": 218190 + }, + { + "epoch": 30.972320794889992, + "grad_norm": 2.990431547164917, + "learning_rate": 6.904045422285309e-05, + "loss": 0.00649351105093956, + "step": 218200 + }, + { + "epoch": 30.97374024130589, + "grad_norm": 0.42743971943855286, + "learning_rate": 6.90390347764372e-05, + "loss": 0.008516108989715577, + "step": 218210 + }, + { + "epoch": 30.97515968772179, + "grad_norm": 0.17793358862400055, + "learning_rate": 6.903775727466288e-05, + "loss": 0.01977905333042145, + "step": 218220 + }, + { + "epoch": 30.976579134137687, + "grad_norm": 0.2979911267757416, + "learning_rate": 6.903633782824699e-05, + "loss": 0.0022491831332445145, + "step": 218230 + }, + { + "epoch": 30.977998580553585, + "grad_norm": 2.5720014572143555, + "learning_rate": 6.903491838183109e-05, + "loss": 0.0035245798528194426, + "step": 218240 + }, + { + "epoch": 30.979418026969483, + "grad_norm": 3.7881112098693848, + "learning_rate": 6.903349893541519e-05, + "loss": 0.01188119724392891, + "step": 218250 + }, + { + "epoch": 30.980837473385378, + "grad_norm": 1.6948165893554688, + "learning_rate": 6.903207948899928e-05, + "loss": 0.016461963951587676, + "step": 218260 + }, + { + "epoch": 30.982256919801276, + "grad_norm": 0.13917222619056702, + "learning_rate": 6.90306600425834e-05, + "loss": 0.015410451591014862, + "step": 218270 + }, + { + "epoch": 30.983676366217175, + "grad_norm": 0.23989605903625488, + "learning_rate": 6.902924059616749e-05, + "loss": 0.010152983665466308, + "step": 218280 + }, + { + "epoch": 30.985095812633073, + "grad_norm": 0.5456095337867737, + "learning_rate": 6.90278211497516e-05, + "loss": 0.0029011279344558715, + "step": 218290 + }, + { + "epoch": 30.98651525904897, + "grad_norm": 0.011064635589718819, + "learning_rate": 6.90264017033357e-05, + "loss": 0.041091051697731015, + "step": 218300 + }, + { + "epoch": 
30.98793470546487, + "grad_norm": 0.050772108137607574, + "learning_rate": 6.90249822569198e-05, + "loss": 0.042924293875694276, + "step": 218310 + }, + { + "epoch": 30.989354151880768, + "grad_norm": 0.2840860188007355, + "learning_rate": 6.902356281050391e-05, + "loss": 0.01086147576570511, + "step": 218320 + }, + { + "epoch": 30.990773598296663, + "grad_norm": 1.4870294332504272, + "learning_rate": 6.902214336408801e-05, + "loss": 0.010196617245674134, + "step": 218330 + }, + { + "epoch": 30.99219304471256, + "grad_norm": 0.04243546351790428, + "learning_rate": 6.902072391767212e-05, + "loss": 0.04006100296974182, + "step": 218340 + }, + { + "epoch": 30.99361249112846, + "grad_norm": 0.046054355800151825, + "learning_rate": 6.901930447125622e-05, + "loss": 0.01684955656528473, + "step": 218350 + }, + { + "epoch": 30.995031937544358, + "grad_norm": 0.027500882744789124, + "learning_rate": 6.901788502484031e-05, + "loss": 0.0072121858596801754, + "step": 218360 + }, + { + "epoch": 30.996451383960256, + "grad_norm": 1.0464078187942505, + "learning_rate": 6.901646557842441e-05, + "loss": 0.035005074739456174, + "step": 218370 + }, + { + "epoch": 30.997870830376154, + "grad_norm": 0.7983553409576416, + "learning_rate": 6.901504613200852e-05, + "loss": 0.0040014971047639845, + "step": 218380 + }, + { + "epoch": 30.999290276792053, + "grad_norm": 0.22902528941631317, + "learning_rate": 6.901362668559262e-05, + "loss": 0.029035943746566772, + "step": 218390 + }, + { + "epoch": 31.000709723207947, + "grad_norm": 0.028507312759757042, + "learning_rate": 6.901220723917673e-05, + "loss": 0.008148349821567535, + "step": 218400 + }, + { + "epoch": 31.002129169623846, + "grad_norm": 0.6189132332801819, + "learning_rate": 6.901078779276083e-05, + "loss": 0.02330598384141922, + "step": 218410 + }, + { + "epoch": 31.003548616039744, + "grad_norm": 0.11498984694480896, + "learning_rate": 6.900936834634492e-05, + "loss": 0.004932021722197533, + "step": 218420 + }, + { + "epoch": 
31.004968062455642, + "grad_norm": 1.0408570766448975, + "learning_rate": 6.900794889992904e-05, + "loss": 0.01227278932929039, + "step": 218430 + }, + { + "epoch": 31.00638750887154, + "grad_norm": 0.056782178580760956, + "learning_rate": 6.900652945351313e-05, + "loss": 0.003726310282945633, + "step": 218440 + }, + { + "epoch": 31.00780695528744, + "grad_norm": 2.120448112487793, + "learning_rate": 6.900511000709724e-05, + "loss": 0.0014927156269550323, + "step": 218450 + }, + { + "epoch": 31.009226401703337, + "grad_norm": 26.985031127929688, + "learning_rate": 6.900369056068133e-05, + "loss": 0.03398476839065552, + "step": 218460 + }, + { + "epoch": 31.010645848119232, + "grad_norm": 27.844417572021484, + "learning_rate": 6.900227111426544e-05, + "loss": 0.02710336446762085, + "step": 218470 + }, + { + "epoch": 31.01206529453513, + "grad_norm": 0.6428626775741577, + "learning_rate": 6.900085166784954e-05, + "loss": 0.029636266827583312, + "step": 218480 + }, + { + "epoch": 31.01348474095103, + "grad_norm": 0.028043055906891823, + "learning_rate": 6.899943222143365e-05, + "loss": 0.0018726225942373275, + "step": 218490 + }, + { + "epoch": 31.014904187366927, + "grad_norm": 2.2279398441314697, + "learning_rate": 6.899801277501776e-05, + "loss": 0.018565312027931213, + "step": 218500 + }, + { + "epoch": 31.014904187366927, + "eval_accuracy": 0.988872639409932, + "eval_loss": 0.04177171736955643, + "eval_runtime": 34.9629, + "eval_samples_per_second": 449.819, + "eval_steps_per_second": 14.072, + "step": 218500 + }, + { + "epoch": 31.016323633782825, + "grad_norm": 0.016840629279613495, + "learning_rate": 6.899659332860184e-05, + "loss": 0.035480457544326785, + "step": 218510 + }, + { + "epoch": 31.017743080198724, + "grad_norm": 0.026836568489670753, + "learning_rate": 6.899517388218595e-05, + "loss": 0.024409669637680053, + "step": 218520 + }, + { + "epoch": 31.019162526614622, + "grad_norm": 0.21576671302318573, + "learning_rate": 6.899375443577005e-05, + 
"loss": 0.030394068360328673, + "step": 218530 + }, + { + "epoch": 31.020581973030517, + "grad_norm": 0.02352849580347538, + "learning_rate": 6.899233498935416e-05, + "loss": 0.001692395657300949, + "step": 218540 + }, + { + "epoch": 31.022001419446415, + "grad_norm": 0.045097995549440384, + "learning_rate": 6.899091554293826e-05, + "loss": 0.013879945874214173, + "step": 218550 + }, + { + "epoch": 31.023420865862313, + "grad_norm": 2.700695753097534, + "learning_rate": 6.898949609652236e-05, + "loss": 0.004514498263597488, + "step": 218560 + }, + { + "epoch": 31.02484031227821, + "grad_norm": 0.013800583779811859, + "learning_rate": 6.898807665010645e-05, + "loss": 0.02877415418624878, + "step": 218570 + }, + { + "epoch": 31.02625975869411, + "grad_norm": 0.1507587432861328, + "learning_rate": 6.898665720369056e-05, + "loss": 0.013513992726802825, + "step": 218580 + }, + { + "epoch": 31.027679205110008, + "grad_norm": 0.4634220600128174, + "learning_rate": 6.898523775727467e-05, + "loss": 0.0022949449717998504, + "step": 218590 + }, + { + "epoch": 31.029098651525906, + "grad_norm": 0.0066350847482681274, + "learning_rate": 6.898381831085877e-05, + "loss": 0.011270388960838318, + "step": 218600 + }, + { + "epoch": 31.0305180979418, + "grad_norm": 0.02670341357588768, + "learning_rate": 6.898239886444287e-05, + "loss": 0.0540935218334198, + "step": 218610 + }, + { + "epoch": 31.0319375443577, + "grad_norm": 0.10761069506406784, + "learning_rate": 6.898097941802697e-05, + "loss": 0.02121012508869171, + "step": 218620 + }, + { + "epoch": 31.033356990773598, + "grad_norm": 0.24376215040683746, + "learning_rate": 6.897955997161108e-05, + "loss": 0.03629013299942017, + "step": 218630 + }, + { + "epoch": 31.034776437189496, + "grad_norm": 0.6065888404846191, + "learning_rate": 6.897814052519517e-05, + "loss": 0.006763703376054764, + "step": 218640 + }, + { + "epoch": 31.036195883605394, + "grad_norm": 0.10831452161073685, + "learning_rate": 6.897672107877929e-05, + 
"loss": 0.000760607048869133, + "step": 218650 + }, + { + "epoch": 31.037615330021293, + "grad_norm": 0.1210494115948677, + "learning_rate": 6.897530163236337e-05, + "loss": 0.004783189669251442, + "step": 218660 + }, + { + "epoch": 31.03903477643719, + "grad_norm": 0.10557562112808228, + "learning_rate": 6.897388218594748e-05, + "loss": 0.001550363376736641, + "step": 218670 + }, + { + "epoch": 31.040454222853086, + "grad_norm": 5.008120059967041, + "learning_rate": 6.897246273953159e-05, + "loss": 0.04311458170413971, + "step": 218680 + }, + { + "epoch": 31.041873669268984, + "grad_norm": 0.09262681007385254, + "learning_rate": 6.897104329311569e-05, + "loss": 0.0006346475332975387, + "step": 218690 + }, + { + "epoch": 31.043293115684882, + "grad_norm": 0.004941217135637999, + "learning_rate": 6.89696238466998e-05, + "loss": 0.020514318346977235, + "step": 218700 + }, + { + "epoch": 31.04471256210078, + "grad_norm": 0.0054657007567584515, + "learning_rate": 6.89682044002839e-05, + "loss": 0.006454160809516907, + "step": 218710 + }, + { + "epoch": 31.04613200851668, + "grad_norm": 10.193109512329102, + "learning_rate": 6.8966784953868e-05, + "loss": 0.016236190497875214, + "step": 218720 + }, + { + "epoch": 31.047551454932577, + "grad_norm": 0.017385803163051605, + "learning_rate": 6.896536550745209e-05, + "loss": 0.02682589888572693, + "step": 218730 + }, + { + "epoch": 31.048970901348476, + "grad_norm": 0.006026296876370907, + "learning_rate": 6.89639460610362e-05, + "loss": 0.00944252610206604, + "step": 218740 + }, + { + "epoch": 31.05039034776437, + "grad_norm": 1.1992169618606567, + "learning_rate": 6.89625266146203e-05, + "loss": 0.002296384423971176, + "step": 218750 + }, + { + "epoch": 31.05180979418027, + "grad_norm": 0.2871740162372589, + "learning_rate": 6.896110716820441e-05, + "loss": 0.012893518805503846, + "step": 218760 + }, + { + "epoch": 31.053229240596167, + "grad_norm": 0.019393233582377434, + "learning_rate": 6.895968772178851e-05, + "loss": 
0.002794738858938217, + "step": 218770 + }, + { + "epoch": 31.054648687012065, + "grad_norm": 0.8181583881378174, + "learning_rate": 6.89582682753726e-05, + "loss": 0.0013241365551948548, + "step": 218780 + }, + { + "epoch": 31.056068133427964, + "grad_norm": 0.1653231978416443, + "learning_rate": 6.895684882895672e-05, + "loss": 0.000614871084690094, + "step": 218790 + }, + { + "epoch": 31.057487579843862, + "grad_norm": 0.035190023481845856, + "learning_rate": 6.895542938254081e-05, + "loss": 0.008346594125032424, + "step": 218800 + }, + { + "epoch": 31.05890702625976, + "grad_norm": 0.04123973473906517, + "learning_rate": 6.895400993612493e-05, + "loss": 0.0026463184505701063, + "step": 218810 + }, + { + "epoch": 31.060326472675655, + "grad_norm": 0.0011823591776192188, + "learning_rate": 6.895259048970901e-05, + "loss": 0.0016870401799678803, + "step": 218820 + }, + { + "epoch": 31.061745919091553, + "grad_norm": 0.12667034566402435, + "learning_rate": 6.895117104329312e-05, + "loss": 0.003058033436536789, + "step": 218830 + }, + { + "epoch": 31.06316536550745, + "grad_norm": 0.03110463358461857, + "learning_rate": 6.894975159687722e-05, + "loss": 0.0017398864030838013, + "step": 218840 + }, + { + "epoch": 31.06458481192335, + "grad_norm": 0.013825062662363052, + "learning_rate": 6.894833215046133e-05, + "loss": 0.03547472655773163, + "step": 218850 + }, + { + "epoch": 31.066004258339248, + "grad_norm": 0.3721359372138977, + "learning_rate": 6.894691270404543e-05, + "loss": 0.017557957768440248, + "step": 218860 + }, + { + "epoch": 31.067423704755146, + "grad_norm": 0.01174880936741829, + "learning_rate": 6.894549325762952e-05, + "loss": 0.011327019333839417, + "step": 218870 + }, + { + "epoch": 31.068843151171045, + "grad_norm": 0.5477786064147949, + "learning_rate": 6.894407381121363e-05, + "loss": 0.019897142052650453, + "step": 218880 + }, + { + "epoch": 31.07026259758694, + "grad_norm": 0.007668695878237486, + "learning_rate": 6.894265436479773e-05, + 
"loss": 0.0039004512131214143, + "step": 218890 + }, + { + "epoch": 31.071682044002838, + "grad_norm": 0.027467379346489906, + "learning_rate": 6.894123491838184e-05, + "loss": 0.022104290127754212, + "step": 218900 + }, + { + "epoch": 31.073101490418736, + "grad_norm": 0.19030317664146423, + "learning_rate": 6.893981547196594e-05, + "loss": 0.0024758756160736086, + "step": 218910 + }, + { + "epoch": 31.074520936834634, + "grad_norm": 7.220515727996826, + "learning_rate": 6.893839602555004e-05, + "loss": 0.0063797950744628905, + "step": 218920 + }, + { + "epoch": 31.075940383250533, + "grad_norm": 2.3156416416168213, + "learning_rate": 6.893697657913413e-05, + "loss": 0.0036709126085042953, + "step": 218930 + }, + { + "epoch": 31.07735982966643, + "grad_norm": 0.1663624495267868, + "learning_rate": 6.893555713271825e-05, + "loss": 0.030813470482826233, + "step": 218940 + }, + { + "epoch": 31.07877927608233, + "grad_norm": 0.559123694896698, + "learning_rate": 6.893413768630234e-05, + "loss": 0.023840148746967316, + "step": 218950 + }, + { + "epoch": 31.080198722498224, + "grad_norm": 3.3608405590057373, + "learning_rate": 6.893271823988645e-05, + "loss": 0.02655896544456482, + "step": 218960 + }, + { + "epoch": 31.081618168914122, + "grad_norm": 0.21705284714698792, + "learning_rate": 6.893129879347055e-05, + "loss": 0.0022833026945590974, + "step": 218970 + }, + { + "epoch": 31.08303761533002, + "grad_norm": 0.5979782342910767, + "learning_rate": 6.892987934705465e-05, + "loss": 0.030339199304580688, + "step": 218980 + }, + { + "epoch": 31.08445706174592, + "grad_norm": 0.0420905277132988, + "learning_rate": 6.892845990063876e-05, + "loss": 0.005056886002421379, + "step": 218990 + }, + { + "epoch": 31.085876508161817, + "grad_norm": 0.2508150041103363, + "learning_rate": 6.892704045422286e-05, + "loss": 0.05618956685066223, + "step": 219000 + }, + { + "epoch": 31.085876508161817, + "eval_accuracy": 0.9904622623513702, + "eval_loss": 0.03584443777799606, + 
"eval_runtime": 33.4531, + "eval_samples_per_second": 470.12, + "eval_steps_per_second": 14.707, + "step": 219000 + }, + { + "epoch": 31.087295954577716, + "grad_norm": 3.2345783710479736, + "learning_rate": 6.892562100780697e-05, + "loss": 0.008427906036376952, + "step": 219010 + }, + { + "epoch": 31.088715400993614, + "grad_norm": 0.05782344192266464, + "learning_rate": 6.892420156139106e-05, + "loss": 0.02180960476398468, + "step": 219020 + }, + { + "epoch": 31.09013484740951, + "grad_norm": 0.43696093559265137, + "learning_rate": 6.892278211497516e-05, + "loss": 0.0016447719186544417, + "step": 219030 + }, + { + "epoch": 31.091554293825407, + "grad_norm": 1.195481538772583, + "learning_rate": 6.892136266855926e-05, + "loss": 0.005552778393030167, + "step": 219040 + }, + { + "epoch": 31.092973740241305, + "grad_norm": 0.4772183895111084, + "learning_rate": 6.891994322214337e-05, + "loss": 0.02390202581882477, + "step": 219050 + }, + { + "epoch": 31.094393186657204, + "grad_norm": 3.698054313659668, + "learning_rate": 6.891852377572747e-05, + "loss": 0.005993632227182388, + "step": 219060 + }, + { + "epoch": 31.095812633073102, + "grad_norm": 0.3212509751319885, + "learning_rate": 6.891710432931158e-05, + "loss": 0.028082695603370667, + "step": 219070 + }, + { + "epoch": 31.097232079489, + "grad_norm": 1.9296443462371826, + "learning_rate": 6.891568488289568e-05, + "loss": 0.02758604884147644, + "step": 219080 + }, + { + "epoch": 31.0986515259049, + "grad_norm": 8.814900398254395, + "learning_rate": 6.891426543647977e-05, + "loss": 0.008135750889778137, + "step": 219090 + }, + { + "epoch": 31.100070972320793, + "grad_norm": 4.63097620010376, + "learning_rate": 6.891284599006388e-05, + "loss": 0.0107059508562088, + "step": 219100 + }, + { + "epoch": 31.10149041873669, + "grad_norm": 0.018751338124275208, + "learning_rate": 6.891142654364798e-05, + "loss": 0.007927034050226212, + "step": 219110 + }, + { + "epoch": 31.10290986515259, + "grad_norm": 
0.11793588101863861, + "learning_rate": 6.891000709723209e-05, + "loss": 0.0033687226474285125, + "step": 219120 + }, + { + "epoch": 31.10432931156849, + "grad_norm": 0.4982536733150482, + "learning_rate": 6.890858765081618e-05, + "loss": 0.01187099888920784, + "step": 219130 + }, + { + "epoch": 31.105748757984387, + "grad_norm": 0.007281994912773371, + "learning_rate": 6.890716820440029e-05, + "loss": 0.007238540053367615, + "step": 219140 + }, + { + "epoch": 31.107168204400285, + "grad_norm": 0.5757606029510498, + "learning_rate": 6.890574875798438e-05, + "loss": 0.002067934349179268, + "step": 219150 + }, + { + "epoch": 31.108587650816183, + "grad_norm": 0.11489522457122803, + "learning_rate": 6.89043293115685e-05, + "loss": 0.000993959978222847, + "step": 219160 + }, + { + "epoch": 31.110007097232078, + "grad_norm": 0.14458531141281128, + "learning_rate": 6.89029098651526e-05, + "loss": 0.027562275528907776, + "step": 219170 + }, + { + "epoch": 31.111426543647976, + "grad_norm": 0.6787318587303162, + "learning_rate": 6.890149041873669e-05, + "loss": 0.0014292217791080474, + "step": 219180 + }, + { + "epoch": 31.112845990063875, + "grad_norm": 0.003931879997253418, + "learning_rate": 6.89000709723208e-05, + "loss": 0.0029543161392211912, + "step": 219190 + }, + { + "epoch": 31.114265436479773, + "grad_norm": 0.09115836769342422, + "learning_rate": 6.88986515259049e-05, + "loss": 0.003065287694334984, + "step": 219200 + }, + { + "epoch": 31.11568488289567, + "grad_norm": 0.5486524701118469, + "learning_rate": 6.889723207948901e-05, + "loss": 0.014378365874290467, + "step": 219210 + }, + { + "epoch": 31.11710432931157, + "grad_norm": 0.793498694896698, + "learning_rate": 6.889581263307311e-05, + "loss": 0.005398581549525261, + "step": 219220 + }, + { + "epoch": 31.118523775727468, + "grad_norm": 0.01818156987428665, + "learning_rate": 6.88943931866572e-05, + "loss": 0.007694406062364578, + "step": 219230 + }, + { + "epoch": 31.119943222143363, + "grad_norm": 
0.4495571255683899, + "learning_rate": 6.88929737402413e-05, + "loss": 0.00571603886783123, + "step": 219240 + }, + { + "epoch": 31.12136266855926, + "grad_norm": 0.06300406157970428, + "learning_rate": 6.889155429382541e-05, + "loss": 0.04136318862438202, + "step": 219250 + }, + { + "epoch": 31.12278211497516, + "grad_norm": 0.0832853615283966, + "learning_rate": 6.889013484740951e-05, + "loss": 0.007575711607933045, + "step": 219260 + }, + { + "epoch": 31.124201561391057, + "grad_norm": 0.03774312138557434, + "learning_rate": 6.888871540099362e-05, + "loss": 0.0036825813353061677, + "step": 219270 + }, + { + "epoch": 31.125621007806956, + "grad_norm": 2.8512556552886963, + "learning_rate": 6.888729595457772e-05, + "loss": 0.051840394735336304, + "step": 219280 + }, + { + "epoch": 31.127040454222854, + "grad_norm": 0.026537630707025528, + "learning_rate": 6.888587650816182e-05, + "loss": 0.005039861053228378, + "step": 219290 + }, + { + "epoch": 31.128459900638752, + "grad_norm": 0.24212121963500977, + "learning_rate": 6.888445706174593e-05, + "loss": 0.013564282655715942, + "step": 219300 + }, + { + "epoch": 31.129879347054647, + "grad_norm": 0.22334450483322144, + "learning_rate": 6.888303761533002e-05, + "loss": 0.013692733645439149, + "step": 219310 + }, + { + "epoch": 31.131298793470545, + "grad_norm": 0.07293250411748886, + "learning_rate": 6.888161816891414e-05, + "loss": 0.0058593347668647764, + "step": 219320 + }, + { + "epoch": 31.132718239886444, + "grad_norm": 0.00785731803625822, + "learning_rate": 6.888019872249822e-05, + "loss": 0.00704069510102272, + "step": 219330 + }, + { + "epoch": 31.134137686302342, + "grad_norm": 0.5402151346206665, + "learning_rate": 6.887877927608233e-05, + "loss": 0.035092076659202574, + "step": 219340 + }, + { + "epoch": 31.13555713271824, + "grad_norm": 0.2919203042984009, + "learning_rate": 6.887735982966643e-05, + "loss": 0.038936829566955565, + "step": 219350 + }, + { + "epoch": 31.13697657913414, + "grad_norm": 
0.6526214480400085, + "learning_rate": 6.887594038325054e-05, + "loss": 0.015769262611865998, + "step": 219360 + }, + { + "epoch": 31.138396025550037, + "grad_norm": 0.353145956993103, + "learning_rate": 6.887452093683464e-05, + "loss": 0.024828499555587767, + "step": 219370 + }, + { + "epoch": 31.13981547196593, + "grad_norm": 0.3478642404079437, + "learning_rate": 6.887310149041875e-05, + "loss": 0.02033410519361496, + "step": 219380 + }, + { + "epoch": 31.14123491838183, + "grad_norm": 0.010300693102180958, + "learning_rate": 6.887168204400284e-05, + "loss": 0.02747294306755066, + "step": 219390 + }, + { + "epoch": 31.14265436479773, + "grad_norm": 0.3368070125579834, + "learning_rate": 6.887026259758694e-05, + "loss": 0.025465017557144164, + "step": 219400 + }, + { + "epoch": 31.144073811213627, + "grad_norm": 0.004111597780138254, + "learning_rate": 6.886884315117105e-05, + "loss": 0.004739707708358765, + "step": 219410 + }, + { + "epoch": 31.145493257629525, + "grad_norm": 0.355139821767807, + "learning_rate": 6.886742370475515e-05, + "loss": 0.008568046987056733, + "step": 219420 + }, + { + "epoch": 31.146912704045423, + "grad_norm": 0.0851484090089798, + "learning_rate": 6.886600425833926e-05, + "loss": 0.004787729680538177, + "step": 219430 + }, + { + "epoch": 31.14833215046132, + "grad_norm": 11.297762870788574, + "learning_rate": 6.886458481192334e-05, + "loss": 0.010216746479272842, + "step": 219440 + }, + { + "epoch": 31.149751596877216, + "grad_norm": 14.67841911315918, + "learning_rate": 6.886316536550746e-05, + "loss": 0.055475515127182004, + "step": 219450 + }, + { + "epoch": 31.151171043293115, + "grad_norm": 1.3889485597610474, + "learning_rate": 6.886174591909155e-05, + "loss": 0.03249044120311737, + "step": 219460 + }, + { + "epoch": 31.152590489709013, + "grad_norm": 0.044420938938856125, + "learning_rate": 6.886032647267566e-05, + "loss": 0.005462997406721115, + "step": 219470 + }, + { + "epoch": 31.15400993612491, + "grad_norm": 
0.3407905399799347, + "learning_rate": 6.885890702625976e-05, + "loss": 0.0022284138947725298, + "step": 219480 + }, + { + "epoch": 31.15542938254081, + "grad_norm": 0.00943032093346119, + "learning_rate": 6.885748757984386e-05, + "loss": 0.004657066613435745, + "step": 219490 + }, + { + "epoch": 31.156848828956708, + "grad_norm": 0.027316465973854065, + "learning_rate": 6.885606813342797e-05, + "loss": 0.04014646410942078, + "step": 219500 + }, + { + "epoch": 31.156848828956708, + "eval_accuracy": 0.991034526610288, + "eval_loss": 0.03361093997955322, + "eval_runtime": 33.6619, + "eval_samples_per_second": 467.205, + "eval_steps_per_second": 14.616, + "step": 219500 + }, + { + "epoch": 31.158268275372606, + "grad_norm": 0.0875822901725769, + "learning_rate": 6.885464868701207e-05, + "loss": 0.003999730944633484, + "step": 219510 + }, + { + "epoch": 31.1596877217885, + "grad_norm": 0.030154433101415634, + "learning_rate": 6.885322924059618e-05, + "loss": 0.004246667772531509, + "step": 219520 + }, + { + "epoch": 31.1611071682044, + "grad_norm": 1.0304555892944336, + "learning_rate": 6.885180979418027e-05, + "loss": 0.011771075427532196, + "step": 219530 + }, + { + "epoch": 31.162526614620297, + "grad_norm": 0.08551715314388275, + "learning_rate": 6.885039034776437e-05, + "loss": 0.005671734362840653, + "step": 219540 + }, + { + "epoch": 31.163946061036196, + "grad_norm": 11.727227210998535, + "learning_rate": 6.884897090134847e-05, + "loss": 0.03169549405574799, + "step": 219550 + }, + { + "epoch": 31.165365507452094, + "grad_norm": 3.786351203918457, + "learning_rate": 6.884755145493258e-05, + "loss": 0.0038984201848506926, + "step": 219560 + }, + { + "epoch": 31.166784953867992, + "grad_norm": 0.4970429837703705, + "learning_rate": 6.884613200851668e-05, + "loss": 0.001026470586657524, + "step": 219570 + }, + { + "epoch": 31.16820440028389, + "grad_norm": 0.10006266832351685, + "learning_rate": 6.884471256210079e-05, + "loss": 0.011575206369161605, + "step": 
219580 + }, + { + "epoch": 31.169623846699785, + "grad_norm": 0.004476322326809168, + "learning_rate": 6.884329311568489e-05, + "loss": 0.0259895384311676, + "step": 219590 + }, + { + "epoch": 31.171043293115684, + "grad_norm": 0.2796310782432556, + "learning_rate": 6.884187366926898e-05, + "loss": 0.02389564961194992, + "step": 219600 + }, + { + "epoch": 31.172462739531582, + "grad_norm": 0.030796727165579796, + "learning_rate": 6.88404542228531e-05, + "loss": 0.00034244731068611145, + "step": 219610 + }, + { + "epoch": 31.17388218594748, + "grad_norm": 0.09422022849321365, + "learning_rate": 6.883903477643719e-05, + "loss": 0.00976952314376831, + "step": 219620 + }, + { + "epoch": 31.17530163236338, + "grad_norm": 0.0668853223323822, + "learning_rate": 6.88376153300213e-05, + "loss": 0.01934586465358734, + "step": 219630 + }, + { + "epoch": 31.176721078779277, + "grad_norm": 0.18960557878017426, + "learning_rate": 6.883619588360539e-05, + "loss": 0.007330893725156784, + "step": 219640 + }, + { + "epoch": 31.178140525195175, + "grad_norm": 2.9332895278930664, + "learning_rate": 6.88347764371895e-05, + "loss": 0.002667032554745674, + "step": 219650 + }, + { + "epoch": 31.17955997161107, + "grad_norm": 0.2162364423274994, + "learning_rate": 6.88333569907736e-05, + "loss": 0.013262423872947692, + "step": 219660 + }, + { + "epoch": 31.18097941802697, + "grad_norm": 0.2596551477909088, + "learning_rate": 6.88319375443577e-05, + "loss": 0.003945933282375335, + "step": 219670 + }, + { + "epoch": 31.182398864442867, + "grad_norm": 0.2035646140575409, + "learning_rate": 6.88305180979418e-05, + "loss": 0.013613691926002503, + "step": 219680 + }, + { + "epoch": 31.183818310858765, + "grad_norm": 7.31583833694458, + "learning_rate": 6.88290986515259e-05, + "loss": 0.03246985971927643, + "step": 219690 + }, + { + "epoch": 31.185237757274663, + "grad_norm": 0.023580478504300117, + "learning_rate": 6.882767920511001e-05, + "loss": 0.005691865086555481, + "step": 219700 + }, + { 
+ "epoch": 31.18665720369056, + "grad_norm": 3.9662160873413086, + "learning_rate": 6.882625975869411e-05, + "loss": 0.015060454607009888, + "step": 219710 + }, + { + "epoch": 31.18807665010646, + "grad_norm": 4.014016628265381, + "learning_rate": 6.882484031227822e-05, + "loss": 0.024630264937877656, + "step": 219720 + }, + { + "epoch": 31.189496096522355, + "grad_norm": 0.01504560373723507, + "learning_rate": 6.882342086586232e-05, + "loss": 0.008061933517456054, + "step": 219730 + }, + { + "epoch": 31.190915542938253, + "grad_norm": 2.933316946029663, + "learning_rate": 6.882200141944643e-05, + "loss": 0.009068354964256287, + "step": 219740 + }, + { + "epoch": 31.19233498935415, + "grad_norm": 18.39481544494629, + "learning_rate": 6.882058197303051e-05, + "loss": 0.019167734682559966, + "step": 219750 + }, + { + "epoch": 31.19375443577005, + "grad_norm": 0.010713084600865841, + "learning_rate": 6.881916252661462e-05, + "loss": 0.004394670203328133, + "step": 219760 + }, + { + "epoch": 31.195173882185948, + "grad_norm": 0.5441117882728577, + "learning_rate": 6.881774308019872e-05, + "loss": 0.007805943489074707, + "step": 219770 + }, + { + "epoch": 31.196593328601846, + "grad_norm": 0.01917114481329918, + "learning_rate": 6.881632363378283e-05, + "loss": 0.0041467204689979555, + "step": 219780 + }, + { + "epoch": 31.198012775017745, + "grad_norm": 16.404027938842773, + "learning_rate": 6.881490418736693e-05, + "loss": 0.02029733210802078, + "step": 219790 + }, + { + "epoch": 31.19943222143364, + "grad_norm": 0.1506527066230774, + "learning_rate": 6.881348474095103e-05, + "loss": 0.0024494312703609465, + "step": 219800 + }, + { + "epoch": 31.200851667849538, + "grad_norm": 0.05264353007078171, + "learning_rate": 6.881206529453514e-05, + "loss": 0.027317333221435546, + "step": 219810 + }, + { + "epoch": 31.202271114265436, + "grad_norm": 0.5003364682197571, + "learning_rate": 6.881064584811923e-05, + "loss": 0.0013045407831668854, + "step": 219820 + }, + { + 
"epoch": 31.203690560681334, + "grad_norm": 0.23704756796360016, + "learning_rate": 6.880922640170335e-05, + "loss": 0.0012886855751276017, + "step": 219830 + }, + { + "epoch": 31.205110007097232, + "grad_norm": 0.33372074365615845, + "learning_rate": 6.880780695528744e-05, + "loss": 0.002421651780605316, + "step": 219840 + }, + { + "epoch": 31.20652945351313, + "grad_norm": 0.07559653371572495, + "learning_rate": 6.880638750887154e-05, + "loss": 0.016430996358394623, + "step": 219850 + }, + { + "epoch": 31.20794889992903, + "grad_norm": 0.035255178809165955, + "learning_rate": 6.880496806245564e-05, + "loss": 0.004492615908384323, + "step": 219860 + }, + { + "epoch": 31.209368346344924, + "grad_norm": 0.13424375653266907, + "learning_rate": 6.880354861603975e-05, + "loss": 0.0027627617120742796, + "step": 219870 + }, + { + "epoch": 31.210787792760822, + "grad_norm": 0.053438376635313034, + "learning_rate": 6.880212916962385e-05, + "loss": 0.032784882187843326, + "step": 219880 + }, + { + "epoch": 31.21220723917672, + "grad_norm": 0.10583126544952393, + "learning_rate": 6.880070972320796e-05, + "loss": 0.011003807187080383, + "step": 219890 + }, + { + "epoch": 31.21362668559262, + "grad_norm": 0.004241324495524168, + "learning_rate": 6.879929027679205e-05, + "loss": 0.001823725923895836, + "step": 219900 + }, + { + "epoch": 31.215046132008517, + "grad_norm": 0.033524297177791595, + "learning_rate": 6.879787083037615e-05, + "loss": 0.007850950211286544, + "step": 219910 + }, + { + "epoch": 31.216465578424415, + "grad_norm": 0.09793982654809952, + "learning_rate": 6.879645138396026e-05, + "loss": 0.0058452699333429335, + "step": 219920 + }, + { + "epoch": 31.217885024840314, + "grad_norm": 0.01204716507345438, + "learning_rate": 6.879503193754436e-05, + "loss": 0.005086122453212738, + "step": 219930 + }, + { + "epoch": 31.21930447125621, + "grad_norm": 0.013267734088003635, + "learning_rate": 6.879361249112847e-05, + "loss": 0.0014021117240190506, + "step": 219940 + 
}, + { + "epoch": 31.220723917672107, + "grad_norm": 0.011946274898946285, + "learning_rate": 6.879219304471255e-05, + "loss": 0.015305927395820618, + "step": 219950 + }, + { + "epoch": 31.222143364088005, + "grad_norm": 0.049911901354789734, + "learning_rate": 6.879077359829667e-05, + "loss": 0.0008541584014892578, + "step": 219960 + }, + { + "epoch": 31.223562810503903, + "grad_norm": 3.0907301902770996, + "learning_rate": 6.878935415188076e-05, + "loss": 0.0065728768706321715, + "step": 219970 + }, + { + "epoch": 31.2249822569198, + "grad_norm": 0.7252798676490784, + "learning_rate": 6.878793470546487e-05, + "loss": 0.004357310011982918, + "step": 219980 + }, + { + "epoch": 31.2264017033357, + "grad_norm": 0.11699429154396057, + "learning_rate": 6.878651525904898e-05, + "loss": 0.003483788296580315, + "step": 219990 + }, + { + "epoch": 31.2278211497516, + "grad_norm": 0.2622471749782562, + "learning_rate": 6.878509581263307e-05, + "loss": 0.039548730850219725, + "step": 220000 + }, + { + "epoch": 31.2278211497516, + "eval_accuracy": 0.9896356584218223, + "eval_loss": 0.04221094027161598, + "eval_runtime": 34.3652, + "eval_samples_per_second": 457.643, + "eval_steps_per_second": 14.317, + "step": 220000 + }, + { + "epoch": 31.229240596167493, + "grad_norm": 0.14322689175605774, + "learning_rate": 6.878367636621718e-05, + "loss": 0.0014741215854883194, + "step": 220010 + }, + { + "epoch": 31.23066004258339, + "grad_norm": 0.04442407563328743, + "learning_rate": 6.878225691980128e-05, + "loss": 0.0418280690908432, + "step": 220020 + }, + { + "epoch": 31.23207948899929, + "grad_norm": 0.3895086646080017, + "learning_rate": 6.878083747338539e-05, + "loss": 0.009897936880588532, + "step": 220030 + }, + { + "epoch": 31.233498935415188, + "grad_norm": 0.08528546243906021, + "learning_rate": 6.877941802696949e-05, + "loss": 0.012790778279304504, + "step": 220040 + }, + { + "epoch": 31.234918381831086, + "grad_norm": 0.03438718989491463, + "learning_rate": 
6.877799858055358e-05, + "loss": 0.007434043288230896, + "step": 220050 + }, + { + "epoch": 31.236337828246985, + "grad_norm": 0.2273254692554474, + "learning_rate": 6.877657913413768e-05, + "loss": 0.006926451623439789, + "step": 220060 + }, + { + "epoch": 31.237757274662883, + "grad_norm": 1.98627769947052, + "learning_rate": 6.877515968772179e-05, + "loss": 0.006444590538740158, + "step": 220070 + }, + { + "epoch": 31.239176721078778, + "grad_norm": 0.011688735336065292, + "learning_rate": 6.87737402413059e-05, + "loss": 0.002337285876274109, + "step": 220080 + }, + { + "epoch": 31.240596167494676, + "grad_norm": 0.09343311935663223, + "learning_rate": 6.877232079489e-05, + "loss": 0.001806166023015976, + "step": 220090 + }, + { + "epoch": 31.242015613910574, + "grad_norm": 5.192276954650879, + "learning_rate": 6.877090134847411e-05, + "loss": 0.002431897073984146, + "step": 220100 + }, + { + "epoch": 31.243435060326473, + "grad_norm": 0.00762700941413641, + "learning_rate": 6.87694819020582e-05, + "loss": 0.005531877651810646, + "step": 220110 + }, + { + "epoch": 31.24485450674237, + "grad_norm": 0.018797829747200012, + "learning_rate": 6.87680624556423e-05, + "loss": 0.017296023666858673, + "step": 220120 + }, + { + "epoch": 31.24627395315827, + "grad_norm": 5.334298610687256, + "learning_rate": 6.87666430092264e-05, + "loss": 0.009347623586654663, + "step": 220130 + }, + { + "epoch": 31.247693399574167, + "grad_norm": 4.458310604095459, + "learning_rate": 6.876522356281051e-05, + "loss": 0.01912481486797333, + "step": 220140 + }, + { + "epoch": 31.249112845990062, + "grad_norm": 0.8217010498046875, + "learning_rate": 6.876380411639461e-05, + "loss": 0.003133658319711685, + "step": 220150 + }, + { + "epoch": 31.25053229240596, + "grad_norm": 0.9042649865150452, + "learning_rate": 6.876238466997871e-05, + "loss": 0.00232887826859951, + "step": 220160 + }, + { + "epoch": 31.25195173882186, + "grad_norm": 8.454756736755371, + "learning_rate": 
6.876096522356282e-05, + "loss": 0.05061564445495605, + "step": 220170 + }, + { + "epoch": 31.253371185237757, + "grad_norm": 4.449641704559326, + "learning_rate": 6.875954577714692e-05, + "loss": 0.007485348731279373, + "step": 220180 + }, + { + "epoch": 31.254790631653655, + "grad_norm": 0.006894103717058897, + "learning_rate": 6.875812633073103e-05, + "loss": 0.029975342750549316, + "step": 220190 + }, + { + "epoch": 31.256210078069554, + "grad_norm": 0.16082240641117096, + "learning_rate": 6.875670688431512e-05, + "loss": 0.03345993161201477, + "step": 220200 + }, + { + "epoch": 31.257629524485452, + "grad_norm": 0.13730163872241974, + "learning_rate": 6.875528743789922e-05, + "loss": 0.021544355154037475, + "step": 220210 + }, + { + "epoch": 31.259048970901347, + "grad_norm": 0.18533159792423248, + "learning_rate": 6.875386799148332e-05, + "loss": 0.01824038177728653, + "step": 220220 + }, + { + "epoch": 31.260468417317245, + "grad_norm": 9.25625991821289, + "learning_rate": 6.875244854506743e-05, + "loss": 0.03746497333049774, + "step": 220230 + }, + { + "epoch": 31.261887863733143, + "grad_norm": 0.006450203713029623, + "learning_rate": 6.875102909865153e-05, + "loss": 0.02314048111438751, + "step": 220240 + }, + { + "epoch": 31.26330731014904, + "grad_norm": 10.893736839294434, + "learning_rate": 6.874960965223564e-05, + "loss": 0.06902470588684081, + "step": 220250 + }, + { + "epoch": 31.26472675656494, + "grad_norm": 0.08596265316009521, + "learning_rate": 6.874819020581974e-05, + "loss": 0.06462767124176025, + "step": 220260 + }, + { + "epoch": 31.26614620298084, + "grad_norm": 0.13541658222675323, + "learning_rate": 6.874677075940383e-05, + "loss": 0.008662118762731551, + "step": 220270 + }, + { + "epoch": 31.267565649396737, + "grad_norm": 10.71608829498291, + "learning_rate": 6.874535131298794e-05, + "loss": 0.017952930927276612, + "step": 220280 + }, + { + "epoch": 31.26898509581263, + "grad_norm": 2.6246373653411865, + "learning_rate": 
6.874393186657204e-05, + "loss": 0.025669825077056885, + "step": 220290 + }, + { + "epoch": 31.27040454222853, + "grad_norm": 8.242669105529785, + "learning_rate": 6.874251242015615e-05, + "loss": 0.016703905165195466, + "step": 220300 + }, + { + "epoch": 31.271823988644428, + "grad_norm": 0.023293420672416687, + "learning_rate": 6.874109297374024e-05, + "loss": 0.004116210713982582, + "step": 220310 + }, + { + "epoch": 31.273243435060326, + "grad_norm": 9.093667030334473, + "learning_rate": 6.873967352732435e-05, + "loss": 0.012115438282489777, + "step": 220320 + }, + { + "epoch": 31.274662881476225, + "grad_norm": 1.0503547191619873, + "learning_rate": 6.873825408090844e-05, + "loss": 0.016875916719436647, + "step": 220330 + }, + { + "epoch": 31.276082327892123, + "grad_norm": 8.20504093170166, + "learning_rate": 6.873683463449256e-05, + "loss": 0.023848150670528413, + "step": 220340 + }, + { + "epoch": 31.27750177430802, + "grad_norm": 0.02279217727482319, + "learning_rate": 6.873541518807665e-05, + "loss": 0.05714033246040344, + "step": 220350 + }, + { + "epoch": 31.278921220723916, + "grad_norm": 2.2766528129577637, + "learning_rate": 6.873399574166075e-05, + "loss": 0.016217365860939026, + "step": 220360 + }, + { + "epoch": 31.280340667139814, + "grad_norm": 0.4839312434196472, + "learning_rate": 6.873257629524486e-05, + "loss": 0.006292640417814255, + "step": 220370 + }, + { + "epoch": 31.281760113555713, + "grad_norm": 0.7582332491874695, + "learning_rate": 6.873115684882896e-05, + "loss": 0.006877187639474869, + "step": 220380 + }, + { + "epoch": 31.28317955997161, + "grad_norm": 5.989778518676758, + "learning_rate": 6.872973740241307e-05, + "loss": 0.004462876170873642, + "step": 220390 + }, + { + "epoch": 31.28459900638751, + "grad_norm": 0.012331314384937286, + "learning_rate": 6.872831795599717e-05, + "loss": 0.004672634974122047, + "step": 220400 + }, + { + "epoch": 31.286018452803408, + "grad_norm": 0.09531120210886002, + "learning_rate": 
6.872689850958126e-05, + "loss": 0.010312440991401672, + "step": 220410 + }, + { + "epoch": 31.287437899219306, + "grad_norm": 3.943279504776001, + "learning_rate": 6.872547906316536e-05, + "loss": 0.007172832638025284, + "step": 220420 + }, + { + "epoch": 31.2888573456352, + "grad_norm": 0.037402164191007614, + "learning_rate": 6.872405961674947e-05, + "loss": 0.004423761740326881, + "step": 220430 + }, + { + "epoch": 31.2902767920511, + "grad_norm": 0.037320345640182495, + "learning_rate": 6.872264017033357e-05, + "loss": 0.013656924664974212, + "step": 220440 + }, + { + "epoch": 31.291696238466997, + "grad_norm": 4.211592197418213, + "learning_rate": 6.872122072391768e-05, + "loss": 0.029834932088851927, + "step": 220450 + }, + { + "epoch": 31.293115684882896, + "grad_norm": 2.359312057495117, + "learning_rate": 6.871980127750178e-05, + "loss": 0.011903828382492066, + "step": 220460 + }, + { + "epoch": 31.294535131298794, + "grad_norm": 0.15735816955566406, + "learning_rate": 6.871838183108588e-05, + "loss": 0.016021013259887695, + "step": 220470 + }, + { + "epoch": 31.295954577714692, + "grad_norm": 11.047264099121094, + "learning_rate": 6.871696238466999e-05, + "loss": 0.044622236490249635, + "step": 220480 + }, + { + "epoch": 31.29737402413059, + "grad_norm": 0.11851929873228073, + "learning_rate": 6.871554293825408e-05, + "loss": 0.008498968183994293, + "step": 220490 + }, + { + "epoch": 31.298793470546485, + "grad_norm": 0.030306046828627586, + "learning_rate": 6.87141234918382e-05, + "loss": 0.0029967699199914934, + "step": 220500 + }, + { + "epoch": 31.298793470546485, + "eval_accuracy": 0.9821326381382336, + "eval_loss": 0.08388637751340866, + "eval_runtime": 33.9104, + "eval_samples_per_second": 463.781, + "eval_steps_per_second": 14.509, + "step": 220500 + }, + { + "epoch": 31.300212916962384, + "grad_norm": 0.12262436002492905, + "learning_rate": 6.871270404542229e-05, + "loss": 0.09183643460273742, + "step": 220510 + }, + { + "epoch": 
31.301632363378282, + "grad_norm": 0.16052350401878357, + "learning_rate": 6.871128459900639e-05, + "loss": 0.048023977875709535, + "step": 220520 + }, + { + "epoch": 31.30305180979418, + "grad_norm": 14.949708938598633, + "learning_rate": 6.870986515259049e-05, + "loss": 0.07848787307739258, + "step": 220530 + }, + { + "epoch": 31.30447125621008, + "grad_norm": 6.520812511444092, + "learning_rate": 6.87084457061746e-05, + "loss": 0.010861020535230637, + "step": 220540 + }, + { + "epoch": 31.305890702625977, + "grad_norm": 0.008859147317707539, + "learning_rate": 6.87070262597587e-05, + "loss": 0.024957549571990967, + "step": 220550 + }, + { + "epoch": 31.307310149041875, + "grad_norm": 1.6959044933319092, + "learning_rate": 6.87056068133428e-05, + "loss": 0.02333634942770004, + "step": 220560 + }, + { + "epoch": 31.30872959545777, + "grad_norm": 0.0373995415866375, + "learning_rate": 6.87041873669269e-05, + "loss": 0.032201313972473146, + "step": 220570 + }, + { + "epoch": 31.310149041873668, + "grad_norm": 0.6864528059959412, + "learning_rate": 6.8702767920511e-05, + "loss": 0.015788394212722778, + "step": 220580 + }, + { + "epoch": 31.311568488289566, + "grad_norm": 0.06553277373313904, + "learning_rate": 6.870134847409511e-05, + "loss": 0.021818920969963074, + "step": 220590 + }, + { + "epoch": 31.312987934705465, + "grad_norm": 0.024674147367477417, + "learning_rate": 6.869992902767921e-05, + "loss": 0.052134263515472415, + "step": 220600 + }, + { + "epoch": 31.314407381121363, + "grad_norm": 0.009466307237744331, + "learning_rate": 6.869850958126332e-05, + "loss": 0.005807154253125191, + "step": 220610 + }, + { + "epoch": 31.31582682753726, + "grad_norm": 0.1484924703836441, + "learning_rate": 6.86970901348474e-05, + "loss": 0.042452472448349, + "step": 220620 + }, + { + "epoch": 31.31724627395316, + "grad_norm": 0.009469152428209782, + "learning_rate": 6.869567068843151e-05, + "loss": 0.014630787074565887, + "step": 220630 + }, + { + "epoch": 
31.318665720369054, + "grad_norm": 0.008286673575639725, + "learning_rate": 6.869425124201561e-05, + "loss": 0.02866791784763336, + "step": 220640 + }, + { + "epoch": 31.320085166784953, + "grad_norm": 0.48190221190452576, + "learning_rate": 6.869283179559972e-05, + "loss": 0.00397137925028801, + "step": 220650 + }, + { + "epoch": 31.32150461320085, + "grad_norm": 0.12000402063131332, + "learning_rate": 6.869141234918382e-05, + "loss": 0.022017350792884825, + "step": 220660 + }, + { + "epoch": 31.32292405961675, + "grad_norm": 0.12064395844936371, + "learning_rate": 6.868999290276792e-05, + "loss": 0.01281091570854187, + "step": 220670 + }, + { + "epoch": 31.324343506032648, + "grad_norm": 2.9287445545196533, + "learning_rate": 6.868857345635203e-05, + "loss": 0.02742786705493927, + "step": 220680 + }, + { + "epoch": 31.325762952448546, + "grad_norm": 0.018959909677505493, + "learning_rate": 6.868715400993613e-05, + "loss": 0.008823098242282867, + "step": 220690 + }, + { + "epoch": 31.327182398864444, + "grad_norm": 0.020790157839655876, + "learning_rate": 6.868573456352024e-05, + "loss": 0.0013614587485790252, + "step": 220700 + }, + { + "epoch": 31.32860184528034, + "grad_norm": 0.3622206151485443, + "learning_rate": 6.868431511710433e-05, + "loss": 0.05382434129714966, + "step": 220710 + }, + { + "epoch": 31.330021291696237, + "grad_norm": 0.06045395880937576, + "learning_rate": 6.868289567068843e-05, + "loss": 0.03141040205955505, + "step": 220720 + }, + { + "epoch": 31.331440738112136, + "grad_norm": 0.04196469485759735, + "learning_rate": 6.868147622427253e-05, + "loss": 0.0008498478680849075, + "step": 220730 + }, + { + "epoch": 31.332860184528034, + "grad_norm": 0.004878521431237459, + "learning_rate": 6.868005677785664e-05, + "loss": 0.004365305602550507, + "step": 220740 + }, + { + "epoch": 31.334279630943932, + "grad_norm": 0.04860696196556091, + "learning_rate": 6.867863733144074e-05, + "loss": 0.010660245269536971, + "step": 220750 + }, + { + "epoch": 
31.33569907735983, + "grad_norm": 4.989412784576416, + "learning_rate": 6.867721788502485e-05, + "loss": 0.006016276776790619, + "step": 220760 + }, + { + "epoch": 31.33711852377573, + "grad_norm": 0.02932031825184822, + "learning_rate": 6.867579843860895e-05, + "loss": 0.0025256693363189696, + "step": 220770 + }, + { + "epoch": 31.338537970191624, + "grad_norm": 0.37385082244873047, + "learning_rate": 6.867437899219304e-05, + "loss": 0.011817613989114762, + "step": 220780 + }, + { + "epoch": 31.339957416607522, + "grad_norm": 0.008826169185340405, + "learning_rate": 6.867295954577715e-05, + "loss": 0.02242153137922287, + "step": 220790 + }, + { + "epoch": 31.34137686302342, + "grad_norm": 2.4080562591552734, + "learning_rate": 6.867154009936125e-05, + "loss": 0.015804225206375123, + "step": 220800 + }, + { + "epoch": 31.34279630943932, + "grad_norm": 0.5428903102874756, + "learning_rate": 6.867012065294536e-05, + "loss": 0.0030164476484060287, + "step": 220810 + }, + { + "epoch": 31.344215755855217, + "grad_norm": 0.11118257790803909, + "learning_rate": 6.866870120652946e-05, + "loss": 0.009034644067287444, + "step": 220820 + }, + { + "epoch": 31.345635202271115, + "grad_norm": 0.13400641083717346, + "learning_rate": 6.866728176011356e-05, + "loss": 0.03119920492172241, + "step": 220830 + }, + { + "epoch": 31.347054648687013, + "grad_norm": 0.06523504853248596, + "learning_rate": 6.866586231369765e-05, + "loss": 0.014598830044269562, + "step": 220840 + }, + { + "epoch": 31.348474095102908, + "grad_norm": 11.382403373718262, + "learning_rate": 6.866444286728177e-05, + "loss": 0.06250744462013244, + "step": 220850 + }, + { + "epoch": 31.349893541518806, + "grad_norm": 0.005892597138881683, + "learning_rate": 6.866302342086586e-05, + "loss": 0.021600204706192016, + "step": 220860 + }, + { + "epoch": 31.351312987934705, + "grad_norm": 1.8148778676986694, + "learning_rate": 6.866160397444997e-05, + "loss": 0.010081024467945099, + "step": 220870 + }, + { + "epoch": 
31.352732434350603, + "grad_norm": 2.618760824203491, + "learning_rate": 6.866018452803407e-05, + "loss": 0.0017087813466787338, + "step": 220880 + }, + { + "epoch": 31.3541518807665, + "grad_norm": 0.012027561664581299, + "learning_rate": 6.865876508161817e-05, + "loss": 0.00919821411371231, + "step": 220890 + }, + { + "epoch": 31.3555713271824, + "grad_norm": 1.117550015449524, + "learning_rate": 6.865734563520228e-05, + "loss": 0.01573888063430786, + "step": 220900 + }, + { + "epoch": 31.356990773598298, + "grad_norm": 10.42905330657959, + "learning_rate": 6.865592618878638e-05, + "loss": 0.02586914598941803, + "step": 220910 + }, + { + "epoch": 31.358410220014193, + "grad_norm": 3.3474268913269043, + "learning_rate": 6.865450674237049e-05, + "loss": 0.008779560029506684, + "step": 220920 + }, + { + "epoch": 31.35982966643009, + "grad_norm": 3.3236522674560547, + "learning_rate": 6.865308729595457e-05, + "loss": 0.025068145990371705, + "step": 220930 + }, + { + "epoch": 31.36124911284599, + "grad_norm": 0.14701086282730103, + "learning_rate": 6.865166784953868e-05, + "loss": 0.0026981223374605177, + "step": 220940 + }, + { + "epoch": 31.362668559261888, + "grad_norm": 0.026379866525530815, + "learning_rate": 6.865024840312278e-05, + "loss": 0.0024911422282457353, + "step": 220950 + }, + { + "epoch": 31.364088005677786, + "grad_norm": 1.7901653051376343, + "learning_rate": 6.864882895670689e-05, + "loss": 0.004488689824938774, + "step": 220960 + }, + { + "epoch": 31.365507452093684, + "grad_norm": 0.021580470725893974, + "learning_rate": 6.864740951029099e-05, + "loss": 0.0019684329628944395, + "step": 220970 + }, + { + "epoch": 31.366926898509583, + "grad_norm": 0.03386443853378296, + "learning_rate": 6.864599006387509e-05, + "loss": 0.03941264450550079, + "step": 220980 + }, + { + "epoch": 31.368346344925477, + "grad_norm": 0.2975333034992218, + "learning_rate": 6.86445706174592e-05, + "loss": 0.005241693183779717, + "step": 220990 + }, + { + "epoch": 
31.369765791341376, + "grad_norm": 0.001883160206489265, + "learning_rate": 6.86431511710433e-05, + "loss": 0.005766087025403976, + "step": 221000 + }, + { + "epoch": 31.369765791341376, + "eval_accuracy": 0.9877281108920964, + "eval_loss": 0.051218535751104355, + "eval_runtime": 33.331, + "eval_samples_per_second": 471.844, + "eval_steps_per_second": 14.761, + "step": 221000 + }, + { + "epoch": 31.371185237757274, + "grad_norm": 0.09532391279935837, + "learning_rate": 6.86417317246274e-05, + "loss": 0.006311381608247757, + "step": 221010 + }, + { + "epoch": 31.372604684173172, + "grad_norm": 1.1145035028457642, + "learning_rate": 6.86403122782115e-05, + "loss": 0.03544992506504059, + "step": 221020 + }, + { + "epoch": 31.37402413058907, + "grad_norm": 0.2479870468378067, + "learning_rate": 6.86388928317956e-05, + "loss": 0.005884605273604393, + "step": 221030 + }, + { + "epoch": 31.37544357700497, + "grad_norm": 24.34555435180664, + "learning_rate": 6.86374733853797e-05, + "loss": 0.023039628565311433, + "step": 221040 + }, + { + "epoch": 31.376863023420867, + "grad_norm": 0.06121248006820679, + "learning_rate": 6.863605393896381e-05, + "loss": 0.010092812776565551, + "step": 221050 + }, + { + "epoch": 31.378282469836762, + "grad_norm": 0.04757869616150856, + "learning_rate": 6.86346344925479e-05, + "loss": 0.0319668173789978, + "step": 221060 + }, + { + "epoch": 31.37970191625266, + "grad_norm": 0.2323496788740158, + "learning_rate": 6.863321504613202e-05, + "loss": 0.020358264446258545, + "step": 221070 + }, + { + "epoch": 31.38112136266856, + "grad_norm": 0.6113802790641785, + "learning_rate": 6.863179559971611e-05, + "loss": 0.03444598913192749, + "step": 221080 + }, + { + "epoch": 31.382540809084457, + "grad_norm": 8.637332916259766, + "learning_rate": 6.863037615330021e-05, + "loss": 0.011593516170978545, + "step": 221090 + }, + { + "epoch": 31.383960255500355, + "grad_norm": 0.051972124725580215, + "learning_rate": 6.862895670688432e-05, + "loss": 
0.05317922830581665, + "step": 221100 + }, + { + "epoch": 31.385379701916253, + "grad_norm": 0.0507974699139595, + "learning_rate": 6.862753726046842e-05, + "loss": 0.05862310528755188, + "step": 221110 + }, + { + "epoch": 31.386799148332152, + "grad_norm": 0.007907535880804062, + "learning_rate": 6.862611781405253e-05, + "loss": 0.0021351084113121034, + "step": 221120 + }, + { + "epoch": 31.388218594748047, + "grad_norm": 3.390395402908325, + "learning_rate": 6.862469836763661e-05, + "loss": 0.014491751790046692, + "step": 221130 + }, + { + "epoch": 31.389638041163945, + "grad_norm": 0.8060437440872192, + "learning_rate": 6.862327892122072e-05, + "loss": 0.00459304116666317, + "step": 221140 + }, + { + "epoch": 31.391057487579843, + "grad_norm": 1.248711347579956, + "learning_rate": 6.862185947480482e-05, + "loss": 0.00559164434671402, + "step": 221150 + }, + { + "epoch": 31.39247693399574, + "grad_norm": 0.6024468541145325, + "learning_rate": 6.862044002838893e-05, + "loss": 0.028774493932724, + "step": 221160 + }, + { + "epoch": 31.39389638041164, + "grad_norm": 0.08870872110128403, + "learning_rate": 6.861902058197303e-05, + "loss": 0.020894166827201844, + "step": 221170 + }, + { + "epoch": 31.395315826827538, + "grad_norm": 4.136611461639404, + "learning_rate": 6.861760113555714e-05, + "loss": 0.01361497789621353, + "step": 221180 + }, + { + "epoch": 31.396735273243436, + "grad_norm": 0.04772168770432472, + "learning_rate": 6.861618168914124e-05, + "loss": 0.003031349927186966, + "step": 221190 + }, + { + "epoch": 31.39815471965933, + "grad_norm": 0.03946156054735184, + "learning_rate": 6.861476224272534e-05, + "loss": 0.0072357386350631716, + "step": 221200 + }, + { + "epoch": 31.39957416607523, + "grad_norm": 0.718329906463623, + "learning_rate": 6.861334279630945e-05, + "loss": 0.014791537821292878, + "step": 221210 + }, + { + "epoch": 31.400993612491128, + "grad_norm": 0.032200928777456284, + "learning_rate": 6.861192334989354e-05, + "loss": 
0.003921668976545334, + "step": 221220 + }, + { + "epoch": 31.402413058907026, + "grad_norm": 0.10213381052017212, + "learning_rate": 6.861050390347766e-05, + "loss": 0.0585306465625763, + "step": 221230 + }, + { + "epoch": 31.403832505322924, + "grad_norm": 0.7277308106422424, + "learning_rate": 6.860908445706174e-05, + "loss": 0.023336485028266907, + "step": 221240 + }, + { + "epoch": 31.405251951738823, + "grad_norm": 0.0674285814166069, + "learning_rate": 6.860766501064585e-05, + "loss": 0.0015948142856359482, + "step": 221250 + }, + { + "epoch": 31.40667139815472, + "grad_norm": 7.643856525421143, + "learning_rate": 6.860624556422995e-05, + "loss": 0.010558698326349258, + "step": 221260 + }, + { + "epoch": 31.408090844570616, + "grad_norm": 0.03461117297410965, + "learning_rate": 6.860482611781406e-05, + "loss": 0.009204374253749847, + "step": 221270 + }, + { + "epoch": 31.409510290986514, + "grad_norm": 0.030789652839303017, + "learning_rate": 6.860340667139816e-05, + "loss": 0.029488492012023925, + "step": 221280 + }, + { + "epoch": 31.410929737402412, + "grad_norm": 0.5532309412956238, + "learning_rate": 6.860198722498225e-05, + "loss": 0.002060743048787117, + "step": 221290 + }, + { + "epoch": 31.41234918381831, + "grad_norm": 0.08315418660640717, + "learning_rate": 6.860056777856636e-05, + "loss": 0.0022279083728790282, + "step": 221300 + }, + { + "epoch": 31.41376863023421, + "grad_norm": 0.024847159162163734, + "learning_rate": 6.859914833215046e-05, + "loss": 0.0014483634382486343, + "step": 221310 + }, + { + "epoch": 31.415188076650107, + "grad_norm": 0.1219697892665863, + "learning_rate": 6.859772888573457e-05, + "loss": 0.007374505698680878, + "step": 221320 + }, + { + "epoch": 31.416607523066006, + "grad_norm": 0.01081501878798008, + "learning_rate": 6.859630943931867e-05, + "loss": 0.007887397706508637, + "step": 221330 + }, + { + "epoch": 31.4180269694819, + "grad_norm": 0.8837466835975647, + "learning_rate": 6.859488999290277e-05, + "loss": 
0.0032607704401016236, + "step": 221340 + }, + { + "epoch": 31.4194464158978, + "grad_norm": 0.18638451397418976, + "learning_rate": 6.859347054648686e-05, + "loss": 0.003052917867898941, + "step": 221350 + }, + { + "epoch": 31.420865862313697, + "grad_norm": 0.04929227754473686, + "learning_rate": 6.859205110007098e-05, + "loss": 0.011839616298675536, + "step": 221360 + }, + { + "epoch": 31.422285308729595, + "grad_norm": 0.04206482693552971, + "learning_rate": 6.859063165365507e-05, + "loss": 0.0008335180580615997, + "step": 221370 + }, + { + "epoch": 31.423704755145494, + "grad_norm": 8.18696117401123, + "learning_rate": 6.858921220723918e-05, + "loss": 0.016082735359668733, + "step": 221380 + }, + { + "epoch": 31.425124201561392, + "grad_norm": 0.49574652314186096, + "learning_rate": 6.858779276082328e-05, + "loss": 0.0008584089577198028, + "step": 221390 + }, + { + "epoch": 31.42654364797729, + "grad_norm": 0.02681078389286995, + "learning_rate": 6.858637331440738e-05, + "loss": 0.0021269913762807847, + "step": 221400 + }, + { + "epoch": 31.427963094393185, + "grad_norm": 6.209242343902588, + "learning_rate": 6.858495386799149e-05, + "loss": 0.011580832302570343, + "step": 221410 + }, + { + "epoch": 31.429382540809083, + "grad_norm": 0.08684221655130386, + "learning_rate": 6.858353442157559e-05, + "loss": 0.001984124630689621, + "step": 221420 + }, + { + "epoch": 31.43080198722498, + "grad_norm": 1.737552285194397, + "learning_rate": 6.85821149751597e-05, + "loss": 0.009815201908349992, + "step": 221430 + }, + { + "epoch": 31.43222143364088, + "grad_norm": 5.846996784210205, + "learning_rate": 6.858069552874378e-05, + "loss": 0.008263161033391952, + "step": 221440 + }, + { + "epoch": 31.433640880056778, + "grad_norm": 0.09669127315282822, + "learning_rate": 6.857927608232789e-05, + "loss": 0.00524287186563015, + "step": 221450 + }, + { + "epoch": 31.435060326472676, + "grad_norm": 0.05565854534506798, + "learning_rate": 6.857785663591199e-05, + "loss": 
0.02404606342315674, + "step": 221460 + }, + { + "epoch": 31.436479772888575, + "grad_norm": 0.2932218611240387, + "learning_rate": 6.85764371894961e-05, + "loss": 0.0011381961405277253, + "step": 221470 + }, + { + "epoch": 31.43789921930447, + "grad_norm": 0.15158407390117645, + "learning_rate": 6.857501774308021e-05, + "loss": 0.009587246179580688, + "step": 221480 + }, + { + "epoch": 31.439318665720368, + "grad_norm": 0.0551505982875824, + "learning_rate": 6.85735982966643e-05, + "loss": 0.004476193711161614, + "step": 221490 + }, + { + "epoch": 31.440738112136266, + "grad_norm": 0.0677921399474144, + "learning_rate": 6.85721788502484e-05, + "loss": 0.003802812099456787, + "step": 221500 + }, + { + "epoch": 31.440738112136266, + "eval_accuracy": 0.991034526610288, + "eval_loss": 0.032093144953250885, + "eval_runtime": 33.6548, + "eval_samples_per_second": 467.303, + "eval_steps_per_second": 14.619, + "step": 221500 + }, + { + "epoch": 31.442157558552164, + "grad_norm": 0.2810420095920563, + "learning_rate": 6.85707594038325e-05, + "loss": 0.002024787291884422, + "step": 221510 + }, + { + "epoch": 31.443577004968063, + "grad_norm": 5.361693382263184, + "learning_rate": 6.856933995741661e-05, + "loss": 0.007444373518228531, + "step": 221520 + }, + { + "epoch": 31.44499645138396, + "grad_norm": 0.016330039128661156, + "learning_rate": 6.856792051100071e-05, + "loss": 0.014786088466644287, + "step": 221530 + }, + { + "epoch": 31.44641589779986, + "grad_norm": 0.10246646404266357, + "learning_rate": 6.856650106458482e-05, + "loss": 0.005022970959544182, + "step": 221540 + }, + { + "epoch": 31.447835344215754, + "grad_norm": 4.661182403564453, + "learning_rate": 6.856508161816891e-05, + "loss": 0.015033535659313202, + "step": 221550 + }, + { + "epoch": 31.449254790631652, + "grad_norm": 0.006836557295173407, + "learning_rate": 6.856366217175302e-05, + "loss": 0.009295473247766495, + "step": 221560 + }, + { + "epoch": 31.45067423704755, + "grad_norm": 
0.8799571394920349, + "learning_rate": 6.856224272533713e-05, + "loss": 0.014251533150672912, + "step": 221570 + }, + { + "epoch": 31.45209368346345, + "grad_norm": 0.009611022658646107, + "learning_rate": 6.856082327892123e-05, + "loss": 0.004048918560147286, + "step": 221580 + }, + { + "epoch": 31.453513129879347, + "grad_norm": 0.03772978484630585, + "learning_rate": 6.855940383250534e-05, + "loss": 0.0012250050902366638, + "step": 221590 + }, + { + "epoch": 31.454932576295246, + "grad_norm": 22.376544952392578, + "learning_rate": 6.855798438608942e-05, + "loss": 0.028877830505371092, + "step": 221600 + }, + { + "epoch": 31.456352022711144, + "grad_norm": 0.2531074583530426, + "learning_rate": 6.855656493967353e-05, + "loss": 0.002277180552482605, + "step": 221610 + }, + { + "epoch": 31.45777146912704, + "grad_norm": 0.5765177607536316, + "learning_rate": 6.855514549325763e-05, + "loss": 0.013197356462478637, + "step": 221620 + }, + { + "epoch": 31.459190915542937, + "grad_norm": 0.11405781656503677, + "learning_rate": 6.855372604684174e-05, + "loss": 0.06075608730316162, + "step": 221630 + }, + { + "epoch": 31.460610361958835, + "grad_norm": 0.057894591242074966, + "learning_rate": 6.855230660042584e-05, + "loss": 0.043431058526039124, + "step": 221640 + }, + { + "epoch": 31.462029808374734, + "grad_norm": 0.051882654428482056, + "learning_rate": 6.855088715400994e-05, + "loss": 0.004258008301258087, + "step": 221650 + }, + { + "epoch": 31.463449254790632, + "grad_norm": 16.5135440826416, + "learning_rate": 6.854946770759405e-05, + "loss": 0.04157745838165283, + "step": 221660 + }, + { + "epoch": 31.46486870120653, + "grad_norm": 0.2301860749721527, + "learning_rate": 6.854804826117814e-05, + "loss": 0.013105246424674987, + "step": 221670 + }, + { + "epoch": 31.46628814762243, + "grad_norm": 0.23665274679660797, + "learning_rate": 6.854662881476225e-05, + "loss": 0.0038491491228342055, + "step": 221680 + }, + { + "epoch": 31.467707594038323, + "grad_norm": 
0.0073006413877010345, + "learning_rate": 6.854520936834635e-05, + "loss": 0.0018721722066402436, + "step": 221690 + }, + { + "epoch": 31.46912704045422, + "grad_norm": 2.3442227840423584, + "learning_rate": 6.854378992193045e-05, + "loss": 0.004265763610601425, + "step": 221700 + }, + { + "epoch": 31.47054648687012, + "grad_norm": 0.044203873723745346, + "learning_rate": 6.854237047551455e-05, + "loss": 0.010742741078138352, + "step": 221710 + }, + { + "epoch": 31.471965933286018, + "grad_norm": 0.23313207924365997, + "learning_rate": 6.854095102909866e-05, + "loss": 0.02426009774208069, + "step": 221720 + }, + { + "epoch": 31.473385379701917, + "grad_norm": 1.4135171175003052, + "learning_rate": 6.853953158268275e-05, + "loss": 0.04656355977058411, + "step": 221730 + }, + { + "epoch": 31.474804826117815, + "grad_norm": 0.061432838439941406, + "learning_rate": 6.853811213626687e-05, + "loss": 0.013503937423229218, + "step": 221740 + }, + { + "epoch": 31.476224272533713, + "grad_norm": 5.264010906219482, + "learning_rate": 6.853669268985096e-05, + "loss": 0.002039783075451851, + "step": 221750 + }, + { + "epoch": 31.477643718949608, + "grad_norm": 0.016986379399895668, + "learning_rate": 6.853527324343506e-05, + "loss": 0.000998028740286827, + "step": 221760 + }, + { + "epoch": 31.479063165365506, + "grad_norm": 0.04877394437789917, + "learning_rate": 6.853385379701917e-05, + "loss": 0.01133342981338501, + "step": 221770 + }, + { + "epoch": 31.480482611781405, + "grad_norm": 0.08726673573255539, + "learning_rate": 6.853243435060327e-05, + "loss": 0.0013738051056861877, + "step": 221780 + }, + { + "epoch": 31.481902058197303, + "grad_norm": 0.0032661438453942537, + "learning_rate": 6.853101490418738e-05, + "loss": 0.013262896239757538, + "step": 221790 + }, + { + "epoch": 31.4833215046132, + "grad_norm": 0.2456192523241043, + "learning_rate": 6.852959545777146e-05, + "loss": 0.014154723286628723, + "step": 221800 + }, + { + "epoch": 31.4847409510291, + "grad_norm": 
2.345886468887329, + "learning_rate": 6.852817601135557e-05, + "loss": 0.028337162733078004, + "step": 221810 + }, + { + "epoch": 31.486160397444998, + "grad_norm": 0.5350990295410156, + "learning_rate": 6.852675656493967e-05, + "loss": 0.014966361224651337, + "step": 221820 + }, + { + "epoch": 31.487579843860892, + "grad_norm": 0.10023520886898041, + "learning_rate": 6.852533711852378e-05, + "loss": 0.0017322998493909836, + "step": 221830 + }, + { + "epoch": 31.48899929027679, + "grad_norm": 0.0070836711674928665, + "learning_rate": 6.852391767210788e-05, + "loss": 0.009242992103099822, + "step": 221840 + }, + { + "epoch": 31.49041873669269, + "grad_norm": 0.07407588511705399, + "learning_rate": 6.852249822569199e-05, + "loss": 0.023464563488960265, + "step": 221850 + }, + { + "epoch": 31.491838183108587, + "grad_norm": 0.2658078670501709, + "learning_rate": 6.852107877927609e-05, + "loss": 0.015133580565452576, + "step": 221860 + }, + { + "epoch": 31.493257629524486, + "grad_norm": 18.3512020111084, + "learning_rate": 6.851965933286019e-05, + "loss": 0.047467547655105594, + "step": 221870 + }, + { + "epoch": 31.494677075940384, + "grad_norm": 13.289896011352539, + "learning_rate": 6.85182398864443e-05, + "loss": 0.017280672490596772, + "step": 221880 + }, + { + "epoch": 31.496096522356282, + "grad_norm": 0.09300863742828369, + "learning_rate": 6.85168204400284e-05, + "loss": 0.009277057647705079, + "step": 221890 + }, + { + "epoch": 31.497515968772177, + "grad_norm": 0.024898679926991463, + "learning_rate": 6.85154009936125e-05, + "loss": 0.06010374426841736, + "step": 221900 + }, + { + "epoch": 31.498935415188075, + "grad_norm": 1.030182957649231, + "learning_rate": 6.851398154719659e-05, + "loss": 0.011904476583003998, + "step": 221910 + }, + { + "epoch": 31.500354861603974, + "grad_norm": 19.60990333557129, + "learning_rate": 6.85125621007807e-05, + "loss": 0.039625927805900574, + "step": 221920 + }, + { + "epoch": 31.501774308019872, + "grad_norm": 
3.20418381690979, + "learning_rate": 6.85111426543648e-05, + "loss": 0.009254249185323716, + "step": 221930 + }, + { + "epoch": 31.50319375443577, + "grad_norm": 0.17159149050712585, + "learning_rate": 6.850972320794891e-05, + "loss": 0.02445649206638336, + "step": 221940 + }, + { + "epoch": 31.50461320085167, + "grad_norm": 0.9886060357093811, + "learning_rate": 6.8508303761533e-05, + "loss": 0.03058106601238251, + "step": 221950 + }, + { + "epoch": 31.506032647267567, + "grad_norm": 2.1166832447052, + "learning_rate": 6.85068843151171e-05, + "loss": 0.0037167970091104507, + "step": 221960 + }, + { + "epoch": 31.50745209368346, + "grad_norm": 9.927234649658203, + "learning_rate": 6.850546486870121e-05, + "loss": 0.026875889301300047, + "step": 221970 + }, + { + "epoch": 31.50887154009936, + "grad_norm": 0.03339625149965286, + "learning_rate": 6.850404542228531e-05, + "loss": 0.029787799715995787, + "step": 221980 + }, + { + "epoch": 31.51029098651526, + "grad_norm": 0.6644856333732605, + "learning_rate": 6.850262597586942e-05, + "loss": 0.04896318316459656, + "step": 221990 + }, + { + "epoch": 31.511710432931157, + "grad_norm": 0.28756093978881836, + "learning_rate": 6.850120652945352e-05, + "loss": 0.019694322347640993, + "step": 222000 + }, + { + "epoch": 31.511710432931157, + "eval_accuracy": 0.988046035480384, + "eval_loss": 0.04483793303370476, + "eval_runtime": 34.3123, + "eval_samples_per_second": 458.349, + "eval_steps_per_second": 14.339, + "step": 222000 + }, + { + "epoch": 31.513129879347055, + "grad_norm": 0.08846868574619293, + "learning_rate": 6.849978708303762e-05, + "loss": 0.04807647466659546, + "step": 222010 + }, + { + "epoch": 31.514549325762953, + "grad_norm": 0.09505094587802887, + "learning_rate": 6.849836763662171e-05, + "loss": 0.009834969043731689, + "step": 222020 + }, + { + "epoch": 31.51596877217885, + "grad_norm": 0.8123778700828552, + "learning_rate": 6.849694819020583e-05, + "loss": 0.026328179240226745, + "step": 222030 + }, + { + 
"epoch": 31.517388218594746, + "grad_norm": 2.683861494064331, + "learning_rate": 6.849552874378992e-05, + "loss": 0.030101868510246276, + "step": 222040 + }, + { + "epoch": 31.518807665010645, + "grad_norm": 0.3452575206756592, + "learning_rate": 6.849410929737403e-05, + "loss": 0.03542779684066773, + "step": 222050 + }, + { + "epoch": 31.520227111426543, + "grad_norm": 0.05482000485062599, + "learning_rate": 6.849268985095813e-05, + "loss": 0.023825015127658843, + "step": 222060 + }, + { + "epoch": 31.52164655784244, + "grad_norm": 0.01716591604053974, + "learning_rate": 6.849127040454223e-05, + "loss": 0.033162495493888854, + "step": 222070 + }, + { + "epoch": 31.52306600425834, + "grad_norm": 0.14592626690864563, + "learning_rate": 6.848985095812634e-05, + "loss": 0.0008876163512468338, + "step": 222080 + }, + { + "epoch": 31.524485450674238, + "grad_norm": 0.13217805325984955, + "learning_rate": 6.848843151171044e-05, + "loss": 0.01816250681877136, + "step": 222090 + }, + { + "epoch": 31.525904897090136, + "grad_norm": 0.006954367738217115, + "learning_rate": 6.848701206529455e-05, + "loss": 0.003739676997065544, + "step": 222100 + }, + { + "epoch": 31.52732434350603, + "grad_norm": 0.12673215568065643, + "learning_rate": 6.848559261887863e-05, + "loss": 0.013144095242023469, + "step": 222110 + }, + { + "epoch": 31.52874378992193, + "grad_norm": 0.015528416261076927, + "learning_rate": 6.848417317246274e-05, + "loss": 0.008012336492538453, + "step": 222120 + }, + { + "epoch": 31.530163236337827, + "grad_norm": 0.026655280962586403, + "learning_rate": 6.848275372604684e-05, + "loss": 0.008508038520812989, + "step": 222130 + }, + { + "epoch": 31.531582682753726, + "grad_norm": 0.08345367014408112, + "learning_rate": 6.848133427963095e-05, + "loss": 0.02855041027069092, + "step": 222140 + }, + { + "epoch": 31.533002129169624, + "grad_norm": 7.56699275970459, + "learning_rate": 6.847991483321505e-05, + "loss": 0.01373351365327835, + "step": 222150 + }, + { + 
"epoch": 31.534421575585522, + "grad_norm": 0.32072383165359497, + "learning_rate": 6.847849538679915e-05, + "loss": 0.0011138606816530228, + "step": 222160 + }, + { + "epoch": 31.53584102200142, + "grad_norm": 0.04613662511110306, + "learning_rate": 6.847707594038326e-05, + "loss": 0.0203217476606369, + "step": 222170 + }, + { + "epoch": 31.537260468417315, + "grad_norm": 3.241795778274536, + "learning_rate": 6.847565649396735e-05, + "loss": 0.0043614178895950316, + "step": 222180 + }, + { + "epoch": 31.538679914833214, + "grad_norm": 0.22630196809768677, + "learning_rate": 6.847423704755146e-05, + "loss": 0.0031866539269685746, + "step": 222190 + }, + { + "epoch": 31.540099361249112, + "grad_norm": 1.5139012336730957, + "learning_rate": 6.847281760113556e-05, + "loss": 0.0029083069413900374, + "step": 222200 + }, + { + "epoch": 31.54151880766501, + "grad_norm": 0.10423379391431808, + "learning_rate": 6.847139815471967e-05, + "loss": 0.012162964046001434, + "step": 222210 + }, + { + "epoch": 31.54293825408091, + "grad_norm": 2.507601737976074, + "learning_rate": 6.846997870830376e-05, + "loss": 0.002100534364581108, + "step": 222220 + }, + { + "epoch": 31.544357700496807, + "grad_norm": 2.15395450592041, + "learning_rate": 6.846855926188787e-05, + "loss": 0.018679863214492796, + "step": 222230 + }, + { + "epoch": 31.545777146912705, + "grad_norm": 0.3214964270591736, + "learning_rate": 6.846713981547196e-05, + "loss": 0.005801632255315781, + "step": 222240 + }, + { + "epoch": 31.5471965933286, + "grad_norm": 1.3370298147201538, + "learning_rate": 6.846572036905608e-05, + "loss": 0.003627047687768936, + "step": 222250 + }, + { + "epoch": 31.5486160397445, + "grad_norm": 0.011807695962488651, + "learning_rate": 6.846430092264017e-05, + "loss": 0.01107315719127655, + "step": 222260 + }, + { + "epoch": 31.550035486160397, + "grad_norm": 9.41503620147705, + "learning_rate": 6.846288147622427e-05, + "loss": 0.012172196060419083, + "step": 222270 + }, + { + "epoch": 
31.551454932576295, + "grad_norm": 0.2691578269004822, + "learning_rate": 6.846146202980838e-05, + "loss": 0.010789214819669723, + "step": 222280 + }, + { + "epoch": 31.552874378992193, + "grad_norm": 0.09895250201225281, + "learning_rate": 6.846004258339248e-05, + "loss": 0.010325319319963455, + "step": 222290 + }, + { + "epoch": 31.55429382540809, + "grad_norm": 3.285295248031616, + "learning_rate": 6.845862313697659e-05, + "loss": 0.04795163571834564, + "step": 222300 + }, + { + "epoch": 31.55571327182399, + "grad_norm": 0.13277719914913177, + "learning_rate": 6.845720369056069e-05, + "loss": 0.0369661271572113, + "step": 222310 + }, + { + "epoch": 31.557132718239885, + "grad_norm": 0.008112408220767975, + "learning_rate": 6.845578424414478e-05, + "loss": 0.0033081788569688797, + "step": 222320 + }, + { + "epoch": 31.558552164655783, + "grad_norm": 0.003341653849929571, + "learning_rate": 6.845436479772888e-05, + "loss": 0.03091157376766205, + "step": 222330 + }, + { + "epoch": 31.55997161107168, + "grad_norm": 0.21124424040317535, + "learning_rate": 6.845294535131299e-05, + "loss": 0.006407482922077179, + "step": 222340 + }, + { + "epoch": 31.56139105748758, + "grad_norm": 1.297658920288086, + "learning_rate": 6.845152590489709e-05, + "loss": 0.003619643673300743, + "step": 222350 + }, + { + "epoch": 31.562810503903478, + "grad_norm": 0.033972110599279404, + "learning_rate": 6.84501064584812e-05, + "loss": 0.028224876523017882, + "step": 222360 + }, + { + "epoch": 31.564229950319376, + "grad_norm": 0.023184245452284813, + "learning_rate": 6.84486870120653e-05, + "loss": 0.0039539124816656114, + "step": 222370 + }, + { + "epoch": 31.565649396735274, + "grad_norm": 3.583712339401245, + "learning_rate": 6.84472675656494e-05, + "loss": 0.03639311194419861, + "step": 222380 + }, + { + "epoch": 31.56706884315117, + "grad_norm": 0.10429982841014862, + "learning_rate": 6.84458481192335e-05, + "loss": 0.0022097595036029817, + "step": 222390 + }, + { + "epoch": 
31.568488289567068, + "grad_norm": 0.2767055928707123, + "learning_rate": 6.84444286728176e-05, + "loss": 0.031649190187454226, + "step": 222400 + }, + { + "epoch": 31.569907735982966, + "grad_norm": 0.05100799351930618, + "learning_rate": 6.844300922640172e-05, + "loss": 0.004150751978158951, + "step": 222410 + }, + { + "epoch": 31.571327182398864, + "grad_norm": 4.214309215545654, + "learning_rate": 6.84415897799858e-05, + "loss": 0.027370449900627137, + "step": 222420 + }, + { + "epoch": 31.572746628814762, + "grad_norm": 0.03972742334008217, + "learning_rate": 6.844017033356991e-05, + "loss": 0.010953378677368165, + "step": 222430 + }, + { + "epoch": 31.57416607523066, + "grad_norm": 0.057993050664663315, + "learning_rate": 6.843875088715401e-05, + "loss": 0.007481519877910614, + "step": 222440 + }, + { + "epoch": 31.57558552164656, + "grad_norm": 0.2307097464799881, + "learning_rate": 6.843733144073812e-05, + "loss": 0.0012964900583028794, + "step": 222450 + }, + { + "epoch": 31.577004968062454, + "grad_norm": 0.026052605360746384, + "learning_rate": 6.843591199432222e-05, + "loss": 0.007038526982069015, + "step": 222460 + }, + { + "epoch": 31.578424414478352, + "grad_norm": 0.07118277251720428, + "learning_rate": 6.843449254790631e-05, + "loss": 0.01919879615306854, + "step": 222470 + }, + { + "epoch": 31.57984386089425, + "grad_norm": 0.3343837559223175, + "learning_rate": 6.843307310149042e-05, + "loss": 0.02765730917453766, + "step": 222480 + }, + { + "epoch": 31.58126330731015, + "grad_norm": 18.943763732910156, + "learning_rate": 6.843165365507452e-05, + "loss": 0.029001539945602416, + "step": 222490 + }, + { + "epoch": 31.582682753726047, + "grad_norm": 0.03788690268993378, + "learning_rate": 6.843023420865863e-05, + "loss": 0.034218376874923705, + "step": 222500 + }, + { + "epoch": 31.582682753726047, + "eval_accuracy": 0.9846760348445349, + "eval_loss": 0.07114436477422714, + "eval_runtime": 35.1744, + "eval_samples_per_second": 447.115, + 
"eval_steps_per_second": 13.987, + "step": 222500 + }, + { + "epoch": 31.584102200141945, + "grad_norm": 0.044092804193496704, + "learning_rate": 6.842881476224273e-05, + "loss": 0.005282947421073913, + "step": 222510 + }, + { + "epoch": 31.585521646557844, + "grad_norm": 1.485288143157959, + "learning_rate": 6.842739531582683e-05, + "loss": 0.00314519964158535, + "step": 222520 + }, + { + "epoch": 31.58694109297374, + "grad_norm": 0.03009200468659401, + "learning_rate": 6.842597586941092e-05, + "loss": 0.038224583864212035, + "step": 222530 + }, + { + "epoch": 31.588360539389637, + "grad_norm": 0.022409336641430855, + "learning_rate": 6.842455642299504e-05, + "loss": 0.024249163269996644, + "step": 222540 + }, + { + "epoch": 31.589779985805535, + "grad_norm": 0.2030828446149826, + "learning_rate": 6.842313697657913e-05, + "loss": 0.03408604562282562, + "step": 222550 + }, + { + "epoch": 31.591199432221433, + "grad_norm": 1.8709450960159302, + "learning_rate": 6.842171753016324e-05, + "loss": 0.021239930391311647, + "step": 222560 + }, + { + "epoch": 31.59261887863733, + "grad_norm": 0.11051110178232193, + "learning_rate": 6.842029808374734e-05, + "loss": 0.022453394532203675, + "step": 222570 + }, + { + "epoch": 31.59403832505323, + "grad_norm": 0.011826016008853912, + "learning_rate": 6.841887863733144e-05, + "loss": 0.002120371162891388, + "step": 222580 + }, + { + "epoch": 31.59545777146913, + "grad_norm": 7.6935648918151855, + "learning_rate": 6.841745919091555e-05, + "loss": 0.010693655908107757, + "step": 222590 + }, + { + "epoch": 31.596877217885023, + "grad_norm": 0.08129948377609253, + "learning_rate": 6.841603974449965e-05, + "loss": 0.03820379078388214, + "step": 222600 + }, + { + "epoch": 31.59829666430092, + "grad_norm": 7.014500617980957, + "learning_rate": 6.841462029808376e-05, + "loss": 0.03264212608337402, + "step": 222610 + }, + { + "epoch": 31.59971611071682, + "grad_norm": 0.2476232945919037, + "learning_rate": 6.841320085166785e-05, + "loss": 
0.03118674159049988, + "step": 222620 + }, + { + "epoch": 31.601135557132718, + "grad_norm": 0.012365617789328098, + "learning_rate": 6.841178140525195e-05, + "loss": 0.007416977733373642, + "step": 222630 + }, + { + "epoch": 31.602555003548616, + "grad_norm": 0.15671426057815552, + "learning_rate": 6.841036195883605e-05, + "loss": 0.002590512111783028, + "step": 222640 + }, + { + "epoch": 31.603974449964515, + "grad_norm": 0.04080082103610039, + "learning_rate": 6.840894251242016e-05, + "loss": 0.038687902688980105, + "step": 222650 + }, + { + "epoch": 31.605393896380413, + "grad_norm": 0.13197286427021027, + "learning_rate": 6.840752306600426e-05, + "loss": 0.003405524045228958, + "step": 222660 + }, + { + "epoch": 31.606813342796308, + "grad_norm": 2.9949944019317627, + "learning_rate": 6.840610361958837e-05, + "loss": 0.0029617238789796827, + "step": 222670 + }, + { + "epoch": 31.608232789212206, + "grad_norm": 0.033075977116823196, + "learning_rate": 6.840468417317247e-05, + "loss": 0.0022783927619457246, + "step": 222680 + }, + { + "epoch": 31.609652235628104, + "grad_norm": 0.08474726229906082, + "learning_rate": 6.840326472675656e-05, + "loss": 0.015998953580856325, + "step": 222690 + }, + { + "epoch": 31.611071682044003, + "grad_norm": 0.23356769979000092, + "learning_rate": 6.840184528034067e-05, + "loss": 0.0029844820499420168, + "step": 222700 + }, + { + "epoch": 31.6124911284599, + "grad_norm": 0.06870406866073608, + "learning_rate": 6.840042583392477e-05, + "loss": 0.009503785520792007, + "step": 222710 + }, + { + "epoch": 31.6139105748758, + "grad_norm": 0.13064171373844147, + "learning_rate": 6.839900638750888e-05, + "loss": 0.02331961989402771, + "step": 222720 + }, + { + "epoch": 31.615330021291697, + "grad_norm": 0.0388622023165226, + "learning_rate": 6.839758694109297e-05, + "loss": 0.019750523567199706, + "step": 222730 + }, + { + "epoch": 31.616749467707596, + "grad_norm": 0.006855187006294727, + "learning_rate": 6.839616749467708e-05, + 
"loss": 0.003961005061864853, + "step": 222740 + }, + { + "epoch": 31.61816891412349, + "grad_norm": 0.04754229635000229, + "learning_rate": 6.839474804826117e-05, + "loss": 0.025153601169586183, + "step": 222750 + }, + { + "epoch": 31.61958836053939, + "grad_norm": 11.83218002319336, + "learning_rate": 6.839332860184529e-05, + "loss": 0.010895866155624389, + "step": 222760 + }, + { + "epoch": 31.621007806955287, + "grad_norm": 0.7706044316291809, + "learning_rate": 6.83919091554294e-05, + "loss": 0.012851741909980775, + "step": 222770 + }, + { + "epoch": 31.622427253371185, + "grad_norm": 0.014592118561267853, + "learning_rate": 6.839048970901348e-05, + "loss": 0.025425416231155396, + "step": 222780 + }, + { + "epoch": 31.623846699787084, + "grad_norm": 0.04885948821902275, + "learning_rate": 6.838907026259759e-05, + "loss": 0.0023203358054161073, + "step": 222790 + }, + { + "epoch": 31.625266146202982, + "grad_norm": 0.016803158447146416, + "learning_rate": 6.838765081618169e-05, + "loss": 0.006234152615070343, + "step": 222800 + }, + { + "epoch": 31.62668559261888, + "grad_norm": 2.1404531002044678, + "learning_rate": 6.83862313697658e-05, + "loss": 0.0017139039933681487, + "step": 222810 + }, + { + "epoch": 31.628105039034775, + "grad_norm": 0.00566613906994462, + "learning_rate": 6.83848119233499e-05, + "loss": 0.001807573065161705, + "step": 222820 + }, + { + "epoch": 31.629524485450673, + "grad_norm": 0.048521798104047775, + "learning_rate": 6.8383392476934e-05, + "loss": 0.0009917344897985459, + "step": 222830 + }, + { + "epoch": 31.63094393186657, + "grad_norm": 0.009265604428946972, + "learning_rate": 6.838197303051809e-05, + "loss": 0.006887301802635193, + "step": 222840 + }, + { + "epoch": 31.63236337828247, + "grad_norm": 11.800583839416504, + "learning_rate": 6.83805535841022e-05, + "loss": 0.009177829325199127, + "step": 222850 + }, + { + "epoch": 31.63378282469837, + "grad_norm": 0.4080844819545746, + "learning_rate": 6.837913413768631e-05, + 
"loss": 0.007553595304489136, + "step": 222860 + }, + { + "epoch": 31.635202271114267, + "grad_norm": 0.0018285185797140002, + "learning_rate": 6.8377856635912e-05, + "loss": 0.01779095232486725, + "step": 222870 + }, + { + "epoch": 31.636621717530165, + "grad_norm": 0.03449669107794762, + "learning_rate": 6.83764371894961e-05, + "loss": 0.018007725477218628, + "step": 222880 + }, + { + "epoch": 31.63804116394606, + "grad_norm": 0.24209725856781006, + "learning_rate": 6.837501774308021e-05, + "loss": 0.0215227872133255, + "step": 222890 + }, + { + "epoch": 31.639460610361958, + "grad_norm": 2.468851327896118, + "learning_rate": 6.83735982966643e-05, + "loss": 0.0029796525835990907, + "step": 222900 + }, + { + "epoch": 31.640880056777856, + "grad_norm": 16.67860984802246, + "learning_rate": 6.83721788502484e-05, + "loss": 0.02167486697435379, + "step": 222910 + }, + { + "epoch": 31.642299503193755, + "grad_norm": 0.2607094645500183, + "learning_rate": 6.837075940383251e-05, + "loss": 0.0017653383314609528, + "step": 222920 + }, + { + "epoch": 31.643718949609653, + "grad_norm": 2.1198508739471436, + "learning_rate": 6.836933995741661e-05, + "loss": 0.0024989504367113114, + "step": 222930 + }, + { + "epoch": 31.64513839602555, + "grad_norm": 0.02861275151371956, + "learning_rate": 6.836792051100072e-05, + "loss": 0.004880928993225097, + "step": 222940 + }, + { + "epoch": 31.64655784244145, + "grad_norm": 0.12024761736392975, + "learning_rate": 6.836650106458482e-05, + "loss": 0.014322616159915924, + "step": 222950 + }, + { + "epoch": 31.647977288857344, + "grad_norm": 0.035328369587659836, + "learning_rate": 6.836508161816892e-05, + "loss": 0.01396665871143341, + "step": 222960 + }, + { + "epoch": 31.649396735273243, + "grad_norm": 0.4697469174861908, + "learning_rate": 6.836366217175301e-05, + "loss": 0.006544413417577744, + "step": 222970 + }, + { + "epoch": 31.65081618168914, + "grad_norm": 0.018260298296809196, + "learning_rate": 6.836224272533712e-05, + "loss": 
0.00144447460770607, + "step": 222980 + }, + { + "epoch": 31.65223562810504, + "grad_norm": 0.011228426359593868, + "learning_rate": 6.836082327892122e-05, + "loss": 0.0008092857897281647, + "step": 222990 + }, + { + "epoch": 31.653655074520938, + "grad_norm": 0.06900524348020554, + "learning_rate": 6.835940383250533e-05, + "loss": 0.013278785347938537, + "step": 223000 + }, + { + "epoch": 31.653655074520938, + "eval_accuracy": 0.9907166020220004, + "eval_loss": 0.03621116280555725, + "eval_runtime": 33.9505, + "eval_samples_per_second": 463.233, + "eval_steps_per_second": 14.492, + "step": 223000 + }, + { + "epoch": 31.655074520936836, + "grad_norm": 3.3719635009765625, + "learning_rate": 6.835798438608943e-05, + "loss": 0.007353644818067551, + "step": 223010 + }, + { + "epoch": 31.656493967352734, + "grad_norm": 0.4968966543674469, + "learning_rate": 6.835656493967353e-05, + "loss": 0.002920231968164444, + "step": 223020 + }, + { + "epoch": 31.65791341376863, + "grad_norm": 4.3725504875183105, + "learning_rate": 6.835514549325764e-05, + "loss": 0.00894884616136551, + "step": 223030 + }, + { + "epoch": 31.659332860184527, + "grad_norm": 0.006588222458958626, + "learning_rate": 6.835372604684174e-05, + "loss": 0.0017477348446846008, + "step": 223040 + }, + { + "epoch": 31.660752306600425, + "grad_norm": 0.010969736613333225, + "learning_rate": 6.835230660042585e-05, + "loss": 0.014901578426361084, + "step": 223050 + }, + { + "epoch": 31.662171753016324, + "grad_norm": 0.000990204862318933, + "learning_rate": 6.835088715400993e-05, + "loss": 0.008131691813468933, + "step": 223060 + }, + { + "epoch": 31.663591199432222, + "grad_norm": 0.05719216912984848, + "learning_rate": 6.834946770759404e-05, + "loss": 0.004780464991927147, + "step": 223070 + }, + { + "epoch": 31.66501064584812, + "grad_norm": 1.2438042163848877, + "learning_rate": 6.834804826117814e-05, + "loss": 0.01676145792007446, + "step": 223080 + }, + { + "epoch": 31.66643009226402, + "grad_norm": 
0.03246510028839111, + "learning_rate": 6.834662881476225e-05, + "loss": 0.001724611222743988, + "step": 223090 + }, + { + "epoch": 31.667849538679913, + "grad_norm": 0.011804759502410889, + "learning_rate": 6.834520936834635e-05, + "loss": 0.0006045691668987274, + "step": 223100 + }, + { + "epoch": 31.669268985095812, + "grad_norm": 0.9982975125312805, + "learning_rate": 6.834378992193044e-05, + "loss": 0.02604215741157532, + "step": 223110 + }, + { + "epoch": 31.67068843151171, + "grad_norm": 0.0069404779933393, + "learning_rate": 6.834237047551456e-05, + "loss": 0.0002958387136459351, + "step": 223120 + }, + { + "epoch": 31.67210787792761, + "grad_norm": 2.2386491298675537, + "learning_rate": 6.834095102909865e-05, + "loss": 0.00380823016166687, + "step": 223130 + }, + { + "epoch": 31.673527324343507, + "grad_norm": 0.34998929500579834, + "learning_rate": 6.833953158268276e-05, + "loss": 0.02500310242176056, + "step": 223140 + }, + { + "epoch": 31.674946770759405, + "grad_norm": 1.5214958190917969, + "learning_rate": 6.833811213626686e-05, + "loss": 0.001595694199204445, + "step": 223150 + }, + { + "epoch": 31.676366217175303, + "grad_norm": 0.10733998566865921, + "learning_rate": 6.833669268985096e-05, + "loss": 0.006046069785952568, + "step": 223160 + }, + { + "epoch": 31.677785663591198, + "grad_norm": 0.14332182705402374, + "learning_rate": 6.833527324343506e-05, + "loss": 0.01280159205198288, + "step": 223170 + }, + { + "epoch": 31.679205110007096, + "grad_norm": 6.025905132293701, + "learning_rate": 6.833385379701917e-05, + "loss": 0.004153395816683769, + "step": 223180 + }, + { + "epoch": 31.680624556422995, + "grad_norm": 7.804877758026123, + "learning_rate": 6.833243435060326e-05, + "loss": 0.014350013434886932, + "step": 223190 + }, + { + "epoch": 31.682044002838893, + "grad_norm": 0.45640119910240173, + "learning_rate": 6.833101490418737e-05, + "loss": 0.006997407972812652, + "step": 223200 + }, + { + "epoch": 31.68346344925479, + "grad_norm": 
0.5180522203445435, + "learning_rate": 6.832959545777147e-05, + "loss": 0.0015610262751579284, + "step": 223210 + }, + { + "epoch": 31.68488289567069, + "grad_norm": 0.0351821668446064, + "learning_rate": 6.832817601135557e-05, + "loss": 0.03155014216899872, + "step": 223220 + }, + { + "epoch": 31.686302342086588, + "grad_norm": 0.05952237918972969, + "learning_rate": 6.832675656493968e-05, + "loss": 0.011961592733860016, + "step": 223230 + }, + { + "epoch": 31.687721788502483, + "grad_norm": 0.14668956398963928, + "learning_rate": 6.832533711852378e-05, + "loss": 0.008540813624858857, + "step": 223240 + }, + { + "epoch": 31.68914123491838, + "grad_norm": 0.04025439918041229, + "learning_rate": 6.832391767210789e-05, + "loss": 0.007485276460647583, + "step": 223250 + }, + { + "epoch": 31.69056068133428, + "grad_norm": 0.10885985940694809, + "learning_rate": 6.832249822569199e-05, + "loss": 0.011143691837787628, + "step": 223260 + }, + { + "epoch": 31.691980127750178, + "grad_norm": 0.06652465462684631, + "learning_rate": 6.832107877927608e-05, + "loss": 0.0006835099309682846, + "step": 223270 + }, + { + "epoch": 31.693399574166076, + "grad_norm": 4.017155170440674, + "learning_rate": 6.831965933286018e-05, + "loss": 0.044855788350105286, + "step": 223280 + }, + { + "epoch": 31.694819020581974, + "grad_norm": 0.025939736515283585, + "learning_rate": 6.831823988644429e-05, + "loss": 0.02786514163017273, + "step": 223290 + }, + { + "epoch": 31.696238466997873, + "grad_norm": 0.6671558022499084, + "learning_rate": 6.831682044002839e-05, + "loss": 0.03611772656440735, + "step": 223300 + }, + { + "epoch": 31.697657913413767, + "grad_norm": 0.08376771956682205, + "learning_rate": 6.83154009936125e-05, + "loss": 0.01585201770067215, + "step": 223310 + }, + { + "epoch": 31.699077359829666, + "grad_norm": 9.572676658630371, + "learning_rate": 6.83139815471966e-05, + "loss": 0.024765777587890624, + "step": 223320 + }, + { + "epoch": 31.700496806245564, + "grad_norm": 
1.0281789302825928, + "learning_rate": 6.83125621007807e-05, + "loss": 0.005622114986181259, + "step": 223330 + }, + { + "epoch": 31.701916252661462, + "grad_norm": 0.3400513827800751, + "learning_rate": 6.83111426543648e-05, + "loss": 0.0009052693843841553, + "step": 223340 + }, + { + "epoch": 31.70333569907736, + "grad_norm": 0.05599188432097435, + "learning_rate": 6.83097232079489e-05, + "loss": 0.004798116162419319, + "step": 223350 + }, + { + "epoch": 31.70475514549326, + "grad_norm": 10.646720886230469, + "learning_rate": 6.830830376153301e-05, + "loss": 0.007104592025279998, + "step": 223360 + }, + { + "epoch": 31.706174591909157, + "grad_norm": 1.0445104837417603, + "learning_rate": 6.83068843151171e-05, + "loss": 0.010865803062915801, + "step": 223370 + }, + { + "epoch": 31.707594038325052, + "grad_norm": 0.07234109938144684, + "learning_rate": 6.830546486870121e-05, + "loss": 0.024058878421783447, + "step": 223380 + }, + { + "epoch": 31.70901348474095, + "grad_norm": 1.9220213890075684, + "learning_rate": 6.83040454222853e-05, + "loss": 0.012638336420059204, + "step": 223390 + }, + { + "epoch": 31.71043293115685, + "grad_norm": 0.3439975380897522, + "learning_rate": 6.830262597586942e-05, + "loss": 0.039442673325538635, + "step": 223400 + }, + { + "epoch": 31.711852377572747, + "grad_norm": 0.45828667283058167, + "learning_rate": 6.830120652945351e-05, + "loss": 0.0049579482525587085, + "step": 223410 + }, + { + "epoch": 31.713271823988645, + "grad_norm": 0.0043206303380429745, + "learning_rate": 6.829978708303761e-05, + "loss": 0.003312531113624573, + "step": 223420 + }, + { + "epoch": 31.714691270404543, + "grad_norm": 0.031436946243047714, + "learning_rate": 6.829836763662172e-05, + "loss": 0.01701676845550537, + "step": 223430 + }, + { + "epoch": 31.71611071682044, + "grad_norm": 0.9944525361061096, + "learning_rate": 6.829694819020582e-05, + "loss": 0.00414823517203331, + "step": 223440 + }, + { + "epoch": 31.717530163236336, + "grad_norm": 
0.019600097090005875, + "learning_rate": 6.829552874378993e-05, + "loss": 0.03885144591331482, + "step": 223450 + }, + { + "epoch": 31.718949609652235, + "grad_norm": 1.0832105875015259, + "learning_rate": 6.829410929737403e-05, + "loss": 0.035064446926116946, + "step": 223460 + }, + { + "epoch": 31.720369056068133, + "grad_norm": 0.3725796937942505, + "learning_rate": 6.829268985095813e-05, + "loss": 0.01143563985824585, + "step": 223470 + }, + { + "epoch": 31.72178850248403, + "grad_norm": 1.7360414266586304, + "learning_rate": 6.829127040454222e-05, + "loss": 0.025855940580368043, + "step": 223480 + }, + { + "epoch": 31.72320794889993, + "grad_norm": 1.0071865320205688, + "learning_rate": 6.828985095812633e-05, + "loss": 0.015663693845272064, + "step": 223490 + }, + { + "epoch": 31.724627395315828, + "grad_norm": 0.07008396089076996, + "learning_rate": 6.828843151171043e-05, + "loss": 0.03633090853691101, + "step": 223500 + }, + { + "epoch": 31.724627395315828, + "eval_accuracy": 0.9860749030330006, + "eval_loss": 0.06570654362440109, + "eval_runtime": 33.621, + "eval_samples_per_second": 467.773, + "eval_steps_per_second": 14.634, + "step": 223500 + }, + { + "epoch": 31.726046841731726, + "grad_norm": 0.013601127080619335, + "learning_rate": 6.828701206529454e-05, + "loss": 0.017069250345230103, + "step": 223510 + }, + { + "epoch": 31.72746628814762, + "grad_norm": 0.3130601644515991, + "learning_rate": 6.828559261887864e-05, + "loss": 0.0051381587982177734, + "step": 223520 + }, + { + "epoch": 31.72888573456352, + "grad_norm": 0.9860876798629761, + "learning_rate": 6.828417317246274e-05, + "loss": 0.011081652343273162, + "step": 223530 + }, + { + "epoch": 31.730305180979418, + "grad_norm": 0.09381795674562454, + "learning_rate": 6.828275372604685e-05, + "loss": 0.02100779414176941, + "step": 223540 + }, + { + "epoch": 31.731724627395316, + "grad_norm": 0.35021936893463135, + "learning_rate": 6.828133427963095e-05, + "loss": 0.011875247955322266, + "step": 
223550 + }, + { + "epoch": 31.733144073811214, + "grad_norm": 0.05887637659907341, + "learning_rate": 6.827991483321506e-05, + "loss": 0.005282459035515785, + "step": 223560 + }, + { + "epoch": 31.734563520227113, + "grad_norm": 1.3365617990493774, + "learning_rate": 6.827849538679914e-05, + "loss": 0.0017488140612840653, + "step": 223570 + }, + { + "epoch": 31.73598296664301, + "grad_norm": 0.011204976588487625, + "learning_rate": 6.827707594038325e-05, + "loss": 0.0016194436699151992, + "step": 223580 + }, + { + "epoch": 31.737402413058906, + "grad_norm": 0.10904113203287125, + "learning_rate": 6.827565649396735e-05, + "loss": 0.009504275023937225, + "step": 223590 + }, + { + "epoch": 31.738821859474804, + "grad_norm": 0.04531412199139595, + "learning_rate": 6.827423704755146e-05, + "loss": 0.013770096004009247, + "step": 223600 + }, + { + "epoch": 31.740241305890702, + "grad_norm": 10.987714767456055, + "learning_rate": 6.827281760113556e-05, + "loss": 0.009967963397502898, + "step": 223610 + }, + { + "epoch": 31.7416607523066, + "grad_norm": 0.43902814388275146, + "learning_rate": 6.827139815471967e-05, + "loss": 0.049802336096763614, + "step": 223620 + }, + { + "epoch": 31.7430801987225, + "grad_norm": 0.11195246130228043, + "learning_rate": 6.826997870830377e-05, + "loss": 0.004388966783881188, + "step": 223630 + }, + { + "epoch": 31.744499645138397, + "grad_norm": 0.0074621038511395454, + "learning_rate": 6.826855926188786e-05, + "loss": 0.019420751929283143, + "step": 223640 + }, + { + "epoch": 31.745919091554295, + "grad_norm": 3.8930721282958984, + "learning_rate": 6.826713981547197e-05, + "loss": 0.026943275332450868, + "step": 223650 + }, + { + "epoch": 31.74733853797019, + "grad_norm": 0.2144065499305725, + "learning_rate": 6.826572036905607e-05, + "loss": 0.03113187551498413, + "step": 223660 + }, + { + "epoch": 31.74875798438609, + "grad_norm": 0.0722707062959671, + "learning_rate": 6.826430092264018e-05, + "loss": 0.002054213359951973, + "step": 
223670 + }, + { + "epoch": 31.750177430801987, + "grad_norm": 0.5650181770324707, + "learning_rate": 6.826288147622427e-05, + "loss": 0.010818377137184143, + "step": 223680 + }, + { + "epoch": 31.751596877217885, + "grad_norm": 0.18305693566799164, + "learning_rate": 6.826146202980838e-05, + "loss": 0.04246515929698944, + "step": 223690 + }, + { + "epoch": 31.753016323633783, + "grad_norm": 0.029111234471201897, + "learning_rate": 6.826004258339247e-05, + "loss": 0.02686658799648285, + "step": 223700 + }, + { + "epoch": 31.75443577004968, + "grad_norm": 11.997270584106445, + "learning_rate": 6.825862313697658e-05, + "loss": 0.018415844440460204, + "step": 223710 + }, + { + "epoch": 31.75585521646558, + "grad_norm": 0.09546487778425217, + "learning_rate": 6.82572036905607e-05, + "loss": 0.024895817041397095, + "step": 223720 + }, + { + "epoch": 31.757274662881475, + "grad_norm": 0.08950861543416977, + "learning_rate": 6.825578424414478e-05, + "loss": 0.010691942274570465, + "step": 223730 + }, + { + "epoch": 31.758694109297373, + "grad_norm": 3.1020469665527344, + "learning_rate": 6.825436479772889e-05, + "loss": 0.007276716828346253, + "step": 223740 + }, + { + "epoch": 31.76011355571327, + "grad_norm": 0.02223939821124077, + "learning_rate": 6.825294535131299e-05, + "loss": 0.004922940582036972, + "step": 223750 + }, + { + "epoch": 31.76153300212917, + "grad_norm": 0.011609531939029694, + "learning_rate": 6.82515259048971e-05, + "loss": 0.006637999415397644, + "step": 223760 + }, + { + "epoch": 31.762952448545068, + "grad_norm": 0.005347327794879675, + "learning_rate": 6.82501064584812e-05, + "loss": 0.01080808937549591, + "step": 223770 + }, + { + "epoch": 31.764371894960966, + "grad_norm": 2.2167298793792725, + "learning_rate": 6.82486870120653e-05, + "loss": 0.0009717993438243866, + "step": 223780 + }, + { + "epoch": 31.765791341376865, + "grad_norm": 0.052583761513233185, + "learning_rate": 6.824726756564939e-05, + "loss": 0.0039213914424180984, + "step": 
223790 + }, + { + "epoch": 31.76721078779276, + "grad_norm": 0.030414456501603127, + "learning_rate": 6.82458481192335e-05, + "loss": 0.023243750631809234, + "step": 223800 + }, + { + "epoch": 31.768630234208658, + "grad_norm": 0.528081476688385, + "learning_rate": 6.824442867281761e-05, + "loss": 0.01606452763080597, + "step": 223810 + }, + { + "epoch": 31.770049680624556, + "grad_norm": 0.1785333752632141, + "learning_rate": 6.824300922640171e-05, + "loss": 0.002950696274638176, + "step": 223820 + }, + { + "epoch": 31.771469127040454, + "grad_norm": 0.12994703650474548, + "learning_rate": 6.824158977998581e-05, + "loss": 0.011146466434001922, + "step": 223830 + }, + { + "epoch": 31.772888573456353, + "grad_norm": 6.4452691078186035, + "learning_rate": 6.82401703335699e-05, + "loss": 0.011087252199649811, + "step": 223840 + }, + { + "epoch": 31.77430801987225, + "grad_norm": 1.9199570417404175, + "learning_rate": 6.823875088715402e-05, + "loss": 0.008415699750185014, + "step": 223850 + }, + { + "epoch": 31.77572746628815, + "grad_norm": 0.16721801459789276, + "learning_rate": 6.823733144073811e-05, + "loss": 0.0015848480165004731, + "step": 223860 + }, + { + "epoch": 31.777146912704044, + "grad_norm": 0.0593513585627079, + "learning_rate": 6.823591199432222e-05, + "loss": 0.007658667862415314, + "step": 223870 + }, + { + "epoch": 31.778566359119942, + "grad_norm": 2.2206766605377197, + "learning_rate": 6.823449254790631e-05, + "loss": 0.012895773351192474, + "step": 223880 + }, + { + "epoch": 31.77998580553584, + "grad_norm": 4.516350269317627, + "learning_rate": 6.823307310149042e-05, + "loss": 0.011592229455709457, + "step": 223890 + }, + { + "epoch": 31.78140525195174, + "grad_norm": 0.029054878279566765, + "learning_rate": 6.823165365507453e-05, + "loss": 0.002844672277569771, + "step": 223900 + }, + { + "epoch": 31.782824698367637, + "grad_norm": 0.15707527101039886, + "learning_rate": 6.823037615330021e-05, + "loss": 0.03316796720027924, + "step": 223910 + 
}, + { + "epoch": 31.784244144783536, + "grad_norm": 2.6533868312835693, + "learning_rate": 6.822895670688431e-05, + "loss": 0.01937539577484131, + "step": 223920 + }, + { + "epoch": 31.785663591199434, + "grad_norm": 3.8101108074188232, + "learning_rate": 6.822753726046842e-05, + "loss": 0.02757905423641205, + "step": 223930 + }, + { + "epoch": 31.78708303761533, + "grad_norm": 0.21399390697479248, + "learning_rate": 6.822611781405252e-05, + "loss": 0.006366153061389923, + "step": 223940 + }, + { + "epoch": 31.788502484031227, + "grad_norm": 5.088465213775635, + "learning_rate": 6.822469836763663e-05, + "loss": 0.009004554152488709, + "step": 223950 + }, + { + "epoch": 31.789921930447125, + "grad_norm": 0.728005051612854, + "learning_rate": 6.822327892122073e-05, + "loss": 0.00292392373085022, + "step": 223960 + }, + { + "epoch": 31.791341376863024, + "grad_norm": 1.7708125114440918, + "learning_rate": 6.822185947480483e-05, + "loss": 0.0026606760919094087, + "step": 223970 + }, + { + "epoch": 31.792760823278922, + "grad_norm": 12.613724708557129, + "learning_rate": 6.822044002838894e-05, + "loss": 0.018227294087409973, + "step": 223980 + }, + { + "epoch": 31.79418026969482, + "grad_norm": 0.5408245921134949, + "learning_rate": 6.821902058197303e-05, + "loss": 0.002522206678986549, + "step": 223990 + }, + { + "epoch": 31.79559971611072, + "grad_norm": 0.3058329224586487, + "learning_rate": 6.821760113555715e-05, + "loss": 0.009641411900520324, + "step": 224000 + }, + { + "epoch": 31.79559971611072, + "eval_accuracy": 0.9914796210338908, + "eval_loss": 0.033988114446401596, + "eval_runtime": 34.0636, + "eval_samples_per_second": 461.695, + "eval_steps_per_second": 14.444, + "step": 224000 + }, + { + "epoch": 31.797019162526613, + "grad_norm": 0.03667677938938141, + "learning_rate": 6.821618168914123e-05, + "loss": 0.023062190413475035, + "step": 224010 + }, + { + "epoch": 31.79843860894251, + "grad_norm": 0.8405537605285645, + "learning_rate": 
6.821476224272534e-05, + "loss": 0.009657284617424012, + "step": 224020 + }, + { + "epoch": 31.79985805535841, + "grad_norm": 0.2767288088798523, + "learning_rate": 6.821334279630944e-05, + "loss": 0.0027938339859247207, + "step": 224030 + }, + { + "epoch": 31.801277501774308, + "grad_norm": 21.8995361328125, + "learning_rate": 6.821192334989355e-05, + "loss": 0.020475694537162782, + "step": 224040 + }, + { + "epoch": 31.802696948190206, + "grad_norm": 0.035002682358026505, + "learning_rate": 6.821050390347765e-05, + "loss": 0.01001128852367401, + "step": 224050 + }, + { + "epoch": 31.804116394606105, + "grad_norm": 0.22882527112960815, + "learning_rate": 6.820908445706174e-05, + "loss": 0.003256431221961975, + "step": 224060 + }, + { + "epoch": 31.805535841022003, + "grad_norm": 3.025855541229248, + "learning_rate": 6.820766501064585e-05, + "loss": 0.012163002789020539, + "step": 224070 + }, + { + "epoch": 31.806955287437898, + "grad_norm": 0.026076937094330788, + "learning_rate": 6.820624556422995e-05, + "loss": 0.0013769693672657014, + "step": 224080 + }, + { + "epoch": 31.808374733853796, + "grad_norm": 2.2237024307250977, + "learning_rate": 6.820482611781406e-05, + "loss": 0.007462120056152344, + "step": 224090 + }, + { + "epoch": 31.809794180269694, + "grad_norm": 1.3418560028076172, + "learning_rate": 6.820340667139816e-05, + "loss": 0.008130122721195222, + "step": 224100 + }, + { + "epoch": 31.811213626685593, + "grad_norm": 0.028037158772349358, + "learning_rate": 6.820198722498226e-05, + "loss": 0.009841575473546981, + "step": 224110 + }, + { + "epoch": 31.81263307310149, + "grad_norm": 0.04332972317934036, + "learning_rate": 6.820056777856635e-05, + "loss": 0.02037636935710907, + "step": 224120 + }, + { + "epoch": 31.81405251951739, + "grad_norm": 1.754348635673523, + "learning_rate": 6.819914833215047e-05, + "loss": 0.025161886215209962, + "step": 224130 + }, + { + "epoch": 31.815471965933288, + "grad_norm": 0.03437039628624916, + "learning_rate": 
6.819772888573456e-05, + "loss": 0.02137935906648636, + "step": 224140 + }, + { + "epoch": 31.816891412349182, + "grad_norm": 0.042069293558597565, + "learning_rate": 6.819630943931867e-05, + "loss": 0.031444883346557616, + "step": 224150 + }, + { + "epoch": 31.81831085876508, + "grad_norm": 0.04614044725894928, + "learning_rate": 6.819488999290277e-05, + "loss": 0.027843153476715087, + "step": 224160 + }, + { + "epoch": 31.81973030518098, + "grad_norm": 0.044688139110803604, + "learning_rate": 6.819347054648687e-05, + "loss": 0.009853728115558624, + "step": 224170 + }, + { + "epoch": 31.821149751596877, + "grad_norm": 0.5983924865722656, + "learning_rate": 6.819205110007098e-05, + "loss": 0.02702178657054901, + "step": 224180 + }, + { + "epoch": 31.822569198012776, + "grad_norm": 4.240438461303711, + "learning_rate": 6.819063165365508e-05, + "loss": 0.0048965513706207275, + "step": 224190 + }, + { + "epoch": 31.823988644428674, + "grad_norm": 0.11061026155948639, + "learning_rate": 6.818921220723919e-05, + "loss": 0.021498021483421326, + "step": 224200 + }, + { + "epoch": 31.825408090844572, + "grad_norm": 0.13908447325229645, + "learning_rate": 6.818779276082327e-05, + "loss": 0.028437876701354982, + "step": 224210 + }, + { + "epoch": 31.826827537260467, + "grad_norm": 0.2979036271572113, + "learning_rate": 6.818637331440738e-05, + "loss": 0.007227513194084168, + "step": 224220 + }, + { + "epoch": 31.828246983676365, + "grad_norm": 18.109729766845703, + "learning_rate": 6.818495386799148e-05, + "loss": 0.013986316323280335, + "step": 224230 + }, + { + "epoch": 31.829666430092264, + "grad_norm": 0.04214845970273018, + "learning_rate": 6.818353442157559e-05, + "loss": 0.0032639671117067337, + "step": 224240 + }, + { + "epoch": 31.831085876508162, + "grad_norm": 3.7225115299224854, + "learning_rate": 6.818211497515969e-05, + "loss": 0.052258795499801634, + "step": 224250 + }, + { + "epoch": 31.83250532292406, + "grad_norm": 0.023389266803860664, + "learning_rate": 
6.81806955287438e-05, + "loss": 0.030897301435470582, + "step": 224260 + }, + { + "epoch": 31.83392476933996, + "grad_norm": 0.01111950259655714, + "learning_rate": 6.81792760823279e-05, + "loss": 0.02819797396659851, + "step": 224270 + }, + { + "epoch": 31.835344215755857, + "grad_norm": 0.021966369822621346, + "learning_rate": 6.8177856635912e-05, + "loss": 0.007414685189723968, + "step": 224280 + }, + { + "epoch": 31.83676366217175, + "grad_norm": 0.008778875693678856, + "learning_rate": 6.81764371894961e-05, + "loss": 0.00407436415553093, + "step": 224290 + }, + { + "epoch": 31.83818310858765, + "grad_norm": 0.2907489538192749, + "learning_rate": 6.81750177430802e-05, + "loss": 0.007965266704559326, + "step": 224300 + }, + { + "epoch": 31.839602555003548, + "grad_norm": 4.1807756423950195, + "learning_rate": 6.817359829666431e-05, + "loss": 0.0020814482122659683, + "step": 224310 + }, + { + "epoch": 31.841022001419446, + "grad_norm": 0.06005258485674858, + "learning_rate": 6.81721788502484e-05, + "loss": 0.028629392385482788, + "step": 224320 + }, + { + "epoch": 31.842441447835345, + "grad_norm": 0.24015426635742188, + "learning_rate": 6.817075940383251e-05, + "loss": 0.002596981078386307, + "step": 224330 + }, + { + "epoch": 31.843860894251243, + "grad_norm": 0.0067658014595508575, + "learning_rate": 6.81693399574166e-05, + "loss": 0.03875541090965271, + "step": 224340 + }, + { + "epoch": 31.84528034066714, + "grad_norm": 0.20778998732566833, + "learning_rate": 6.816792051100072e-05, + "loss": 0.019889393448829652, + "step": 224350 + }, + { + "epoch": 31.846699787083036, + "grad_norm": 0.04795345291495323, + "learning_rate": 6.816650106458481e-05, + "loss": 0.006530588865280152, + "step": 224360 + }, + { + "epoch": 31.848119233498934, + "grad_norm": 2.1729118824005127, + "learning_rate": 6.816508161816891e-05, + "loss": 0.00262531079351902, + "step": 224370 + }, + { + "epoch": 31.849538679914833, + "grad_norm": 0.023167716339230537, + "learning_rate": 
6.816366217175302e-05, + "loss": 0.00924011766910553, + "step": 224380 + }, + { + "epoch": 31.85095812633073, + "grad_norm": 0.0126135079190135, + "learning_rate": 6.816224272533712e-05, + "loss": 0.004769825935363769, + "step": 224390 + }, + { + "epoch": 31.85237757274663, + "grad_norm": 0.0022798164281994104, + "learning_rate": 6.816082327892123e-05, + "loss": 0.015084171295166015, + "step": 224400 + }, + { + "epoch": 31.853797019162528, + "grad_norm": 0.5620907545089722, + "learning_rate": 6.815940383250533e-05, + "loss": 0.015223684906959533, + "step": 224410 + }, + { + "epoch": 31.855216465578426, + "grad_norm": 0.21543985605239868, + "learning_rate": 6.815798438608942e-05, + "loss": 0.012539428472518922, + "step": 224420 + }, + { + "epoch": 31.85663591199432, + "grad_norm": 1.155545949935913, + "learning_rate": 6.815656493967352e-05, + "loss": 0.014900805056095123, + "step": 224430 + }, + { + "epoch": 31.85805535841022, + "grad_norm": 0.011175284162163734, + "learning_rate": 6.815514549325763e-05, + "loss": 0.04125121533870697, + "step": 224440 + }, + { + "epoch": 31.859474804826117, + "grad_norm": 2.499943256378174, + "learning_rate": 6.815372604684173e-05, + "loss": 0.043642657995224, + "step": 224450 + }, + { + "epoch": 31.860894251242016, + "grad_norm": 0.34457188844680786, + "learning_rate": 6.815230660042584e-05, + "loss": 0.037349849939346313, + "step": 224460 + }, + { + "epoch": 31.862313697657914, + "grad_norm": 5.5591959953308105, + "learning_rate": 6.815088715400994e-05, + "loss": 0.017282789945602416, + "step": 224470 + }, + { + "epoch": 31.863733144073812, + "grad_norm": 0.11828640848398209, + "learning_rate": 6.814946770759404e-05, + "loss": 0.06485535502433777, + "step": 224480 + }, + { + "epoch": 31.86515259048971, + "grad_norm": 9.19913101196289, + "learning_rate": 6.814804826117815e-05, + "loss": 0.0188859298825264, + "step": 224490 + }, + { + "epoch": 31.866572036905605, + "grad_norm": 0.45978859066963196, + "learning_rate": 
6.814662881476224e-05, + "loss": 0.010989348590373992, + "step": 224500 + }, + { + "epoch": 31.866572036905605, + "eval_accuracy": 0.9888090544922744, + "eval_loss": 0.045874785631895065, + "eval_runtime": 34.5072, + "eval_samples_per_second": 455.76, + "eval_steps_per_second": 14.258, + "step": 224500 + }, + { + "epoch": 31.867991483321504, + "grad_norm": 0.7502192854881287, + "learning_rate": 6.814520936834636e-05, + "loss": 0.0016719866544008255, + "step": 224510 + }, + { + "epoch": 31.869410929737402, + "grad_norm": 7.455332279205322, + "learning_rate": 6.814378992193044e-05, + "loss": 0.04349375367164612, + "step": 224520 + }, + { + "epoch": 31.8708303761533, + "grad_norm": 17.491947174072266, + "learning_rate": 6.814237047551455e-05, + "loss": 0.008632956445217133, + "step": 224530 + }, + { + "epoch": 31.8722498225692, + "grad_norm": 0.08092310279607773, + "learning_rate": 6.814095102909865e-05, + "loss": 0.010861162841320039, + "step": 224540 + }, + { + "epoch": 31.873669268985097, + "grad_norm": 3.5000932216644287, + "learning_rate": 6.813953158268276e-05, + "loss": 0.01661835014820099, + "step": 224550 + }, + { + "epoch": 31.875088715400995, + "grad_norm": 0.768786609172821, + "learning_rate": 6.813811213626687e-05, + "loss": 0.004906551539897918, + "step": 224560 + }, + { + "epoch": 31.87650816181689, + "grad_norm": 0.01089998334646225, + "learning_rate": 6.813669268985095e-05, + "loss": 0.0009212210774421692, + "step": 224570 + }, + { + "epoch": 31.87792760823279, + "grad_norm": 0.021497441455721855, + "learning_rate": 6.813527324343506e-05, + "loss": 0.025685659050941466, + "step": 224580 + }, + { + "epoch": 31.879347054648687, + "grad_norm": 0.028152689337730408, + "learning_rate": 6.813385379701916e-05, + "loss": 0.028789982199668884, + "step": 224590 + }, + { + "epoch": 31.880766501064585, + "grad_norm": 5.891561508178711, + "learning_rate": 6.813243435060327e-05, + "loss": 0.007612830400466919, + "step": 224600 + }, + { + "epoch": 
31.882185947480483, + "grad_norm": 0.04528242349624634, + "learning_rate": 6.813101490418737e-05, + "loss": 0.029650384187698366, + "step": 224610 + }, + { + "epoch": 31.88360539389638, + "grad_norm": 0.011663914658129215, + "learning_rate": 6.812959545777148e-05, + "loss": 0.0016444973647594453, + "step": 224620 + }, + { + "epoch": 31.88502484031228, + "grad_norm": 6.505465984344482, + "learning_rate": 6.812817601135556e-05, + "loss": 0.006769774854183197, + "step": 224630 + }, + { + "epoch": 31.886444286728175, + "grad_norm": 0.021072303876280785, + "learning_rate": 6.812675656493968e-05, + "loss": 0.0014495164155960082, + "step": 224640 + }, + { + "epoch": 31.887863733144073, + "grad_norm": 0.017044518142938614, + "learning_rate": 6.812533711852379e-05, + "loss": 0.002707330510020256, + "step": 224650 + }, + { + "epoch": 31.88928317955997, + "grad_norm": 0.0675484910607338, + "learning_rate": 6.812391767210788e-05, + "loss": 0.004461589083075523, + "step": 224660 + }, + { + "epoch": 31.89070262597587, + "grad_norm": 0.05534088611602783, + "learning_rate": 6.8122498225692e-05, + "loss": 0.005926653370261192, + "step": 224670 + }, + { + "epoch": 31.892122072391768, + "grad_norm": 0.04696843400597572, + "learning_rate": 6.812107877927608e-05, + "loss": 0.0052670188248157505, + "step": 224680 + }, + { + "epoch": 31.893541518807666, + "grad_norm": 0.2911098599433899, + "learning_rate": 6.811965933286019e-05, + "loss": 0.0025550249963998796, + "step": 224690 + }, + { + "epoch": 31.894960965223564, + "grad_norm": 0.010398434475064278, + "learning_rate": 6.811823988644429e-05, + "loss": 0.0036330878734588622, + "step": 224700 + }, + { + "epoch": 31.89638041163946, + "grad_norm": 0.43333080410957336, + "learning_rate": 6.81168204400284e-05, + "loss": 0.009508013725280762, + "step": 224710 + }, + { + "epoch": 31.897799858055357, + "grad_norm": 0.0255559254437685, + "learning_rate": 6.81154009936125e-05, + "loss": 0.08565595746040344, + "step": 224720 + }, + { + "epoch": 
31.899219304471256, + "grad_norm": 0.07698985934257507, + "learning_rate": 6.811398154719659e-05, + "loss": 0.007607724517583847, + "step": 224730 + }, + { + "epoch": 31.900638750887154, + "grad_norm": 0.17710554599761963, + "learning_rate": 6.81125621007807e-05, + "loss": 0.005893680080771446, + "step": 224740 + }, + { + "epoch": 31.902058197303052, + "grad_norm": 20.049663543701172, + "learning_rate": 6.81111426543648e-05, + "loss": 0.04339771866798401, + "step": 224750 + }, + { + "epoch": 31.90347764371895, + "grad_norm": 0.015247308649122715, + "learning_rate": 6.810972320794891e-05, + "loss": 0.0143109530210495, + "step": 224760 + }, + { + "epoch": 31.90489709013485, + "grad_norm": 10.414000511169434, + "learning_rate": 6.810830376153301e-05, + "loss": 0.013583861291408539, + "step": 224770 + }, + { + "epoch": 31.906316536550744, + "grad_norm": 0.02318751811981201, + "learning_rate": 6.81068843151171e-05, + "loss": 0.021124042570590973, + "step": 224780 + }, + { + "epoch": 31.907735982966642, + "grad_norm": 0.02957110106945038, + "learning_rate": 6.81054648687012e-05, + "loss": 0.0053432628512382506, + "step": 224790 + }, + { + "epoch": 31.90915542938254, + "grad_norm": 0.27143827080726624, + "learning_rate": 6.810404542228531e-05, + "loss": 0.018198060989379882, + "step": 224800 + }, + { + "epoch": 31.91057487579844, + "grad_norm": 20.006208419799805, + "learning_rate": 6.810262597586941e-05, + "loss": 0.01642772853374481, + "step": 224810 + }, + { + "epoch": 31.911994322214337, + "grad_norm": 0.6595497131347656, + "learning_rate": 6.810120652945352e-05, + "loss": 0.02356277108192444, + "step": 224820 + }, + { + "epoch": 31.913413768630235, + "grad_norm": 0.08555730432271957, + "learning_rate": 6.809978708303762e-05, + "loss": 0.0021306157112121584, + "step": 224830 + }, + { + "epoch": 31.914833215046134, + "grad_norm": 0.628926157951355, + "learning_rate": 6.809836763662172e-05, + "loss": 0.001440710946917534, + "step": 224840 + }, + { + "epoch": 
31.91625266146203, + "grad_norm": 0.161001056432724, + "learning_rate": 6.809694819020583e-05, + "loss": 0.00212237723171711, + "step": 224850 + }, + { + "epoch": 31.917672107877927, + "grad_norm": 0.10393770784139633, + "learning_rate": 6.809552874378993e-05, + "loss": 0.010715904086828232, + "step": 224860 + }, + { + "epoch": 31.919091554293825, + "grad_norm": 0.011391018517315388, + "learning_rate": 6.809410929737404e-05, + "loss": 0.0034809060394763946, + "step": 224870 + }, + { + "epoch": 31.920511000709723, + "grad_norm": 0.11275273561477661, + "learning_rate": 6.809268985095812e-05, + "loss": 0.0061474844813346865, + "step": 224880 + }, + { + "epoch": 31.92193044712562, + "grad_norm": 2.8207216262817383, + "learning_rate": 6.809127040454223e-05, + "loss": 0.061264443397521975, + "step": 224890 + }, + { + "epoch": 31.92334989354152, + "grad_norm": 0.48159587383270264, + "learning_rate": 6.808985095812633e-05, + "loss": 0.0026533614844083788, + "step": 224900 + }, + { + "epoch": 31.924769339957418, + "grad_norm": 0.024966992437839508, + "learning_rate": 6.808843151171044e-05, + "loss": 0.02083406001329422, + "step": 224910 + }, + { + "epoch": 31.926188786373313, + "grad_norm": 3.4574432373046875, + "learning_rate": 6.808701206529454e-05, + "loss": 0.003421981632709503, + "step": 224920 + }, + { + "epoch": 31.92760823278921, + "grad_norm": 6.021180629730225, + "learning_rate": 6.808559261887863e-05, + "loss": 0.015915167331695557, + "step": 224930 + }, + { + "epoch": 31.92902767920511, + "grad_norm": 0.44794681668281555, + "learning_rate": 6.808417317246275e-05, + "loss": 0.02210240960121155, + "step": 224940 + }, + { + "epoch": 31.930447125621008, + "grad_norm": 1.282052755355835, + "learning_rate": 6.808275372604684e-05, + "loss": 0.02113766372203827, + "step": 224950 + }, + { + "epoch": 31.931866572036906, + "grad_norm": 1.4859577417373657, + "learning_rate": 6.808133427963095e-05, + "loss": 0.027621585130691528, + "step": 224960 + }, + { + "epoch": 
31.933286018452804, + "grad_norm": 0.052143268287181854, + "learning_rate": 6.807991483321505e-05, + "loss": 0.013345226645469666, + "step": 224970 + }, + { + "epoch": 31.934705464868703, + "grad_norm": 0.6125739812850952, + "learning_rate": 6.807849538679916e-05, + "loss": 0.01061306893825531, + "step": 224980 + }, + { + "epoch": 31.936124911284598, + "grad_norm": 2.8624415397644043, + "learning_rate": 6.807707594038325e-05, + "loss": 0.023602600395679473, + "step": 224990 + }, + { + "epoch": 31.937544357700496, + "grad_norm": 0.35514748096466064, + "learning_rate": 6.807565649396736e-05, + "loss": 0.07028427720069885, + "step": 225000 + }, + { + "epoch": 31.937544357700496, + "eval_accuracy": 0.9853754689387677, + "eval_loss": 0.05794696509838104, + "eval_runtime": 34.0658, + "eval_samples_per_second": 461.666, + "eval_steps_per_second": 14.443, + "step": 225000 + }, + { + "epoch": 31.938963804116394, + "grad_norm": 2.99931001663208, + "learning_rate": 6.807423704755145e-05, + "loss": 0.020571470260620117, + "step": 225010 + }, + { + "epoch": 31.940383250532292, + "grad_norm": 2.460041046142578, + "learning_rate": 6.807281760113557e-05, + "loss": 0.004888773709535599, + "step": 225020 + }, + { + "epoch": 31.94180269694819, + "grad_norm": 5.351932048797607, + "learning_rate": 6.807139815471966e-05, + "loss": 0.004782514646649361, + "step": 225030 + }, + { + "epoch": 31.94322214336409, + "grad_norm": 0.6551006436347961, + "learning_rate": 6.806997870830376e-05, + "loss": 0.0032919410616159437, + "step": 225040 + }, + { + "epoch": 31.944641589779987, + "grad_norm": 3.7694363594055176, + "learning_rate": 6.806855926188787e-05, + "loss": 0.00507575124502182, + "step": 225050 + }, + { + "epoch": 31.946061036195882, + "grad_norm": 0.013331921771168709, + "learning_rate": 6.806713981547197e-05, + "loss": 0.015005771815776826, + "step": 225060 + }, + { + "epoch": 31.94748048261178, + "grad_norm": 9.528898239135742, + "learning_rate": 6.806572036905608e-05, + "loss": 
0.02971402704715729, + "step": 225070 + }, + { + "epoch": 31.94889992902768, + "grad_norm": 11.407522201538086, + "learning_rate": 6.806430092264018e-05, + "loss": 0.0279754638671875, + "step": 225080 + }, + { + "epoch": 31.950319375443577, + "grad_norm": 1.5351190567016602, + "learning_rate": 6.806288147622427e-05, + "loss": 0.013916891813278199, + "step": 225090 + }, + { + "epoch": 31.951738821859475, + "grad_norm": 0.007365619298070669, + "learning_rate": 6.806146202980837e-05, + "loss": 0.006995044648647308, + "step": 225100 + }, + { + "epoch": 31.953158268275374, + "grad_norm": 0.30356365442276, + "learning_rate": 6.806004258339248e-05, + "loss": 0.022625482082366942, + "step": 225110 + }, + { + "epoch": 31.954577714691272, + "grad_norm": 0.12768439948558807, + "learning_rate": 6.805862313697658e-05, + "loss": 0.02512997090816498, + "step": 225120 + }, + { + "epoch": 31.955997161107167, + "grad_norm": 12.926606178283691, + "learning_rate": 6.805720369056069e-05, + "loss": 0.021362486481666564, + "step": 225130 + }, + { + "epoch": 31.957416607523065, + "grad_norm": 0.05878835171461105, + "learning_rate": 6.805578424414479e-05, + "loss": 0.025714096426963807, + "step": 225140 + }, + { + "epoch": 31.958836053938963, + "grad_norm": 0.24904054403305054, + "learning_rate": 6.805436479772889e-05, + "loss": 0.009958270937204361, + "step": 225150 + }, + { + "epoch": 31.96025550035486, + "grad_norm": 5.433131217956543, + "learning_rate": 6.8052945351313e-05, + "loss": 0.054961764812469484, + "step": 225160 + }, + { + "epoch": 31.96167494677076, + "grad_norm": 18.794179916381836, + "learning_rate": 6.80515259048971e-05, + "loss": 0.023780012130737306, + "step": 225170 + }, + { + "epoch": 31.96309439318666, + "grad_norm": 9.385665893554688, + "learning_rate": 6.80501064584812e-05, + "loss": 0.026184344291687013, + "step": 225180 + }, + { + "epoch": 31.964513839602557, + "grad_norm": 1.5457196235656738, + "learning_rate": 6.804868701206529e-05, + "loss": 
0.026369434595108033, + "step": 225190 + }, + { + "epoch": 31.96593328601845, + "grad_norm": 2.2176458835601807, + "learning_rate": 6.80472675656494e-05, + "loss": 0.0035610556602478026, + "step": 225200 + }, + { + "epoch": 31.96735273243435, + "grad_norm": 0.027985505759716034, + "learning_rate": 6.80458481192335e-05, + "loss": 0.003566715493798256, + "step": 225210 + }, + { + "epoch": 31.968772178850248, + "grad_norm": 10.022653579711914, + "learning_rate": 6.804442867281761e-05, + "loss": 0.015349379181861878, + "step": 225220 + }, + { + "epoch": 31.970191625266146, + "grad_norm": 1.8693671226501465, + "learning_rate": 6.80430092264017e-05, + "loss": 0.01688404530286789, + "step": 225230 + }, + { + "epoch": 31.971611071682045, + "grad_norm": 0.039673250168561935, + "learning_rate": 6.80415897799858e-05, + "loss": 0.004975778236985207, + "step": 225240 + }, + { + "epoch": 31.973030518097943, + "grad_norm": 1.8057912588119507, + "learning_rate": 6.804017033356991e-05, + "loss": 0.009275739639997482, + "step": 225250 + }, + { + "epoch": 31.97444996451384, + "grad_norm": 0.1786058396100998, + "learning_rate": 6.803875088715401e-05, + "loss": 0.041719910502433774, + "step": 225260 + }, + { + "epoch": 31.975869410929736, + "grad_norm": 0.5000008344650269, + "learning_rate": 6.803733144073812e-05, + "loss": 0.01525372564792633, + "step": 225270 + }, + { + "epoch": 31.977288857345634, + "grad_norm": 0.023901525884866714, + "learning_rate": 6.803591199432222e-05, + "loss": 0.010617045313119888, + "step": 225280 + }, + { + "epoch": 31.978708303761533, + "grad_norm": 0.07982167601585388, + "learning_rate": 6.803449254790632e-05, + "loss": 0.019258174300193786, + "step": 225290 + }, + { + "epoch": 31.98012775017743, + "grad_norm": 9.538247108459473, + "learning_rate": 6.803307310149041e-05, + "loss": 0.027436268329620362, + "step": 225300 + }, + { + "epoch": 31.98154719659333, + "grad_norm": 0.5941756367683411, + "learning_rate": 6.803165365507452e-05, + "loss": 
0.016781291365623473, + "step": 225310 + }, + { + "epoch": 31.982966643009227, + "grad_norm": 0.03276979178190231, + "learning_rate": 6.803023420865862e-05, + "loss": 0.009084580838680268, + "step": 225320 + }, + { + "epoch": 31.984386089425126, + "grad_norm": 0.18447229266166687, + "learning_rate": 6.802881476224273e-05, + "loss": 0.024604880809783937, + "step": 225330 + }, + { + "epoch": 31.98580553584102, + "grad_norm": 11.734211921691895, + "learning_rate": 6.802739531582683e-05, + "loss": 0.03528172969818115, + "step": 225340 + }, + { + "epoch": 31.98722498225692, + "grad_norm": 0.006622053682804108, + "learning_rate": 6.802597586941093e-05, + "loss": 0.0028415974229574203, + "step": 225350 + }, + { + "epoch": 31.988644428672817, + "grad_norm": 0.04328836500644684, + "learning_rate": 6.802455642299504e-05, + "loss": 0.022968432307243346, + "step": 225360 + }, + { + "epoch": 31.990063875088715, + "grad_norm": 0.09625077247619629, + "learning_rate": 6.802313697657914e-05, + "loss": 0.0032951809465885163, + "step": 225370 + }, + { + "epoch": 31.991483321504614, + "grad_norm": 0.2569192349910736, + "learning_rate": 6.802171753016325e-05, + "loss": 0.028855276107788087, + "step": 225380 + }, + { + "epoch": 31.992902767920512, + "grad_norm": 0.4871579110622406, + "learning_rate": 6.802029808374734e-05, + "loss": 0.020717762410640717, + "step": 225390 + }, + { + "epoch": 31.99432221433641, + "grad_norm": 0.05741727352142334, + "learning_rate": 6.801887863733144e-05, + "loss": 0.0014943975955247879, + "step": 225400 + }, + { + "epoch": 31.995741660752305, + "grad_norm": 0.06554523855447769, + "learning_rate": 6.801745919091554e-05, + "loss": 0.0025384105741977693, + "step": 225410 + }, + { + "epoch": 31.997161107168203, + "grad_norm": 0.026202406734228134, + "learning_rate": 6.801603974449965e-05, + "loss": 0.013293239474296569, + "step": 225420 + }, + { + "epoch": 31.9985805535841, + "grad_norm": 0.8593502044677734, + "learning_rate": 6.801462029808375e-05, + "loss": 
0.002008972689509392, + "step": 225430 + }, + { + "epoch": 32.0, + "grad_norm": 0.235457643866539, + "learning_rate": 6.801320085166786e-05, + "loss": 0.0052239660173654555, + "step": 225440 + }, + { + "epoch": 32.001419446415895, + "grad_norm": 1.156724214553833, + "learning_rate": 6.801178140525196e-05, + "loss": 0.015463405847549438, + "step": 225450 + }, + { + "epoch": 32.0028388928318, + "grad_norm": 1.5759824514389038, + "learning_rate": 6.801036195883605e-05, + "loss": 0.005777611956000328, + "step": 225460 + }, + { + "epoch": 32.00425833924769, + "grad_norm": 0.017195381224155426, + "learning_rate": 6.800894251242016e-05, + "loss": 0.0035879679024219515, + "step": 225470 + }, + { + "epoch": 32.00567778566359, + "grad_norm": 0.1289886236190796, + "learning_rate": 6.800752306600426e-05, + "loss": 0.026769959926605226, + "step": 225480 + }, + { + "epoch": 32.00709723207949, + "grad_norm": 0.020756451413035393, + "learning_rate": 6.800610361958837e-05, + "loss": 0.0005879897624254226, + "step": 225490 + }, + { + "epoch": 32.00851667849539, + "grad_norm": 0.06718094646930695, + "learning_rate": 6.800468417317246e-05, + "loss": 0.007511949539184571, + "step": 225500 + }, + { + "epoch": 32.00851667849539, + "eval_accuracy": 0.9863928276212882, + "eval_loss": 0.058794163167476654, + "eval_runtime": 32.9342, + "eval_samples_per_second": 477.527, + "eval_steps_per_second": 14.939, + "step": 225500 + }, + { + "epoch": 32.009936124911285, + "grad_norm": 3.416248321533203, + "learning_rate": 6.800326472675657e-05, + "loss": 0.026796227693557738, + "step": 225510 + }, + { + "epoch": 32.01135557132718, + "grad_norm": 0.3101378083229065, + "learning_rate": 6.800184528034066e-05, + "loss": 0.0015913538634777069, + "step": 225520 + }, + { + "epoch": 32.01277501774308, + "grad_norm": 0.274209201335907, + "learning_rate": 6.800042583392478e-05, + "loss": 0.0013554714620113372, + "step": 225530 + }, + { + "epoch": 32.014194464158976, + "grad_norm": 0.04068811610341072, + 
"learning_rate": 6.799900638750887e-05, + "loss": 0.002965814992785454, + "step": 225540 + }, + { + "epoch": 32.01561391057488, + "grad_norm": 4.010166168212891, + "learning_rate": 6.799758694109297e-05, + "loss": 0.019072823226451874, + "step": 225550 + }, + { + "epoch": 32.01703335699077, + "grad_norm": 0.138960063457489, + "learning_rate": 6.799616749467708e-05, + "loss": 0.006544172763824463, + "step": 225560 + }, + { + "epoch": 32.018452803406674, + "grad_norm": 0.01409961748868227, + "learning_rate": 6.799474804826118e-05, + "loss": 0.023673275113105775, + "step": 225570 + }, + { + "epoch": 32.01987224982257, + "grad_norm": 0.12462171912193298, + "learning_rate": 6.799332860184529e-05, + "loss": 0.0010585509240627288, + "step": 225580 + }, + { + "epoch": 32.021291696238464, + "grad_norm": 0.08817264437675476, + "learning_rate": 6.799190915542939e-05, + "loss": 0.02110879123210907, + "step": 225590 + }, + { + "epoch": 32.022711142654366, + "grad_norm": 0.015896758064627647, + "learning_rate": 6.799048970901348e-05, + "loss": 0.010520736128091812, + "step": 225600 + }, + { + "epoch": 32.02413058907026, + "grad_norm": 0.34597334265708923, + "learning_rate": 6.798907026259758e-05, + "loss": 0.0054096519947052, + "step": 225610 + }, + { + "epoch": 32.02555003548616, + "grad_norm": 3.033015489578247, + "learning_rate": 6.798765081618169e-05, + "loss": 0.017440232634544372, + "step": 225620 + }, + { + "epoch": 32.02696948190206, + "grad_norm": 0.16050459444522858, + "learning_rate": 6.798623136976579e-05, + "loss": 0.0014092542231082916, + "step": 225630 + }, + { + "epoch": 32.02838892831796, + "grad_norm": 0.43165454268455505, + "learning_rate": 6.79848119233499e-05, + "loss": 0.000973859429359436, + "step": 225640 + }, + { + "epoch": 32.029808374733854, + "grad_norm": 0.021897850558161736, + "learning_rate": 6.7983392476934e-05, + "loss": 0.0011765342205762863, + "step": 225650 + }, + { + "epoch": 32.03122782114975, + "grad_norm": 0.5210105180740356, + 
"learning_rate": 6.79819730305181e-05, + "loss": 0.03165818452835083, + "step": 225660 + }, + { + "epoch": 32.03264726756565, + "grad_norm": 0.03875475749373436, + "learning_rate": 6.79805535841022e-05, + "loss": 0.0009877372533082962, + "step": 225670 + }, + { + "epoch": 32.034066713981545, + "grad_norm": 0.690407931804657, + "learning_rate": 6.79791341376863e-05, + "loss": 0.003597318381071091, + "step": 225680 + }, + { + "epoch": 32.03548616039745, + "grad_norm": 0.0056189014576375484, + "learning_rate": 6.797771469127041e-05, + "loss": 0.011977921426296233, + "step": 225690 + }, + { + "epoch": 32.03690560681334, + "grad_norm": 4.438775062561035, + "learning_rate": 6.797629524485451e-05, + "loss": 0.010687750577926636, + "step": 225700 + }, + { + "epoch": 32.038325053229244, + "grad_norm": 0.04551001638174057, + "learning_rate": 6.797487579843861e-05, + "loss": 0.0009366165846586228, + "step": 225710 + }, + { + "epoch": 32.03974449964514, + "grad_norm": 0.08088213950395584, + "learning_rate": 6.797345635202271e-05, + "loss": 0.002966693043708801, + "step": 225720 + }, + { + "epoch": 32.04116394606103, + "grad_norm": 0.23085173964500427, + "learning_rate": 6.797203690560682e-05, + "loss": 0.003607754036784172, + "step": 225730 + }, + { + "epoch": 32.042583392476935, + "grad_norm": 16.407472610473633, + "learning_rate": 6.797061745919092e-05, + "loss": 0.007807709276676178, + "step": 225740 + }, + { + "epoch": 32.04400283889283, + "grad_norm": 1.3655463457107544, + "learning_rate": 6.796919801277503e-05, + "loss": 0.011934001743793488, + "step": 225750 + }, + { + "epoch": 32.04542228530873, + "grad_norm": 0.2582022249698639, + "learning_rate": 6.796777856635912e-05, + "loss": 0.02205028533935547, + "step": 225760 + }, + { + "epoch": 32.046841731724626, + "grad_norm": 4.730386734008789, + "learning_rate": 6.796635911994322e-05, + "loss": 0.021076075732707977, + "step": 225770 + }, + { + "epoch": 32.04826117814053, + "grad_norm": 0.003873984795063734, + 
"learning_rate": 6.796493967352733e-05, + "loss": 0.017658725380897522, + "step": 225780 + }, + { + "epoch": 32.04968062455642, + "grad_norm": 7.249237537384033, + "learning_rate": 6.796352022711143e-05, + "loss": 0.03775023221969605, + "step": 225790 + }, + { + "epoch": 32.05110007097232, + "grad_norm": 0.0968891829252243, + "learning_rate": 6.796210078069554e-05, + "loss": 0.020979142189025878, + "step": 225800 + }, + { + "epoch": 32.05251951738822, + "grad_norm": 0.03359798714518547, + "learning_rate": 6.796068133427962e-05, + "loss": 0.009515824913978576, + "step": 225810 + }, + { + "epoch": 32.053938963804114, + "grad_norm": 0.2174043357372284, + "learning_rate": 6.795926188786374e-05, + "loss": 0.004498536139726639, + "step": 225820 + }, + { + "epoch": 32.055358410220016, + "grad_norm": 0.03616295009851456, + "learning_rate": 6.795784244144783e-05, + "loss": 0.003534504026174545, + "step": 225830 + }, + { + "epoch": 32.05677785663591, + "grad_norm": 8.833267211914062, + "learning_rate": 6.795642299503194e-05, + "loss": 0.011897098273038864, + "step": 225840 + }, + { + "epoch": 32.05819730305181, + "grad_norm": 0.04778765141963959, + "learning_rate": 6.795500354861604e-05, + "loss": 0.0034243032336235047, + "step": 225850 + }, + { + "epoch": 32.05961674946771, + "grad_norm": 13.641246795654297, + "learning_rate": 6.795358410220014e-05, + "loss": 0.014955081045627594, + "step": 225860 + }, + { + "epoch": 32.0610361958836, + "grad_norm": 0.30954059958457947, + "learning_rate": 6.795216465578425e-05, + "loss": 0.009021814167499542, + "step": 225870 + }, + { + "epoch": 32.062455642299504, + "grad_norm": 0.2303438037633896, + "learning_rate": 6.795074520936835e-05, + "loss": 0.0010306891053915024, + "step": 225880 + }, + { + "epoch": 32.0638750887154, + "grad_norm": 0.30118635296821594, + "learning_rate": 6.794932576295246e-05, + "loss": 0.011443175375461578, + "step": 225890 + }, + { + "epoch": 32.0652945351313, + "grad_norm": 0.16095805168151855, + 
"learning_rate": 6.794790631653655e-05, + "loss": 0.005943663418292999, + "step": 225900 + }, + { + "epoch": 32.066713981547196, + "grad_norm": 0.0321248322725296, + "learning_rate": 6.794648687012065e-05, + "loss": 0.008763393759727478, + "step": 225910 + }, + { + "epoch": 32.0681334279631, + "grad_norm": 0.06952592730522156, + "learning_rate": 6.794506742370475e-05, + "loss": 0.0005770076066255569, + "step": 225920 + }, + { + "epoch": 32.06955287437899, + "grad_norm": 0.021722951903939247, + "learning_rate": 6.794364797728886e-05, + "loss": 0.0007000241428613662, + "step": 225930 + }, + { + "epoch": 32.07097232079489, + "grad_norm": 0.1027609184384346, + "learning_rate": 6.794222853087296e-05, + "loss": 0.025169748067855834, + "step": 225940 + }, + { + "epoch": 32.07239176721079, + "grad_norm": 0.11525753885507584, + "learning_rate": 6.794080908445707e-05, + "loss": 0.004572008922696114, + "step": 225950 + }, + { + "epoch": 32.07381121362668, + "grad_norm": 0.005694626830518246, + "learning_rate": 6.793938963804117e-05, + "loss": 0.002849406376481056, + "step": 225960 + }, + { + "epoch": 32.075230660042585, + "grad_norm": 0.011876318603754044, + "learning_rate": 6.793797019162526e-05, + "loss": 0.005004815012216568, + "step": 225970 + }, + { + "epoch": 32.07665010645848, + "grad_norm": 0.026346944272518158, + "learning_rate": 6.793655074520937e-05, + "loss": 0.04623944759368896, + "step": 225980 + }, + { + "epoch": 32.07806955287438, + "grad_norm": 0.020214218646287918, + "learning_rate": 6.793513129879347e-05, + "loss": 0.005116449669003487, + "step": 225990 + }, + { + "epoch": 32.07948899929028, + "grad_norm": 0.00967826135456562, + "learning_rate": 6.793371185237758e-05, + "loss": 0.0055853743106126785, + "step": 226000 + }, + { + "epoch": 32.07948899929028, + "eval_accuracy": 0.9834043364913843, + "eval_loss": 0.0651790052652359, + "eval_runtime": 33.4963, + "eval_samples_per_second": 469.514, + "eval_steps_per_second": 14.688, + "step": 226000 + }, + { + 
"epoch": 32.08090844570617, + "grad_norm": 27.6313533782959, + "learning_rate": 6.793229240596167e-05, + "loss": 0.05992723703384399, + "step": 226010 + }, + { + "epoch": 32.08232789212207, + "grad_norm": 0.022658897563815117, + "learning_rate": 6.793087295954578e-05, + "loss": 0.007435028254985809, + "step": 226020 + }, + { + "epoch": 32.08374733853797, + "grad_norm": 0.18772947788238525, + "learning_rate": 6.792945351312987e-05, + "loss": 0.00471024177968502, + "step": 226030 + }, + { + "epoch": 32.08516678495387, + "grad_norm": 0.24476061761379242, + "learning_rate": 6.792803406671399e-05, + "loss": 0.005466364696621895, + "step": 226040 + }, + { + "epoch": 32.086586231369765, + "grad_norm": 0.2563769817352295, + "learning_rate": 6.79266146202981e-05, + "loss": 0.01826365739107132, + "step": 226050 + }, + { + "epoch": 32.08800567778567, + "grad_norm": 2.336925983428955, + "learning_rate": 6.79251951738822e-05, + "loss": 0.0011826176196336746, + "step": 226060 + }, + { + "epoch": 32.08942512420156, + "grad_norm": 1.4505088329315186, + "learning_rate": 6.792377572746629e-05, + "loss": 0.0031688835471868513, + "step": 226070 + }, + { + "epoch": 32.090844570617456, + "grad_norm": 18.57175064086914, + "learning_rate": 6.792235628105039e-05, + "loss": 0.014582014083862305, + "step": 226080 + }, + { + "epoch": 32.09226401703336, + "grad_norm": 0.19626405835151672, + "learning_rate": 6.79209368346345e-05, + "loss": 0.008132746815681458, + "step": 226090 + }, + { + "epoch": 32.09368346344925, + "grad_norm": 0.0055230665020644665, + "learning_rate": 6.79195173882186e-05, + "loss": 0.014757515490055084, + "step": 226100 + }, + { + "epoch": 32.095102909865155, + "grad_norm": 0.02518056146800518, + "learning_rate": 6.791809794180271e-05, + "loss": 0.006229698657989502, + "step": 226110 + }, + { + "epoch": 32.09652235628105, + "grad_norm": 4.031813144683838, + "learning_rate": 6.791667849538679e-05, + "loss": 0.01869034767150879, + "step": 226120 + }, + { + "epoch": 
32.09794180269695, + "grad_norm": 0.6111317276954651, + "learning_rate": 6.79152590489709e-05, + "loss": 0.02277781218290329, + "step": 226130 + }, + { + "epoch": 32.099361249112846, + "grad_norm": 0.04170465096831322, + "learning_rate": 6.791383960255501e-05, + "loss": 0.008871898055076599, + "step": 226140 + }, + { + "epoch": 32.10078069552874, + "grad_norm": 0.11410875618457794, + "learning_rate": 6.791242015613911e-05, + "loss": 0.02751876413822174, + "step": 226150 + }, + { + "epoch": 32.10220014194464, + "grad_norm": 7.799099922180176, + "learning_rate": 6.791100070972322e-05, + "loss": 0.029103317856788637, + "step": 226160 + }, + { + "epoch": 32.10361958836054, + "grad_norm": 5.764072418212891, + "learning_rate": 6.79095812633073e-05, + "loss": 0.02120859771966934, + "step": 226170 + }, + { + "epoch": 32.10503903477644, + "grad_norm": 0.00781995989382267, + "learning_rate": 6.790816181689142e-05, + "loss": 0.016630740463733674, + "step": 226180 + }, + { + "epoch": 32.106458481192334, + "grad_norm": 0.011474930681288242, + "learning_rate": 6.790674237047551e-05, + "loss": 0.0009336657822132111, + "step": 226190 + }, + { + "epoch": 32.107877927608236, + "grad_norm": 0.6200491786003113, + "learning_rate": 6.790532292405963e-05, + "loss": 0.021954762935638427, + "step": 226200 + }, + { + "epoch": 32.10929737402413, + "grad_norm": 0.21447274088859558, + "learning_rate": 6.790390347764372e-05, + "loss": 0.009431006014347076, + "step": 226210 + }, + { + "epoch": 32.110716820440025, + "grad_norm": 0.4949254095554352, + "learning_rate": 6.790248403122782e-05, + "loss": 0.010945437848567963, + "step": 226220 + }, + { + "epoch": 32.11213626685593, + "grad_norm": 0.5460069179534912, + "learning_rate": 6.790106458481193e-05, + "loss": 0.0005258731544017792, + "step": 226230 + }, + { + "epoch": 32.11355571327182, + "grad_norm": 0.012957349419593811, + "learning_rate": 6.789964513839603e-05, + "loss": 0.009892486780881882, + "step": 226240 + }, + { + "epoch": 
32.114975159687724, + "grad_norm": 0.05390701815485954, + "learning_rate": 6.789822569198014e-05, + "loss": 0.034292811155319215, + "step": 226250 + }, + { + "epoch": 32.11639460610362, + "grad_norm": 0.012682627886533737, + "learning_rate": 6.789680624556424e-05, + "loss": 0.014278225600719452, + "step": 226260 + }, + { + "epoch": 32.11781405251952, + "grad_norm": 0.5053780674934387, + "learning_rate": 6.789538679914833e-05, + "loss": 0.0038548950105905533, + "step": 226270 + }, + { + "epoch": 32.119233498935415, + "grad_norm": 0.01031600683927536, + "learning_rate": 6.789396735273243e-05, + "loss": 0.022361889481544495, + "step": 226280 + }, + { + "epoch": 32.12065294535131, + "grad_norm": 0.19116392731666565, + "learning_rate": 6.789254790631654e-05, + "loss": 0.04397893846035004, + "step": 226290 + }, + { + "epoch": 32.12207239176721, + "grad_norm": 0.02821987494826317, + "learning_rate": 6.789112845990064e-05, + "loss": 0.017762596905231475, + "step": 226300 + }, + { + "epoch": 32.12349183818311, + "grad_norm": 10.63321304321289, + "learning_rate": 6.788970901348475e-05, + "loss": 0.021695098280906676, + "step": 226310 + }, + { + "epoch": 32.12491128459901, + "grad_norm": 0.0894048660993576, + "learning_rate": 6.788828956706885e-05, + "loss": 0.0022847197949886324, + "step": 226320 + }, + { + "epoch": 32.1263307310149, + "grad_norm": 0.318244993686676, + "learning_rate": 6.788687012065295e-05, + "loss": 0.009692957997322083, + "step": 226330 + }, + { + "epoch": 32.127750177430805, + "grad_norm": 0.04463260620832443, + "learning_rate": 6.788545067423706e-05, + "loss": 0.00043567493557929995, + "step": 226340 + }, + { + "epoch": 32.1291696238467, + "grad_norm": 1.1453462839126587, + "learning_rate": 6.788403122782115e-05, + "loss": 0.005518807470798493, + "step": 226350 + }, + { + "epoch": 32.130589070262594, + "grad_norm": 0.0510883703827858, + "learning_rate": 6.788261178140526e-05, + "loss": 0.017912130057811736, + "step": 226360 + }, + { + "epoch": 
32.132008516678496, + "grad_norm": 0.040778279304504395, + "learning_rate": 6.788119233498935e-05, + "loss": 0.025244176387786865, + "step": 226370 + }, + { + "epoch": 32.13342796309439, + "grad_norm": 7.456422805786133, + "learning_rate": 6.787977288857346e-05, + "loss": 0.041823184490203856, + "step": 226380 + }, + { + "epoch": 32.13484740951029, + "grad_norm": 10.022063255310059, + "learning_rate": 6.787835344215756e-05, + "loss": 0.02776021361351013, + "step": 226390 + }, + { + "epoch": 32.13626685592619, + "grad_norm": 0.19150298833847046, + "learning_rate": 6.787693399574167e-05, + "loss": 0.016533374786376953, + "step": 226400 + }, + { + "epoch": 32.13768630234209, + "grad_norm": 11.90239143371582, + "learning_rate": 6.787551454932576e-05, + "loss": 0.024763843417167662, + "step": 226410 + }, + { + "epoch": 32.139105748757984, + "grad_norm": 0.6358726620674133, + "learning_rate": 6.787409510290988e-05, + "loss": 0.0022670570760965347, + "step": 226420 + }, + { + "epoch": 32.14052519517388, + "grad_norm": 8.413793563842773, + "learning_rate": 6.787267565649397e-05, + "loss": 0.014474707841873168, + "step": 226430 + }, + { + "epoch": 32.14194464158978, + "grad_norm": 0.009398320689797401, + "learning_rate": 6.787125621007807e-05, + "loss": 0.026953858137130738, + "step": 226440 + }, + { + "epoch": 32.143364088005676, + "grad_norm": 0.32910358905792236, + "learning_rate": 6.786983676366218e-05, + "loss": 0.01904934346675873, + "step": 226450 + }, + { + "epoch": 32.14478353442158, + "grad_norm": 0.7821293473243713, + "learning_rate": 6.786841731724628e-05, + "loss": 0.0015857454389333725, + "step": 226460 + }, + { + "epoch": 32.14620298083747, + "grad_norm": 0.016217797994613647, + "learning_rate": 6.786699787083039e-05, + "loss": 0.0012956250458955765, + "step": 226470 + }, + { + "epoch": 32.147622427253374, + "grad_norm": 0.051509737968444824, + "learning_rate": 6.786557842441447e-05, + "loss": 0.004078931733965873, + "step": 226480 + }, + { + "epoch": 
32.14904187366927, + "grad_norm": 1.506898283958435, + "learning_rate": 6.786415897799858e-05, + "loss": 0.05694327354431152, + "step": 226490 + }, + { + "epoch": 32.150461320085164, + "grad_norm": 8.771342277526855, + "learning_rate": 6.786273953158268e-05, + "loss": 0.007567209750413894, + "step": 226500 + }, + { + "epoch": 32.150461320085164, + "eval_accuracy": 0.9849939594328225, + "eval_loss": 0.057235393673181534, + "eval_runtime": 34.6398, + "eval_samples_per_second": 454.015, + "eval_steps_per_second": 14.203, + "step": 226500 + }, + { + "epoch": 32.151880766501066, + "grad_norm": 0.3276655077934265, + "learning_rate": 6.786132008516679e-05, + "loss": 0.024104154109954833, + "step": 226510 + }, + { + "epoch": 32.15330021291696, + "grad_norm": 11.219908714294434, + "learning_rate": 6.785990063875089e-05, + "loss": 0.02843546271324158, + "step": 226520 + }, + { + "epoch": 32.15471965933286, + "grad_norm": 0.13571691513061523, + "learning_rate": 6.785848119233499e-05, + "loss": 0.004830534383654594, + "step": 226530 + }, + { + "epoch": 32.15613910574876, + "grad_norm": 11.759196281433105, + "learning_rate": 6.78570617459191e-05, + "loss": 0.05479285717010498, + "step": 226540 + }, + { + "epoch": 32.15755855216466, + "grad_norm": 0.16090506315231323, + "learning_rate": 6.78556422995032e-05, + "loss": 0.001169077306985855, + "step": 226550 + }, + { + "epoch": 32.15897799858055, + "grad_norm": 0.06365156918764114, + "learning_rate": 6.785422285308731e-05, + "loss": 0.00881885588169098, + "step": 226560 + }, + { + "epoch": 32.16039744499645, + "grad_norm": 0.014538298361003399, + "learning_rate": 6.78528034066714e-05, + "loss": 0.00044929347932338716, + "step": 226570 + }, + { + "epoch": 32.16181689141235, + "grad_norm": 0.01989283226430416, + "learning_rate": 6.78513839602555e-05, + "loss": 0.029707825183868407, + "step": 226580 + }, + { + "epoch": 32.163236337828245, + "grad_norm": 0.7520415186882019, + "learning_rate": 6.78499645138396e-05, + "loss": 
0.0013905741274356843, + "step": 226590 + }, + { + "epoch": 32.16465578424415, + "grad_norm": 0.004346861504018307, + "learning_rate": 6.784854506742371e-05, + "loss": 0.0017464429140090943, + "step": 226600 + }, + { + "epoch": 32.16607523066004, + "grad_norm": 0.01068064384162426, + "learning_rate": 6.784712562100781e-05, + "loss": 0.0006943754851818085, + "step": 226610 + }, + { + "epoch": 32.16749467707594, + "grad_norm": 0.6489815711975098, + "learning_rate": 6.784570617459192e-05, + "loss": 0.009316331148147583, + "step": 226620 + }, + { + "epoch": 32.16891412349184, + "grad_norm": 0.05263878405094147, + "learning_rate": 6.784428672817602e-05, + "loss": 0.001408936083316803, + "step": 226630 + }, + { + "epoch": 32.17033356990773, + "grad_norm": 0.04206226021051407, + "learning_rate": 6.784286728176011e-05, + "loss": 0.004071025922894478, + "step": 226640 + }, + { + "epoch": 32.171753016323635, + "grad_norm": 0.021569155156612396, + "learning_rate": 6.784144783534422e-05, + "loss": 0.0029397141188383102, + "step": 226650 + }, + { + "epoch": 32.17317246273953, + "grad_norm": 0.0012881702277809381, + "learning_rate": 6.784002838892832e-05, + "loss": 0.0023689642548561096, + "step": 226660 + }, + { + "epoch": 32.17459190915543, + "grad_norm": 0.07190117985010147, + "learning_rate": 6.783860894251243e-05, + "loss": 0.005682816356420517, + "step": 226670 + }, + { + "epoch": 32.176011355571326, + "grad_norm": 0.21605613827705383, + "learning_rate": 6.783718949609652e-05, + "loss": 0.003471849486231804, + "step": 226680 + }, + { + "epoch": 32.17743080198723, + "grad_norm": 8.209395408630371, + "learning_rate": 6.783577004968063e-05, + "loss": 0.012408919632434845, + "step": 226690 + }, + { + "epoch": 32.17885024840312, + "grad_norm": 0.11910762637853622, + "learning_rate": 6.783435060326472e-05, + "loss": 0.018804123997688292, + "step": 226700 + }, + { + "epoch": 32.18026969481902, + "grad_norm": 11.646269798278809, + "learning_rate": 6.783293115684884e-05, + "loss": 
0.010764125734567642, + "step": 226710 + }, + { + "epoch": 32.18168914123492, + "grad_norm": 0.1823829710483551, + "learning_rate": 6.783151171043293e-05, + "loss": 0.0021681636571884155, + "step": 226720 + }, + { + "epoch": 32.183108587650814, + "grad_norm": 1.2800319194793701, + "learning_rate": 6.783009226401703e-05, + "loss": 0.006319642066955566, + "step": 226730 + }, + { + "epoch": 32.184528034066716, + "grad_norm": 2.1969563961029053, + "learning_rate": 6.782867281760114e-05, + "loss": 0.0020336851477622984, + "step": 226740 + }, + { + "epoch": 32.18594748048261, + "grad_norm": 8.305089950561523, + "learning_rate": 6.782725337118524e-05, + "loss": 0.024220985174179078, + "step": 226750 + }, + { + "epoch": 32.18736692689851, + "grad_norm": 0.17709019780158997, + "learning_rate": 6.782583392476935e-05, + "loss": 0.0029531802982091905, + "step": 226760 + }, + { + "epoch": 32.18878637331441, + "grad_norm": 0.9323855042457581, + "learning_rate": 6.782441447835345e-05, + "loss": 0.007575319707393646, + "step": 226770 + }, + { + "epoch": 32.1902058197303, + "grad_norm": 0.9408407807350159, + "learning_rate": 6.782299503193756e-05, + "loss": 0.017737257480621337, + "step": 226780 + }, + { + "epoch": 32.191625266146204, + "grad_norm": 0.03764651343226433, + "learning_rate": 6.782157558552164e-05, + "loss": 0.00817856341600418, + "step": 226790 + }, + { + "epoch": 32.1930447125621, + "grad_norm": 0.026559466496109962, + "learning_rate": 6.782015613910575e-05, + "loss": 0.004506718739867211, + "step": 226800 + }, + { + "epoch": 32.194464158978, + "grad_norm": 0.2352267950773239, + "learning_rate": 6.781873669268985e-05, + "loss": 0.016949307918548585, + "step": 226810 + }, + { + "epoch": 32.195883605393895, + "grad_norm": 13.2322359085083, + "learning_rate": 6.781731724627396e-05, + "loss": 0.008841350674629211, + "step": 226820 + }, + { + "epoch": 32.1973030518098, + "grad_norm": 0.7084075808525085, + "learning_rate": 6.781589779985806e-05, + "loss": 
0.0019575174897909164, + "step": 226830 + }, + { + "epoch": 32.19872249822569, + "grad_norm": 0.017347637563943863, + "learning_rate": 6.781447835344216e-05, + "loss": 0.012627491354942321, + "step": 226840 + }, + { + "epoch": 32.20014194464159, + "grad_norm": 27.83686637878418, + "learning_rate": 6.781305890702627e-05, + "loss": 0.05242570042610169, + "step": 226850 + }, + { + "epoch": 32.20156139105749, + "grad_norm": 0.719579815864563, + "learning_rate": 6.781163946061036e-05, + "loss": 0.012030031532049179, + "step": 226860 + }, + { + "epoch": 32.20298083747338, + "grad_norm": 0.8775447010993958, + "learning_rate": 6.781022001419447e-05, + "loss": 0.022775954008102416, + "step": 226870 + }, + { + "epoch": 32.204400283889285, + "grad_norm": 0.05771833658218384, + "learning_rate": 6.780880056777857e-05, + "loss": 0.01746365875005722, + "step": 226880 + }, + { + "epoch": 32.20581973030518, + "grad_norm": 0.9570845365524292, + "learning_rate": 6.780738112136267e-05, + "loss": 0.0026505559682846068, + "step": 226890 + }, + { + "epoch": 32.20723917672108, + "grad_norm": 0.05492893233895302, + "learning_rate": 6.780596167494677e-05, + "loss": 0.06254994869232178, + "step": 226900 + }, + { + "epoch": 32.20865862313698, + "grad_norm": 1.6246583461761475, + "learning_rate": 6.780454222853088e-05, + "loss": 0.007946115732192994, + "step": 226910 + }, + { + "epoch": 32.21007806955287, + "grad_norm": 0.024437978863716125, + "learning_rate": 6.780312278211497e-05, + "loss": 0.016856712102890015, + "step": 226920 + }, + { + "epoch": 32.21149751596877, + "grad_norm": 0.021988969296216965, + "learning_rate": 6.780170333569909e-05, + "loss": 0.02473500669002533, + "step": 226930 + }, + { + "epoch": 32.21291696238467, + "grad_norm": 0.8566555976867676, + "learning_rate": 6.780028388928318e-05, + "loss": 0.011438916623592376, + "step": 226940 + }, + { + "epoch": 32.21433640880057, + "grad_norm": 0.0034157712943851948, + "learning_rate": 6.779886444286728e-05, + "loss": 
0.0029316745698451997, + "step": 226950 + }, + { + "epoch": 32.215755855216464, + "grad_norm": 0.020835809409618378, + "learning_rate": 6.779744499645139e-05, + "loss": 0.01733205169439316, + "step": 226960 + }, + { + "epoch": 32.217175301632366, + "grad_norm": 0.0289155263453722, + "learning_rate": 6.779602555003549e-05, + "loss": 0.0011914093047380447, + "step": 226970 + }, + { + "epoch": 32.21859474804826, + "grad_norm": 0.013304891996085644, + "learning_rate": 6.77946061036196e-05, + "loss": 0.0021188180893659593, + "step": 226980 + }, + { + "epoch": 32.220014194464156, + "grad_norm": 0.7231037020683289, + "learning_rate": 6.779318665720368e-05, + "loss": 0.001973387598991394, + "step": 226990 + }, + { + "epoch": 32.22143364088006, + "grad_norm": 0.023414866998791695, + "learning_rate": 6.77917672107878e-05, + "loss": 0.0065168246626853945, + "step": 227000 + }, + { + "epoch": 32.22143364088006, + "eval_accuracy": 0.9914796210338908, + "eval_loss": 0.03700839728116989, + "eval_runtime": 33.9943, + "eval_samples_per_second": 462.637, + "eval_steps_per_second": 14.473, + "step": 227000 + }, + { + "epoch": 32.22285308729595, + "grad_norm": 4.523889064788818, + "learning_rate": 6.779034776437189e-05, + "loss": 0.0016607701778411864, + "step": 227010 + }, + { + "epoch": 32.224272533711854, + "grad_norm": 0.043177589774131775, + "learning_rate": 6.7788928317956e-05, + "loss": 0.009133559465408326, + "step": 227020 + }, + { + "epoch": 32.22569198012775, + "grad_norm": 0.14806191623210907, + "learning_rate": 6.77875088715401e-05, + "loss": 0.006231259927153588, + "step": 227030 + }, + { + "epoch": 32.22711142654365, + "grad_norm": 0.27731719613075256, + "learning_rate": 6.77860894251242e-05, + "loss": 0.00062263123691082, + "step": 227040 + }, + { + "epoch": 32.228530872959546, + "grad_norm": 0.1982564777135849, + "learning_rate": 6.778466997870831e-05, + "loss": 0.011056290566921234, + "step": 227050 + }, + { + "epoch": 32.22995031937544, + "grad_norm": 
0.5951840281486511, + "learning_rate": 6.77832505322924e-05, + "loss": 0.008127520978450774, + "step": 227060 + }, + { + "epoch": 32.23136976579134, + "grad_norm": 0.017885245382785797, + "learning_rate": 6.778183108587652e-05, + "loss": 0.004548555240035057, + "step": 227070 + }, + { + "epoch": 32.23278921220724, + "grad_norm": 0.10104569792747498, + "learning_rate": 6.778041163946061e-05, + "loss": 0.05551460385322571, + "step": 227080 + }, + { + "epoch": 32.23420865862314, + "grad_norm": 0.12394300103187561, + "learning_rate": 6.777899219304471e-05, + "loss": 0.02415381371974945, + "step": 227090 + }, + { + "epoch": 32.235628105039034, + "grad_norm": 0.1958477944135666, + "learning_rate": 6.777757274662881e-05, + "loss": 0.001025635376572609, + "step": 227100 + }, + { + "epoch": 32.237047551454936, + "grad_norm": 0.714449405670166, + "learning_rate": 6.777615330021292e-05, + "loss": 0.01214798092842102, + "step": 227110 + }, + { + "epoch": 32.23846699787083, + "grad_norm": 4.991261959075928, + "learning_rate": 6.777473385379702e-05, + "loss": 0.007264332473278045, + "step": 227120 + }, + { + "epoch": 32.239886444286725, + "grad_norm": 8.985249519348145, + "learning_rate": 6.777331440738113e-05, + "loss": 0.02501608431339264, + "step": 227130 + }, + { + "epoch": 32.24130589070263, + "grad_norm": 9.249789237976074, + "learning_rate": 6.777189496096523e-05, + "loss": 0.010401606559753418, + "step": 227140 + }, + { + "epoch": 32.24272533711852, + "grad_norm": 0.9215807318687439, + "learning_rate": 6.777047551454932e-05, + "loss": 0.009038049727678299, + "step": 227150 + }, + { + "epoch": 32.24414478353442, + "grad_norm": 0.6717143654823303, + "learning_rate": 6.776905606813343e-05, + "loss": 0.002036646008491516, + "step": 227160 + }, + { + "epoch": 32.24556422995032, + "grad_norm": 0.01578451320528984, + "learning_rate": 6.776763662171753e-05, + "loss": 0.028053873777389528, + "step": 227170 + }, + { + "epoch": 32.24698367636622, + "grad_norm": 2.9214670658111572, 
+ "learning_rate": 6.776621717530164e-05, + "loss": 0.0038738112896680834, + "step": 227180 + }, + { + "epoch": 32.248403122782115, + "grad_norm": 0.06288395822048187, + "learning_rate": 6.776479772888574e-05, + "loss": 0.024388253688812256, + "step": 227190 + }, + { + "epoch": 32.24982256919801, + "grad_norm": 0.02276899665594101, + "learning_rate": 6.776337828246984e-05, + "loss": 0.01765856146812439, + "step": 227200 + }, + { + "epoch": 32.25124201561391, + "grad_norm": 23.803102493286133, + "learning_rate": 6.776195883605393e-05, + "loss": 0.02665036916732788, + "step": 227210 + }, + { + "epoch": 32.252661462029806, + "grad_norm": 0.3686148226261139, + "learning_rate": 6.776053938963805e-05, + "loss": 0.001989418640732765, + "step": 227220 + }, + { + "epoch": 32.25408090844571, + "grad_norm": 0.055271442979574203, + "learning_rate": 6.775911994322214e-05, + "loss": 0.01918162703514099, + "step": 227230 + }, + { + "epoch": 32.2555003548616, + "grad_norm": 0.008994427509605885, + "learning_rate": 6.775770049680625e-05, + "loss": 0.01659678518772125, + "step": 227240 + }, + { + "epoch": 32.256919801277505, + "grad_norm": 0.007449792232364416, + "learning_rate": 6.775628105039035e-05, + "loss": 0.0015644211322069167, + "step": 227250 + }, + { + "epoch": 32.2583392476934, + "grad_norm": 0.09163745492696762, + "learning_rate": 6.775486160397445e-05, + "loss": 0.0254619836807251, + "step": 227260 + }, + { + "epoch": 32.259758694109294, + "grad_norm": 0.10055192559957504, + "learning_rate": 6.775344215755856e-05, + "loss": 0.0034928373992443086, + "step": 227270 + }, + { + "epoch": 32.261178140525196, + "grad_norm": 0.05616561695933342, + "learning_rate": 6.775202271114266e-05, + "loss": 0.0011031094938516617, + "step": 227280 + }, + { + "epoch": 32.26259758694109, + "grad_norm": 0.0014462985564023256, + "learning_rate": 6.775060326472677e-05, + "loss": 0.0030206248164176943, + "step": 227290 + }, + { + "epoch": 32.26401703335699, + "grad_norm": 0.261479914188385, + 
"learning_rate": 6.774918381831085e-05, + "loss": 0.003044208139181137, + "step": 227300 + }, + { + "epoch": 32.26543647977289, + "grad_norm": 0.0035938210785388947, + "learning_rate": 6.774776437189496e-05, + "loss": 0.001812099665403366, + "step": 227310 + }, + { + "epoch": 32.26685592618879, + "grad_norm": 2.1015045642852783, + "learning_rate": 6.774634492547906e-05, + "loss": 0.007719796895980835, + "step": 227320 + }, + { + "epoch": 32.268275372604684, + "grad_norm": 0.3582167327404022, + "learning_rate": 6.774492547906317e-05, + "loss": 0.004690805077552795, + "step": 227330 + }, + { + "epoch": 32.26969481902058, + "grad_norm": 0.11420204490423203, + "learning_rate": 6.774350603264728e-05, + "loss": 0.004896505549550056, + "step": 227340 + }, + { + "epoch": 32.27111426543648, + "grad_norm": 1.2265373468399048, + "learning_rate": 6.774208658623137e-05, + "loss": 0.015039607882499695, + "step": 227350 + }, + { + "epoch": 32.272533711852375, + "grad_norm": 0.0007800370804034173, + "learning_rate": 6.774066713981548e-05, + "loss": 0.0012316405773162842, + "step": 227360 + }, + { + "epoch": 32.27395315826828, + "grad_norm": 0.007190863136202097, + "learning_rate": 6.773924769339957e-05, + "loss": 0.0007853962481021882, + "step": 227370 + }, + { + "epoch": 32.27537260468417, + "grad_norm": 0.003958662506192923, + "learning_rate": 6.773782824698368e-05, + "loss": 0.017862720787525176, + "step": 227380 + }, + { + "epoch": 32.276792051100074, + "grad_norm": 0.005294716916978359, + "learning_rate": 6.773640880056778e-05, + "loss": 0.027928057312965392, + "step": 227390 + }, + { + "epoch": 32.27821149751597, + "grad_norm": 6.227741718292236, + "learning_rate": 6.773498935415188e-05, + "loss": 0.008893126249313354, + "step": 227400 + }, + { + "epoch": 32.27963094393186, + "grad_norm": 0.06309053301811218, + "learning_rate": 6.773356990773598e-05, + "loss": 0.03918173313140869, + "step": 227410 + }, + { + "epoch": 32.281050390347765, + "grad_norm": 10.613203048706055, + 
"learning_rate": 6.773215046132009e-05, + "loss": 0.0217086061835289, + "step": 227420 + }, + { + "epoch": 32.28246983676366, + "grad_norm": 2.29934024810791, + "learning_rate": 6.773073101490419e-05, + "loss": 0.020522288978099823, + "step": 227430 + }, + { + "epoch": 32.28388928317956, + "grad_norm": 1.0432966947555542, + "learning_rate": 6.77293115684883e-05, + "loss": 0.0027814146131277086, + "step": 227440 + }, + { + "epoch": 32.28530872959546, + "grad_norm": 0.07523953169584274, + "learning_rate": 6.772789212207241e-05, + "loss": 0.0031500421464443207, + "step": 227450 + }, + { + "epoch": 32.28672817601136, + "grad_norm": 0.20080378651618958, + "learning_rate": 6.772647267565649e-05, + "loss": 0.024440935254096983, + "step": 227460 + }, + { + "epoch": 32.28814762242725, + "grad_norm": 0.059458743780851364, + "learning_rate": 6.77250532292406e-05, + "loss": 0.04724908173084259, + "step": 227470 + }, + { + "epoch": 32.28956706884315, + "grad_norm": 3.167478084564209, + "learning_rate": 6.77236337828247e-05, + "loss": 0.002858617156744003, + "step": 227480 + }, + { + "epoch": 32.29098651525905, + "grad_norm": 13.873939514160156, + "learning_rate": 6.772221433640881e-05, + "loss": 0.009082743525505066, + "step": 227490 + }, + { + "epoch": 32.292405961674945, + "grad_norm": 0.2479545772075653, + "learning_rate": 6.772079488999291e-05, + "loss": 0.010691935569047928, + "step": 227500 + }, + { + "epoch": 32.292405961674945, + "eval_accuracy": 0.9814967889616583, + "eval_loss": 0.08010502904653549, + "eval_runtime": 33.7574, + "eval_samples_per_second": 465.883, + "eval_steps_per_second": 14.575, + "step": 227500 + }, + { + "epoch": 32.29382540809085, + "grad_norm": 0.5075340270996094, + "learning_rate": 6.7719375443577e-05, + "loss": 0.010340392589569092, + "step": 227510 + }, + { + "epoch": 32.29524485450674, + "grad_norm": 0.021565062925219536, + "learning_rate": 6.77179559971611e-05, + "loss": 0.02045447677373886, + "step": 227520 + }, + { + "epoch": 
32.29666430092264, + "grad_norm": 0.062061652541160583, + "learning_rate": 6.771653655074521e-05, + "loss": 0.0029515191912651063, + "step": 227530 + }, + { + "epoch": 32.29808374733854, + "grad_norm": 3.0621190071105957, + "learning_rate": 6.771511710432932e-05, + "loss": 0.011473548412322999, + "step": 227540 + }, + { + "epoch": 32.29950319375443, + "grad_norm": 15.458075523376465, + "learning_rate": 6.771369765791342e-05, + "loss": 0.023530313372612, + "step": 227550 + }, + { + "epoch": 32.300922640170334, + "grad_norm": 8.01618480682373, + "learning_rate": 6.771227821149752e-05, + "loss": 0.0072765380144119264, + "step": 227560 + }, + { + "epoch": 32.30234208658623, + "grad_norm": 0.004135652910917997, + "learning_rate": 6.771085876508162e-05, + "loss": 0.011257041245698929, + "step": 227570 + }, + { + "epoch": 32.30376153300213, + "grad_norm": 1.4548581838607788, + "learning_rate": 6.770943931866573e-05, + "loss": 0.024643242359161377, + "step": 227580 + }, + { + "epoch": 32.305180979418026, + "grad_norm": 0.0316215418279171, + "learning_rate": 6.770801987224982e-05, + "loss": 0.010795360058546066, + "step": 227590 + }, + { + "epoch": 32.30660042583393, + "grad_norm": 0.02264237217605114, + "learning_rate": 6.770660042583394e-05, + "loss": 0.03683864176273346, + "step": 227600 + }, + { + "epoch": 32.30801987224982, + "grad_norm": 0.028490282595157623, + "learning_rate": 6.770518097941802e-05, + "loss": 0.04996927082538605, + "step": 227610 + }, + { + "epoch": 32.30943931866572, + "grad_norm": 0.4497067928314209, + "learning_rate": 6.770376153300213e-05, + "loss": 0.001808510720729828, + "step": 227620 + }, + { + "epoch": 32.31085876508162, + "grad_norm": 0.20218124985694885, + "learning_rate": 6.770234208658624e-05, + "loss": 0.013576823472976684, + "step": 227630 + }, + { + "epoch": 32.312278211497514, + "grad_norm": 0.19858773052692413, + "learning_rate": 6.770092264017034e-05, + "loss": 0.03481378853321075, + "step": 227640 + }, + { + "epoch": 
32.313697657913416, + "grad_norm": 0.03164186328649521, + "learning_rate": 6.769950319375445e-05, + "loss": 0.0010971792042255401, + "step": 227650 + }, + { + "epoch": 32.31511710432931, + "grad_norm": 0.7752857208251953, + "learning_rate": 6.769808374733853e-05, + "loss": 0.04529733955860138, + "step": 227660 + }, + { + "epoch": 32.31653655074521, + "grad_norm": 0.9886729121208191, + "learning_rate": 6.769666430092264e-05, + "loss": 0.047073495388031, + "step": 227670 + }, + { + "epoch": 32.31795599716111, + "grad_norm": 0.0945403128862381, + "learning_rate": 6.769524485450674e-05, + "loss": 0.03597712218761444, + "step": 227680 + }, + { + "epoch": 32.319375443577, + "grad_norm": 0.7908775210380554, + "learning_rate": 6.769382540809085e-05, + "loss": 0.012195146828889846, + "step": 227690 + }, + { + "epoch": 32.320794889992904, + "grad_norm": 0.08392732590436935, + "learning_rate": 6.769240596167495e-05, + "loss": 0.03931039273738861, + "step": 227700 + }, + { + "epoch": 32.3222143364088, + "grad_norm": 0.07122748345136642, + "learning_rate": 6.769098651525905e-05, + "loss": 0.0029064871370792387, + "step": 227710 + }, + { + "epoch": 32.3236337828247, + "grad_norm": 0.030743375420570374, + "learning_rate": 6.768956706884316e-05, + "loss": 0.009477080404758453, + "step": 227720 + }, + { + "epoch": 32.325053229240595, + "grad_norm": 0.23477473855018616, + "learning_rate": 6.768814762242726e-05, + "loss": 0.024989084899425508, + "step": 227730 + }, + { + "epoch": 32.3264726756565, + "grad_norm": 6.123174667358398, + "learning_rate": 6.768672817601137e-05, + "loss": 0.008365117013454437, + "step": 227740 + }, + { + "epoch": 32.32789212207239, + "grad_norm": 0.022556474432349205, + "learning_rate": 6.768530872959546e-05, + "loss": 0.009026828408241271, + "step": 227750 + }, + { + "epoch": 32.329311568488286, + "grad_norm": 0.05930907651782036, + "learning_rate": 6.768388928317956e-05, + "loss": 0.0033019714057445524, + "step": 227760 + }, + { + "epoch": 
32.33073101490419, + "grad_norm": 0.010127953253686428, + "learning_rate": 6.768246983676366e-05, + "loss": 0.0012955412268638611, + "step": 227770 + }, + { + "epoch": 32.33215046132008, + "grad_norm": 0.06372237950563431, + "learning_rate": 6.768105039034777e-05, + "loss": 0.008390715718269348, + "step": 227780 + }, + { + "epoch": 32.333569907735985, + "grad_norm": 11.254677772521973, + "learning_rate": 6.767963094393187e-05, + "loss": 0.0328385591506958, + "step": 227790 + }, + { + "epoch": 32.33498935415188, + "grad_norm": 0.9688782095909119, + "learning_rate": 6.767821149751598e-05, + "loss": 0.015755003690719603, + "step": 227800 + }, + { + "epoch": 32.33640880056778, + "grad_norm": 7.805362701416016, + "learning_rate": 6.767679205110008e-05, + "loss": 0.004446466267108917, + "step": 227810 + }, + { + "epoch": 32.337828246983676, + "grad_norm": 0.03059357963502407, + "learning_rate": 6.767537260468417e-05, + "loss": 0.005260283499956131, + "step": 227820 + }, + { + "epoch": 32.33924769339957, + "grad_norm": 1.1648472547531128, + "learning_rate": 6.767395315826828e-05, + "loss": 0.001507154479622841, + "step": 227830 + }, + { + "epoch": 32.34066713981547, + "grad_norm": 0.02689109928905964, + "learning_rate": 6.767253371185238e-05, + "loss": 0.005947951227426529, + "step": 227840 + }, + { + "epoch": 32.34208658623137, + "grad_norm": 8.782817840576172, + "learning_rate": 6.767111426543649e-05, + "loss": 0.007454285770654679, + "step": 227850 + }, + { + "epoch": 32.34350603264727, + "grad_norm": 0.02137431688606739, + "learning_rate": 6.766969481902059e-05, + "loss": 0.02134057432413101, + "step": 227860 + }, + { + "epoch": 32.344925479063164, + "grad_norm": 12.489258766174316, + "learning_rate": 6.766827537260469e-05, + "loss": 0.016687971353530884, + "step": 227870 + }, + { + "epoch": 32.346344925479066, + "grad_norm": 1.469437837600708, + "learning_rate": 6.766685592618878e-05, + "loss": 0.04476257562637329, + "step": 227880 + }, + { + "epoch": 
32.34776437189496, + "grad_norm": 0.5112425684928894, + "learning_rate": 6.76654364797729e-05, + "loss": 0.022267118096351624, + "step": 227890 + }, + { + "epoch": 32.349183818310856, + "grad_norm": 0.06943535059690475, + "learning_rate": 6.766401703335699e-05, + "loss": 0.05193442106246948, + "step": 227900 + }, + { + "epoch": 32.35060326472676, + "grad_norm": 0.6256040334701538, + "learning_rate": 6.76625975869411e-05, + "loss": 0.0032735299319028854, + "step": 227910 + }, + { + "epoch": 32.35202271114265, + "grad_norm": 0.06451132148504257, + "learning_rate": 6.76611781405252e-05, + "loss": 0.0011525850743055344, + "step": 227920 + }, + { + "epoch": 32.353442157558554, + "grad_norm": 2.958451986312866, + "learning_rate": 6.76597586941093e-05, + "loss": 0.02717694640159607, + "step": 227930 + }, + { + "epoch": 32.35486160397445, + "grad_norm": 15.756006240844727, + "learning_rate": 6.765833924769341e-05, + "loss": 0.02494056075811386, + "step": 227940 + }, + { + "epoch": 32.35628105039035, + "grad_norm": 0.15754926204681396, + "learning_rate": 6.76569198012775e-05, + "loss": 0.022916850447654725, + "step": 227950 + }, + { + "epoch": 32.357700496806245, + "grad_norm": 2.698223829269409, + "learning_rate": 6.765550035486162e-05, + "loss": 0.018903559446334837, + "step": 227960 + }, + { + "epoch": 32.35911994322214, + "grad_norm": 0.016407039016485214, + "learning_rate": 6.76540809084457e-05, + "loss": 0.011047527939081193, + "step": 227970 + }, + { + "epoch": 32.36053938963804, + "grad_norm": 0.05905010178685188, + "learning_rate": 6.765266146202981e-05, + "loss": 0.008999267965555191, + "step": 227980 + }, + { + "epoch": 32.36195883605394, + "grad_norm": 0.02000059373676777, + "learning_rate": 6.765124201561391e-05, + "loss": 0.04155224859714508, + "step": 227990 + }, + { + "epoch": 32.36337828246984, + "grad_norm": 4.761148452758789, + "learning_rate": 6.764982256919802e-05, + "loss": 0.04279707670211792, + "step": 228000 + }, + { + "epoch": 32.36337828246984, + 
"eval_accuracy": 0.9856933935270554, + "eval_loss": 0.0647951066493988, + "eval_runtime": 33.8057, + "eval_samples_per_second": 465.218, + "eval_steps_per_second": 14.554, + "step": 228000 + }, + { + "epoch": 32.36479772888573, + "grad_norm": 0.04677955061197281, + "learning_rate": 6.764840312278212e-05, + "loss": 0.0024467162787914277, + "step": 228010 + }, + { + "epoch": 32.366217175301635, + "grad_norm": 0.1450110673904419, + "learning_rate": 6.764698367636621e-05, + "loss": 0.0012122221291065217, + "step": 228020 + }, + { + "epoch": 32.36763662171753, + "grad_norm": 0.01674295775592327, + "learning_rate": 6.764556422995033e-05, + "loss": 0.008547592908143997, + "step": 228030 + }, + { + "epoch": 32.369056068133425, + "grad_norm": 1.6343581676483154, + "learning_rate": 6.764414478353442e-05, + "loss": 0.01282573640346527, + "step": 228040 + }, + { + "epoch": 32.37047551454933, + "grad_norm": 0.02405940741300583, + "learning_rate": 6.764272533711853e-05, + "loss": 0.002673574909567833, + "step": 228050 + }, + { + "epoch": 32.37189496096522, + "grad_norm": 19.897550582885742, + "learning_rate": 6.764130589070263e-05, + "loss": 0.035383996367454526, + "step": 228060 + }, + { + "epoch": 32.37331440738112, + "grad_norm": 0.12654224038124084, + "learning_rate": 6.763988644428673e-05, + "loss": 0.044627875089645386, + "step": 228070 + }, + { + "epoch": 32.37473385379702, + "grad_norm": 0.004290546290576458, + "learning_rate": 6.763846699787083e-05, + "loss": 0.0206047847867012, + "step": 228080 + }, + { + "epoch": 32.37615330021292, + "grad_norm": 0.1409478634595871, + "learning_rate": 6.763704755145494e-05, + "loss": 0.07216464281082154, + "step": 228090 + }, + { + "epoch": 32.377572746628815, + "grad_norm": 0.02848219871520996, + "learning_rate": 6.763562810503903e-05, + "loss": 0.003207835927605629, + "step": 228100 + }, + { + "epoch": 32.37899219304471, + "grad_norm": 0.24600884318351746, + "learning_rate": 6.763420865862315e-05, + "loss": 0.03827841877937317, + 
"step": 228110 + }, + { + "epoch": 32.38041163946061, + "grad_norm": 0.31802183389663696, + "learning_rate": 6.763278921220724e-05, + "loss": 0.005123189836740494, + "step": 228120 + }, + { + "epoch": 32.381831085876506, + "grad_norm": 0.04005808010697365, + "learning_rate": 6.763136976579134e-05, + "loss": 0.005430297553539276, + "step": 228130 + }, + { + "epoch": 32.38325053229241, + "grad_norm": 0.608680009841919, + "learning_rate": 6.763009226401704e-05, + "loss": 0.010631616413593292, + "step": 228140 + }, + { + "epoch": 32.3846699787083, + "grad_norm": 0.010335426777601242, + "learning_rate": 6.762867281760114e-05, + "loss": 0.036104637384414676, + "step": 228150 + }, + { + "epoch": 32.386089425124204, + "grad_norm": 0.16962112486362457, + "learning_rate": 6.762725337118523e-05, + "loss": 0.005247683823108673, + "step": 228160 + }, + { + "epoch": 32.3875088715401, + "grad_norm": 7.886364936828613, + "learning_rate": 6.762583392476934e-05, + "loss": 0.024862556159496306, + "step": 228170 + }, + { + "epoch": 32.388928317955994, + "grad_norm": 6.093403339385986, + "learning_rate": 6.762441447835344e-05, + "loss": 0.024755634367465973, + "step": 228180 + }, + { + "epoch": 32.390347764371896, + "grad_norm": 0.12247197329998016, + "learning_rate": 6.762299503193755e-05, + "loss": 0.0030530963093042375, + "step": 228190 + }, + { + "epoch": 32.39176721078779, + "grad_norm": 0.12136774510145187, + "learning_rate": 6.762157558552165e-05, + "loss": 0.02287091612815857, + "step": 228200 + }, + { + "epoch": 32.39318665720369, + "grad_norm": 0.07281627506017685, + "learning_rate": 6.762015613910575e-05, + "loss": 0.012600746750831605, + "step": 228210 + }, + { + "epoch": 32.39460610361959, + "grad_norm": 5.197438716888428, + "learning_rate": 6.761873669268986e-05, + "loss": 0.003877962753176689, + "step": 228220 + }, + { + "epoch": 32.39602555003549, + "grad_norm": 0.022315802052617073, + "learning_rate": 6.761731724627396e-05, + "loss": 0.0014178488403558732, + "step": 
228230 + }, + { + "epoch": 32.397444996451384, + "grad_norm": 0.2400563359260559, + "learning_rate": 6.761589779985807e-05, + "loss": 0.03126126825809479, + "step": 228240 + }, + { + "epoch": 32.39886444286728, + "grad_norm": 0.01292402669787407, + "learning_rate": 6.761447835344215e-05, + "loss": 0.0014435343444347381, + "step": 228250 + }, + { + "epoch": 32.40028388928318, + "grad_norm": 0.014890138059854507, + "learning_rate": 6.761305890702626e-05, + "loss": 0.00864144042134285, + "step": 228260 + }, + { + "epoch": 32.401703335699075, + "grad_norm": 0.03969304636120796, + "learning_rate": 6.761163946061036e-05, + "loss": 0.010965158045291901, + "step": 228270 + }, + { + "epoch": 32.40312278211498, + "grad_norm": 1.7562295198440552, + "learning_rate": 6.761022001419447e-05, + "loss": 0.01308256983757019, + "step": 228280 + }, + { + "epoch": 32.40454222853087, + "grad_norm": 0.4444243013858795, + "learning_rate": 6.760880056777858e-05, + "loss": 0.021709375083446503, + "step": 228290 + }, + { + "epoch": 32.405961674946774, + "grad_norm": 0.04428355395793915, + "learning_rate": 6.760738112136266e-05, + "loss": 0.0008758913725614547, + "step": 228300 + }, + { + "epoch": 32.40738112136267, + "grad_norm": 0.729003369808197, + "learning_rate": 6.760596167494678e-05, + "loss": 0.05409917235374451, + "step": 228310 + }, + { + "epoch": 32.40880056777856, + "grad_norm": 0.36538851261138916, + "learning_rate": 6.760454222853087e-05, + "loss": 0.010304288566112518, + "step": 228320 + }, + { + "epoch": 32.410220014194465, + "grad_norm": 0.4610559344291687, + "learning_rate": 6.760312278211498e-05, + "loss": 0.016494973003864287, + "step": 228330 + }, + { + "epoch": 32.41163946061036, + "grad_norm": 1.9190986156463623, + "learning_rate": 6.760170333569908e-05, + "loss": 0.01858416050672531, + "step": 228340 + }, + { + "epoch": 32.41305890702626, + "grad_norm": 0.323722243309021, + "learning_rate": 6.760028388928318e-05, + "loss": 0.012614800035953522, + "step": 228350 + }, + 
{ + "epoch": 32.414478353442156, + "grad_norm": 8.763099670410156, + "learning_rate": 6.759886444286728e-05, + "loss": 0.06558595299720764, + "step": 228360 + }, + { + "epoch": 32.41589779985806, + "grad_norm": 12.192046165466309, + "learning_rate": 6.759744499645139e-05, + "loss": 0.02629726827144623, + "step": 228370 + }, + { + "epoch": 32.41731724627395, + "grad_norm": 0.07630567997694016, + "learning_rate": 6.75960255500355e-05, + "loss": 0.005735642835497856, + "step": 228380 + }, + { + "epoch": 32.41873669268985, + "grad_norm": 2.305961847305298, + "learning_rate": 6.75946061036196e-05, + "loss": 0.011921775341033936, + "step": 228390 + }, + { + "epoch": 32.42015613910575, + "grad_norm": 0.40924063324928284, + "learning_rate": 6.759318665720369e-05, + "loss": 0.05728234648704529, + "step": 228400 + }, + { + "epoch": 32.421575585521644, + "grad_norm": 0.5826217532157898, + "learning_rate": 6.759176721078779e-05, + "loss": 0.016080982983112335, + "step": 228410 + }, + { + "epoch": 32.422995031937546, + "grad_norm": 0.6857572197914124, + "learning_rate": 6.75903477643719e-05, + "loss": 0.013486593961715698, + "step": 228420 + }, + { + "epoch": 32.42441447835344, + "grad_norm": 0.5240159034729004, + "learning_rate": 6.7588928317956e-05, + "loss": 0.029421159625053407, + "step": 228430 + }, + { + "epoch": 32.42583392476934, + "grad_norm": 5.623213291168213, + "learning_rate": 6.758750887154011e-05, + "loss": 0.02402797043323517, + "step": 228440 + }, + { + "epoch": 32.42725337118524, + "grad_norm": 0.07374012470245361, + "learning_rate": 6.758608942512419e-05, + "loss": 0.013141918182373046, + "step": 228450 + }, + { + "epoch": 32.42867281760113, + "grad_norm": 0.041638847440481186, + "learning_rate": 6.75846699787083e-05, + "loss": 0.003548675775527954, + "step": 228460 + }, + { + "epoch": 32.430092264017034, + "grad_norm": 0.8819023370742798, + "learning_rate": 6.758325053229241e-05, + "loss": 0.015502172708511352, + "step": 228470 + }, + { + "epoch": 
32.43151171043293, + "grad_norm": 0.37147173285484314, + "learning_rate": 6.758183108587651e-05, + "loss": 0.0028808791190385818, + "step": 228480 + }, + { + "epoch": 32.43293115684883, + "grad_norm": 0.1811521202325821, + "learning_rate": 6.758041163946062e-05, + "loss": 0.005864652991294861, + "step": 228490 + }, + { + "epoch": 32.434350603264726, + "grad_norm": 0.02258164994418621, + "learning_rate": 6.757899219304472e-05, + "loss": 0.014680463075637817, + "step": 228500 + }, + { + "epoch": 32.434350603264726, + "eval_accuracy": 0.9867107522095759, + "eval_loss": 0.05285225436091423, + "eval_runtime": 34.2881, + "eval_samples_per_second": 458.672, + "eval_steps_per_second": 14.349, + "step": 228500 + }, + { + "epoch": 32.43577004968063, + "grad_norm": 0.007231009192764759, + "learning_rate": 6.757757274662882e-05, + "loss": 0.0033688973635435104, + "step": 228510 + }, + { + "epoch": 32.43718949609652, + "grad_norm": 0.07019180059432983, + "learning_rate": 6.757615330021292e-05, + "loss": 0.0077016264200210575, + "step": 228520 + }, + { + "epoch": 32.43860894251242, + "grad_norm": 4.379995346069336, + "learning_rate": 6.757473385379703e-05, + "loss": 0.002343284711241722, + "step": 228530 + }, + { + "epoch": 32.44002838892832, + "grad_norm": 5.343328952789307, + "learning_rate": 6.757331440738112e-05, + "loss": 0.004488516598939896, + "step": 228540 + }, + { + "epoch": 32.44144783534421, + "grad_norm": 0.097828708589077, + "learning_rate": 6.757189496096523e-05, + "loss": 0.030615192651748658, + "step": 228550 + }, + { + "epoch": 32.442867281760115, + "grad_norm": 0.9162726402282715, + "learning_rate": 6.757047551454933e-05, + "loss": 0.025946620106697082, + "step": 228560 + }, + { + "epoch": 32.44428672817601, + "grad_norm": 0.05263305455446243, + "learning_rate": 6.756905606813343e-05, + "loss": 0.052242130041122437, + "step": 228570 + }, + { + "epoch": 32.44570617459191, + "grad_norm": 0.0456642247736454, + "learning_rate": 6.756763662171754e-05, + "loss": 
0.034745055437088015, + "step": 228580 + }, + { + "epoch": 32.44712562100781, + "grad_norm": 0.018306411802768707, + "learning_rate": 6.756621717530164e-05, + "loss": 0.03233112692832947, + "step": 228590 + }, + { + "epoch": 32.4485450674237, + "grad_norm": 3.446342945098877, + "learning_rate": 6.756479772888575e-05, + "loss": 0.01495877057313919, + "step": 228600 + }, + { + "epoch": 32.4499645138396, + "grad_norm": 0.033757373690605164, + "learning_rate": 6.756337828246983e-05, + "loss": 0.03143808543682099, + "step": 228610 + }, + { + "epoch": 32.4513839602555, + "grad_norm": 0.23406028747558594, + "learning_rate": 6.756195883605394e-05, + "loss": 0.002052842453122139, + "step": 228620 + }, + { + "epoch": 32.4528034066714, + "grad_norm": 0.8516905903816223, + "learning_rate": 6.756053938963804e-05, + "loss": 0.0010334018617868424, + "step": 228630 + }, + { + "epoch": 32.454222853087295, + "grad_norm": 0.0809004008769989, + "learning_rate": 6.755911994322215e-05, + "loss": 0.003334370627999306, + "step": 228640 + }, + { + "epoch": 32.4556422995032, + "grad_norm": 0.2099735289812088, + "learning_rate": 6.755770049680625e-05, + "loss": 0.01036931574344635, + "step": 228650 + }, + { + "epoch": 32.45706174591909, + "grad_norm": 0.20453427731990814, + "learning_rate": 6.755628105039035e-05, + "loss": 0.004026395455002785, + "step": 228660 + }, + { + "epoch": 32.458481192334986, + "grad_norm": 0.30600693821907043, + "learning_rate": 6.755486160397446e-05, + "loss": 0.011128799617290496, + "step": 228670 + }, + { + "epoch": 32.45990063875089, + "grad_norm": 0.019511636346578598, + "learning_rate": 6.755344215755855e-05, + "loss": 0.0028480660170316697, + "step": 228680 + }, + { + "epoch": 32.46132008516678, + "grad_norm": 0.13183800876140594, + "learning_rate": 6.755202271114267e-05, + "loss": 0.015529140830039978, + "step": 228690 + }, + { + "epoch": 32.462739531582685, + "grad_norm": 1.259804129600525, + "learning_rate": 6.755060326472676e-05, + "loss": 
0.008653013408184052, + "step": 228700 + }, + { + "epoch": 32.46415897799858, + "grad_norm": 0.006522087380290031, + "learning_rate": 6.754918381831086e-05, + "loss": 0.0017131742089986802, + "step": 228710 + }, + { + "epoch": 32.46557842441448, + "grad_norm": 0.5121077299118042, + "learning_rate": 6.754776437189496e-05, + "loss": 0.005044599995017052, + "step": 228720 + }, + { + "epoch": 32.466997870830376, + "grad_norm": 0.02302316203713417, + "learning_rate": 6.754634492547907e-05, + "loss": 0.004867821559309959, + "step": 228730 + }, + { + "epoch": 32.46841731724627, + "grad_norm": 3.0244956016540527, + "learning_rate": 6.754492547906317e-05, + "loss": 0.0020168837159872054, + "step": 228740 + }, + { + "epoch": 32.46983676366217, + "grad_norm": 0.3158857822418213, + "learning_rate": 6.754350603264728e-05, + "loss": 0.014676621556282044, + "step": 228750 + }, + { + "epoch": 32.47125621007807, + "grad_norm": 0.04764304682612419, + "learning_rate": 6.754208658623137e-05, + "loss": 0.019751401245594026, + "step": 228760 + }, + { + "epoch": 32.47267565649397, + "grad_norm": 0.010971294716000557, + "learning_rate": 6.754066713981547e-05, + "loss": 0.015144042670726776, + "step": 228770 + }, + { + "epoch": 32.474095102909864, + "grad_norm": 0.07485675811767578, + "learning_rate": 6.753924769339958e-05, + "loss": 0.005256921052932739, + "step": 228780 + }, + { + "epoch": 32.475514549325766, + "grad_norm": 0.03224635496735573, + "learning_rate": 6.753782824698368e-05, + "loss": 0.004273584857583046, + "step": 228790 + }, + { + "epoch": 32.47693399574166, + "grad_norm": 10.965764045715332, + "learning_rate": 6.753640880056779e-05, + "loss": 0.007052581757307053, + "step": 228800 + }, + { + "epoch": 32.478353442157555, + "grad_norm": 0.33855903148651123, + "learning_rate": 6.753498935415187e-05, + "loss": 0.010540449619293213, + "step": 228810 + }, + { + "epoch": 32.47977288857346, + "grad_norm": 0.016043735668063164, + "learning_rate": 6.753356990773599e-05, + "loss": 
0.0009699732065200806, + "step": 228820 + }, + { + "epoch": 32.48119233498935, + "grad_norm": 2.0261361598968506, + "learning_rate": 6.753215046132008e-05, + "loss": 0.0013237256556749345, + "step": 228830 + }, + { + "epoch": 32.482611781405254, + "grad_norm": 6.038012981414795, + "learning_rate": 6.75307310149042e-05, + "loss": 0.01433452069759369, + "step": 228840 + }, + { + "epoch": 32.48403122782115, + "grad_norm": 6.848779201507568, + "learning_rate": 6.752931156848829e-05, + "loss": 0.01569778025150299, + "step": 228850 + }, + { + "epoch": 32.48545067423705, + "grad_norm": 7.495294570922852, + "learning_rate": 6.75278921220724e-05, + "loss": 0.009149719774723054, + "step": 228860 + }, + { + "epoch": 32.486870120652945, + "grad_norm": 0.019819092005491257, + "learning_rate": 6.75264726756565e-05, + "loss": 0.008754295855760574, + "step": 228870 + }, + { + "epoch": 32.48828956706884, + "grad_norm": 0.013119860552251339, + "learning_rate": 6.75250532292406e-05, + "loss": 0.013514451682567596, + "step": 228880 + }, + { + "epoch": 32.48970901348474, + "grad_norm": 0.5483527779579163, + "learning_rate": 6.752363378282471e-05, + "loss": 0.028198054432868956, + "step": 228890 + }, + { + "epoch": 32.49112845990064, + "grad_norm": 1.6966410875320435, + "learning_rate": 6.75222143364088e-05, + "loss": 0.022334070503711702, + "step": 228900 + }, + { + "epoch": 32.49254790631654, + "grad_norm": 0.03143806755542755, + "learning_rate": 6.752079488999292e-05, + "loss": 0.0029576733708381654, + "step": 228910 + }, + { + "epoch": 32.49396735273243, + "grad_norm": 0.544014573097229, + "learning_rate": 6.7519375443577e-05, + "loss": 0.01594178080558777, + "step": 228920 + }, + { + "epoch": 32.495386799148335, + "grad_norm": 11.814414024353027, + "learning_rate": 6.751795599716111e-05, + "loss": 0.031930530071258546, + "step": 228930 + }, + { + "epoch": 32.49680624556423, + "grad_norm": 0.27639931440353394, + "learning_rate": 6.751653655074521e-05, + "loss": 
0.0024049948900938035, + "step": 228940 + }, + { + "epoch": 32.498225691980124, + "grad_norm": 10.267918586730957, + "learning_rate": 6.751511710432932e-05, + "loss": 0.01639072299003601, + "step": 228950 + }, + { + "epoch": 32.499645138396026, + "grad_norm": 0.11040349304676056, + "learning_rate": 6.751369765791342e-05, + "loss": 0.01325320303440094, + "step": 228960 + }, + { + "epoch": 32.50106458481192, + "grad_norm": 1.444700837135315, + "learning_rate": 6.751227821149751e-05, + "loss": 0.04753404259681702, + "step": 228970 + }, + { + "epoch": 32.50248403122782, + "grad_norm": 1.1246538162231445, + "learning_rate": 6.751085876508162e-05, + "loss": 0.007898826897144318, + "step": 228980 + }, + { + "epoch": 32.50390347764372, + "grad_norm": 0.013423698022961617, + "learning_rate": 6.750943931866572e-05, + "loss": 0.001518493890762329, + "step": 228990 + }, + { + "epoch": 32.50532292405962, + "grad_norm": 0.010352909564971924, + "learning_rate": 6.750801987224983e-05, + "loss": 0.0007946062833070755, + "step": 229000 + }, + { + "epoch": 32.50532292405962, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.061502307653427124, + "eval_runtime": 33.8629, + "eval_samples_per_second": 464.431, + "eval_steps_per_second": 14.529, + "step": 229000 + }, + { + "epoch": 32.506742370475514, + "grad_norm": 17.54646873474121, + "learning_rate": 6.750660042583393e-05, + "loss": 0.039235925674438475, + "step": 229010 + }, + { + "epoch": 32.50816181689141, + "grad_norm": 0.04597978666424751, + "learning_rate": 6.750518097941803e-05, + "loss": 0.01882010102272034, + "step": 229020 + }, + { + "epoch": 32.50958126330731, + "grad_norm": 5.5355916023254395, + "learning_rate": 6.750376153300213e-05, + "loss": 0.032336241006851195, + "step": 229030 + }, + { + "epoch": 32.511000709723206, + "grad_norm": 0.15653273463249207, + "learning_rate": 6.750234208658624e-05, + "loss": 0.025763991475105285, + "step": 229040 + }, + { + "epoch": 32.51242015613911, + "grad_norm": 
2.074625015258789, + "learning_rate": 6.750092264017033e-05, + "loss": 0.004245015606284141, + "step": 229050 + }, + { + "epoch": 32.513839602555, + "grad_norm": 2.186718225479126, + "learning_rate": 6.749950319375444e-05, + "loss": 0.03725409209728241, + "step": 229060 + }, + { + "epoch": 32.515259048970904, + "grad_norm": 0.06504274159669876, + "learning_rate": 6.749808374733854e-05, + "loss": 0.008576406538486481, + "step": 229070 + }, + { + "epoch": 32.5166784953868, + "grad_norm": 0.016747912392020226, + "learning_rate": 6.749666430092264e-05, + "loss": 0.0016852028667926789, + "step": 229080 + }, + { + "epoch": 32.518097941802694, + "grad_norm": 0.037762485444545746, + "learning_rate": 6.749524485450675e-05, + "loss": 0.01922301948070526, + "step": 229090 + }, + { + "epoch": 32.519517388218595, + "grad_norm": 5.243544101715088, + "learning_rate": 6.749382540809085e-05, + "loss": 0.013619337975978852, + "step": 229100 + }, + { + "epoch": 32.52093683463449, + "grad_norm": 0.4673132002353668, + "learning_rate": 6.749240596167496e-05, + "loss": 0.019435304403305053, + "step": 229110 + }, + { + "epoch": 32.52235628105039, + "grad_norm": 0.06780438125133514, + "learning_rate": 6.749098651525904e-05, + "loss": 0.031889355182647704, + "step": 229120 + }, + { + "epoch": 32.52377572746629, + "grad_norm": 0.03425315394997597, + "learning_rate": 6.748956706884315e-05, + "loss": 0.0010838449001312257, + "step": 229130 + }, + { + "epoch": 32.52519517388219, + "grad_norm": 0.12657800316810608, + "learning_rate": 6.748814762242725e-05, + "loss": 0.014370493590831757, + "step": 229140 + }, + { + "epoch": 32.52661462029808, + "grad_norm": 8.048868179321289, + "learning_rate": 6.748672817601136e-05, + "loss": 0.004952149838209153, + "step": 229150 + }, + { + "epoch": 32.52803406671398, + "grad_norm": 16.968772888183594, + "learning_rate": 6.748530872959546e-05, + "loss": 0.05705510973930359, + "step": 229160 + }, + { + "epoch": 32.52945351312988, + "grad_norm": 
3.7434425354003906, + "learning_rate": 6.748388928317956e-05, + "loss": 0.0026023328304290772, + "step": 229170 + }, + { + "epoch": 32.530872959545775, + "grad_norm": 1.7936410903930664, + "learning_rate": 6.748246983676367e-05, + "loss": 0.001976187527179718, + "step": 229180 + }, + { + "epoch": 32.53229240596168, + "grad_norm": 2.689037322998047, + "learning_rate": 6.748105039034776e-05, + "loss": 0.0025141272693872453, + "step": 229190 + }, + { + "epoch": 32.53371185237757, + "grad_norm": 0.06508883833885193, + "learning_rate": 6.747963094393188e-05, + "loss": 0.0017630167305469513, + "step": 229200 + }, + { + "epoch": 32.53513129879347, + "grad_norm": 4.06475830078125, + "learning_rate": 6.747821149751597e-05, + "loss": 0.01314859539270401, + "step": 229210 + }, + { + "epoch": 32.53655074520937, + "grad_norm": 0.035527270287275314, + "learning_rate": 6.747679205110008e-05, + "loss": 0.014146934449672698, + "step": 229220 + }, + { + "epoch": 32.53797019162526, + "grad_norm": 0.13337914645671844, + "learning_rate": 6.747537260468417e-05, + "loss": 0.01753297746181488, + "step": 229230 + }, + { + "epoch": 32.539389638041165, + "grad_norm": 1.4026387929916382, + "learning_rate": 6.747395315826828e-05, + "loss": 0.02159238010644913, + "step": 229240 + }, + { + "epoch": 32.54080908445706, + "grad_norm": 7.431717395782471, + "learning_rate": 6.747253371185238e-05, + "loss": 0.012537233531475067, + "step": 229250 + }, + { + "epoch": 32.54222853087296, + "grad_norm": 2.094836950302124, + "learning_rate": 6.747111426543649e-05, + "loss": 0.0299485445022583, + "step": 229260 + }, + { + "epoch": 32.543647977288856, + "grad_norm": 0.014431372284889221, + "learning_rate": 6.746969481902058e-05, + "loss": 0.0020636335015296937, + "step": 229270 + }, + { + "epoch": 32.54506742370476, + "grad_norm": 5.154273986816406, + "learning_rate": 6.746827537260468e-05, + "loss": 0.01795615702867508, + "step": 229280 + }, + { + "epoch": 32.54648687012065, + "grad_norm": 
0.007334401365369558, + "learning_rate": 6.746685592618879e-05, + "loss": 0.020967456698417663, + "step": 229290 + }, + { + "epoch": 32.54790631653655, + "grad_norm": 0.26880526542663574, + "learning_rate": 6.746543647977289e-05, + "loss": 0.04667791426181793, + "step": 229300 + }, + { + "epoch": 32.54932576295245, + "grad_norm": 7.054488182067871, + "learning_rate": 6.7464017033357e-05, + "loss": 0.0025762926787137987, + "step": 229310 + }, + { + "epoch": 32.550745209368344, + "grad_norm": 0.1871930956840515, + "learning_rate": 6.74625975869411e-05, + "loss": 0.005865605920553208, + "step": 229320 + }, + { + "epoch": 32.552164655784246, + "grad_norm": 0.27339187264442444, + "learning_rate": 6.74611781405252e-05, + "loss": 0.038812240958213805, + "step": 229330 + }, + { + "epoch": 32.55358410220014, + "grad_norm": 27.47489356994629, + "learning_rate": 6.745975869410929e-05, + "loss": 0.017351204156875612, + "step": 229340 + }, + { + "epoch": 32.55500354861604, + "grad_norm": 0.02372206188738346, + "learning_rate": 6.74583392476934e-05, + "loss": 0.010866829752922058, + "step": 229350 + }, + { + "epoch": 32.55642299503194, + "grad_norm": 0.07493670284748077, + "learning_rate": 6.74569198012775e-05, + "loss": 0.006163535267114639, + "step": 229360 + }, + { + "epoch": 32.55784244144783, + "grad_norm": 0.05554557219147682, + "learning_rate": 6.745550035486161e-05, + "loss": 0.0027071300894021986, + "step": 229370 + }, + { + "epoch": 32.559261887863734, + "grad_norm": 0.2487206757068634, + "learning_rate": 6.745408090844571e-05, + "loss": 0.0023162011057138443, + "step": 229380 + }, + { + "epoch": 32.56068133427963, + "grad_norm": 0.004434707574546337, + "learning_rate": 6.745266146202981e-05, + "loss": 0.013143643736839294, + "step": 229390 + }, + { + "epoch": 32.56210078069553, + "grad_norm": 0.4935588240623474, + "learning_rate": 6.745124201561392e-05, + "loss": 0.005409666895866394, + "step": 229400 + }, + { + "epoch": 32.563520227111425, + "grad_norm": 
0.12315703928470612, + "learning_rate": 6.744982256919802e-05, + "loss": 0.0030065789818763734, + "step": 229410 + }, + { + "epoch": 32.56493967352733, + "grad_norm": 0.018055645748972893, + "learning_rate": 6.744840312278213e-05, + "loss": 0.0007707826793193817, + "step": 229420 + }, + { + "epoch": 32.56635911994322, + "grad_norm": 22.14475440979004, + "learning_rate": 6.744698367636621e-05, + "loss": 0.010454246401786804, + "step": 229430 + }, + { + "epoch": 32.56777856635912, + "grad_norm": 0.9014254212379456, + "learning_rate": 6.744556422995032e-05, + "loss": 0.0011354818940162658, + "step": 229440 + }, + { + "epoch": 32.56919801277502, + "grad_norm": 0.014854700304567814, + "learning_rate": 6.744414478353442e-05, + "loss": 0.033128181099891664, + "step": 229450 + }, + { + "epoch": 32.57061745919091, + "grad_norm": 17.92057228088379, + "learning_rate": 6.744272533711853e-05, + "loss": 0.012529285252094268, + "step": 229460 + }, + { + "epoch": 32.572036905606815, + "grad_norm": 5.200765609741211, + "learning_rate": 6.744130589070263e-05, + "loss": 0.05218632817268372, + "step": 229470 + }, + { + "epoch": 32.57345635202271, + "grad_norm": 0.15613718330860138, + "learning_rate": 6.743988644428672e-05, + "loss": 0.025765004754066467, + "step": 229480 + }, + { + "epoch": 32.57487579843861, + "grad_norm": 0.022275427356362343, + "learning_rate": 6.743846699787083e-05, + "loss": 0.0022942434996366503, + "step": 229490 + }, + { + "epoch": 32.57629524485451, + "grad_norm": 0.759853184223175, + "learning_rate": 6.743704755145493e-05, + "loss": 0.03849336206912994, + "step": 229500 + }, + { + "epoch": 32.57629524485451, + "eval_accuracy": 0.9785718827494119, + "eval_loss": 0.09146282076835632, + "eval_runtime": 33.8414, + "eval_samples_per_second": 464.727, + "eval_steps_per_second": 14.538, + "step": 229500 + }, + { + "epoch": 32.5777146912704, + "grad_norm": 0.06368359923362732, + "learning_rate": 6.743562810503904e-05, + "loss": 0.05640682578086853, + "step": 229510 + 
}, + { + "epoch": 32.5791341376863, + "grad_norm": 10.098808288574219, + "learning_rate": 6.743420865862314e-05, + "loss": 0.016964754462242125, + "step": 229520 + }, + { + "epoch": 32.5805535841022, + "grad_norm": 0.007689241319894791, + "learning_rate": 6.743278921220724e-05, + "loss": 0.008618653565645219, + "step": 229530 + }, + { + "epoch": 32.5819730305181, + "grad_norm": 0.04808669164776802, + "learning_rate": 6.743136976579134e-05, + "loss": 0.0004969310015439987, + "step": 229540 + }, + { + "epoch": 32.583392476933994, + "grad_norm": 0.07033667713403702, + "learning_rate": 6.742995031937545e-05, + "loss": 0.015263874828815461, + "step": 229550 + }, + { + "epoch": 32.584811923349896, + "grad_norm": 0.2035844773054123, + "learning_rate": 6.742853087295954e-05, + "loss": 0.012534555792808533, + "step": 229560 + }, + { + "epoch": 32.58623136976579, + "grad_norm": 0.048522308468818665, + "learning_rate": 6.742711142654365e-05, + "loss": 0.0054033294320106505, + "step": 229570 + }, + { + "epoch": 32.587650816181686, + "grad_norm": 0.023666007444262505, + "learning_rate": 6.742569198012775e-05, + "loss": 0.006664180755615234, + "step": 229580 + }, + { + "epoch": 32.58907026259759, + "grad_norm": 0.7024424076080322, + "learning_rate": 6.742427253371185e-05, + "loss": 0.004219807311892509, + "step": 229590 + }, + { + "epoch": 32.59048970901348, + "grad_norm": 7.908431529998779, + "learning_rate": 6.742285308729596e-05, + "loss": 0.004080007597804069, + "step": 229600 + }, + { + "epoch": 32.591909155429384, + "grad_norm": 0.03177419677376747, + "learning_rate": 6.742143364088006e-05, + "loss": 0.008294917643070221, + "step": 229610 + }, + { + "epoch": 32.59332860184528, + "grad_norm": 0.032971788197755814, + "learning_rate": 6.742001419446417e-05, + "loss": 0.0009228859096765518, + "step": 229620 + }, + { + "epoch": 32.59474804826118, + "grad_norm": 1.1993937492370605, + "learning_rate": 6.741859474804827e-05, + "loss": 0.02713583707809448, + "step": 229630 + }, + { 
+ "epoch": 32.596167494677076, + "grad_norm": 0.43226516246795654, + "learning_rate": 6.741717530163236e-05, + "loss": 0.01599059849977493, + "step": 229640 + }, + { + "epoch": 32.59758694109297, + "grad_norm": 9.288530349731445, + "learning_rate": 6.741575585521646e-05, + "loss": 0.010985949635505676, + "step": 229650 + }, + { + "epoch": 32.59900638750887, + "grad_norm": 0.015751026570796967, + "learning_rate": 6.741433640880057e-05, + "loss": 0.0012706160545349122, + "step": 229660 + }, + { + "epoch": 32.60042583392477, + "grad_norm": 0.3139137923717499, + "learning_rate": 6.741291696238467e-05, + "loss": 0.010555453598499298, + "step": 229670 + }, + { + "epoch": 32.60184528034067, + "grad_norm": 0.16862639784812927, + "learning_rate": 6.741149751596878e-05, + "loss": 0.001846550405025482, + "step": 229680 + }, + { + "epoch": 32.603264726756564, + "grad_norm": 0.23515217006206512, + "learning_rate": 6.741007806955288e-05, + "loss": 0.02824563980102539, + "step": 229690 + }, + { + "epoch": 32.604684173172465, + "grad_norm": 0.24600635468959808, + "learning_rate": 6.740865862313697e-05, + "loss": 0.020244674384593965, + "step": 229700 + }, + { + "epoch": 32.60610361958836, + "grad_norm": 0.29075828194618225, + "learning_rate": 6.740723917672109e-05, + "loss": 0.0030585888773202894, + "step": 229710 + }, + { + "epoch": 32.607523066004255, + "grad_norm": 17.400236129760742, + "learning_rate": 6.740581973030518e-05, + "loss": 0.031883764266967776, + "step": 229720 + }, + { + "epoch": 32.60894251242016, + "grad_norm": 16.481021881103516, + "learning_rate": 6.74044002838893e-05, + "loss": 0.014023074507713318, + "step": 229730 + }, + { + "epoch": 32.61036195883605, + "grad_norm": 0.022958939895033836, + "learning_rate": 6.740298083747338e-05, + "loss": 0.0011251475661993027, + "step": 229740 + }, + { + "epoch": 32.61178140525195, + "grad_norm": 0.021372072398662567, + "learning_rate": 6.740156139105749e-05, + "loss": 0.0012654218822717666, + "step": 229750 + }, + { + 
"epoch": 32.61320085166785, + "grad_norm": 0.25391215085983276, + "learning_rate": 6.740014194464159e-05, + "loss": 0.03843848407268524, + "step": 229760 + }, + { + "epoch": 32.61462029808375, + "grad_norm": 0.158311665058136, + "learning_rate": 6.73987224982257e-05, + "loss": 0.008335818350315095, + "step": 229770 + }, + { + "epoch": 32.616039744499645, + "grad_norm": 0.024364599958062172, + "learning_rate": 6.739730305180981e-05, + "loss": 0.0012691326439380646, + "step": 229780 + }, + { + "epoch": 32.61745919091554, + "grad_norm": 0.7214038372039795, + "learning_rate": 6.739588360539389e-05, + "loss": 0.013850057125091552, + "step": 229790 + }, + { + "epoch": 32.61887863733144, + "grad_norm": 4.130451679229736, + "learning_rate": 6.7394464158978e-05, + "loss": 0.028388482332229615, + "step": 229800 + }, + { + "epoch": 32.620298083747336, + "grad_norm": 8.130624771118164, + "learning_rate": 6.73930447125621e-05, + "loss": 0.008559781312942504, + "step": 229810 + }, + { + "epoch": 32.62171753016324, + "grad_norm": 3.3276076316833496, + "learning_rate": 6.739162526614621e-05, + "loss": 0.008431027829647064, + "step": 229820 + }, + { + "epoch": 32.62313697657913, + "grad_norm": 0.8454357981681824, + "learning_rate": 6.739020581973031e-05, + "loss": 0.030283817648887636, + "step": 229830 + }, + { + "epoch": 32.624556422995035, + "grad_norm": 0.3295801281929016, + "learning_rate": 6.73887863733144e-05, + "loss": 0.020723450183868408, + "step": 229840 + }, + { + "epoch": 32.62597586941093, + "grad_norm": 5.296194076538086, + "learning_rate": 6.73873669268985e-05, + "loss": 0.02623702585697174, + "step": 229850 + }, + { + "epoch": 32.627395315826824, + "grad_norm": 0.4415184259414673, + "learning_rate": 6.738594748048261e-05, + "loss": 0.015328249335289002, + "step": 229860 + }, + { + "epoch": 32.628814762242726, + "grad_norm": 0.14892180263996124, + "learning_rate": 6.738452803406672e-05, + "loss": 0.0018846526741981505, + "step": 229870 + }, + { + "epoch": 
32.63023420865862, + "grad_norm": 0.036461714655160904, + "learning_rate": 6.738310858765082e-05, + "loss": 0.0068075649440288545, + "step": 229880 + }, + { + "epoch": 32.63165365507452, + "grad_norm": 4.379703044891357, + "learning_rate": 6.738168914123493e-05, + "loss": 0.0033054325729608538, + "step": 229890 + }, + { + "epoch": 32.63307310149042, + "grad_norm": 0.4534512758255005, + "learning_rate": 6.738026969481902e-05, + "loss": 0.0019421163946390153, + "step": 229900 + }, + { + "epoch": 32.63449254790632, + "grad_norm": 1.4178011417388916, + "learning_rate": 6.737885024840313e-05, + "loss": 0.06002342104911804, + "step": 229910 + }, + { + "epoch": 32.635911994322214, + "grad_norm": 0.4269276559352875, + "learning_rate": 6.737743080198723e-05, + "loss": 0.0031254947185516357, + "step": 229920 + }, + { + "epoch": 32.63733144073811, + "grad_norm": 0.27989357709884644, + "learning_rate": 6.737601135557134e-05, + "loss": 0.059497016668319705, + "step": 229930 + }, + { + "epoch": 32.63875088715401, + "grad_norm": 2.9348011016845703, + "learning_rate": 6.737459190915543e-05, + "loss": 0.006866764277219772, + "step": 229940 + }, + { + "epoch": 32.640170333569905, + "grad_norm": 0.19549033045768738, + "learning_rate": 6.737317246273953e-05, + "loss": 0.006307584047317505, + "step": 229950 + }, + { + "epoch": 32.64158977998581, + "grad_norm": 0.05920843780040741, + "learning_rate": 6.737175301632364e-05, + "loss": 0.008201535791158676, + "step": 229960 + }, + { + "epoch": 32.6430092264017, + "grad_norm": 0.035806771367788315, + "learning_rate": 6.737033356990774e-05, + "loss": 0.0040438991039991375, + "step": 229970 + }, + { + "epoch": 32.644428672817604, + "grad_norm": 8.582637786865234, + "learning_rate": 6.736891412349185e-05, + "loss": 0.024805967509746552, + "step": 229980 + }, + { + "epoch": 32.6458481192335, + "grad_norm": 0.11949902772903442, + "learning_rate": 6.736749467707595e-05, + "loss": 0.01019858717918396, + "step": 229990 + }, + { + "epoch": 
32.64726756564939, + "grad_norm": 0.3251058757305145, + "learning_rate": 6.736607523066004e-05, + "loss": 0.025926533341407775, + "step": 230000 + }, + { + "epoch": 32.64726756564939, + "eval_accuracy": 0.986011318115343, + "eval_loss": 0.06189598888158798, + "eval_runtime": 33.9482, + "eval_samples_per_second": 463.264, + "eval_steps_per_second": 14.493, + "step": 230000 + }, + { + "epoch": 32.648687012065295, + "grad_norm": 1.803137183189392, + "learning_rate": 6.736465578424414e-05, + "loss": 0.014004331827163697, + "step": 230010 + }, + { + "epoch": 32.65010645848119, + "grad_norm": 0.18455222249031067, + "learning_rate": 6.736323633782825e-05, + "loss": 0.011704017221927644, + "step": 230020 + }, + { + "epoch": 32.65152590489709, + "grad_norm": 0.8866627216339111, + "learning_rate": 6.736181689141235e-05, + "loss": 0.014083343744277953, + "step": 230030 + }, + { + "epoch": 32.65294535131299, + "grad_norm": 2.7694249153137207, + "learning_rate": 6.736039744499646e-05, + "loss": 0.004213947057723999, + "step": 230040 + }, + { + "epoch": 32.65436479772889, + "grad_norm": 0.10214859247207642, + "learning_rate": 6.735897799858056e-05, + "loss": 0.01895824670791626, + "step": 230050 + }, + { + "epoch": 32.65578424414478, + "grad_norm": 0.82787024974823, + "learning_rate": 6.735755855216466e-05, + "loss": 0.0018074888736009597, + "step": 230060 + }, + { + "epoch": 32.65720369056068, + "grad_norm": 0.19432945549488068, + "learning_rate": 6.735613910574877e-05, + "loss": 0.003745337948203087, + "step": 230070 + }, + { + "epoch": 32.65862313697658, + "grad_norm": 0.017711736261844635, + "learning_rate": 6.735471965933286e-05, + "loss": 0.004111262038350105, + "step": 230080 + }, + { + "epoch": 32.660042583392475, + "grad_norm": 4.314962387084961, + "learning_rate": 6.735330021291698e-05, + "loss": 0.014219552278518677, + "step": 230090 + }, + { + "epoch": 32.661462029808376, + "grad_norm": 5.631881237030029, + "learning_rate": 6.735188076650106e-05, + "loss": 
0.006019681319594383, + "step": 230100 + }, + { + "epoch": 32.66288147622427, + "grad_norm": 0.01014167070388794, + "learning_rate": 6.735046132008517e-05, + "loss": 0.0008535075932741166, + "step": 230110 + }, + { + "epoch": 32.66430092264017, + "grad_norm": 0.080812007188797, + "learning_rate": 6.734904187366927e-05, + "loss": 0.002751779556274414, + "step": 230120 + }, + { + "epoch": 32.66572036905607, + "grad_norm": 2.750189781188965, + "learning_rate": 6.734762242725338e-05, + "loss": 0.003860221058130264, + "step": 230130 + }, + { + "epoch": 32.66713981547196, + "grad_norm": 0.16712497174739838, + "learning_rate": 6.734620298083748e-05, + "loss": 0.0095685675740242, + "step": 230140 + }, + { + "epoch": 32.668559261887864, + "grad_norm": 0.09808217734098434, + "learning_rate": 6.734478353442157e-05, + "loss": 0.0052022445946931836, + "step": 230150 + }, + { + "epoch": 32.66997870830376, + "grad_norm": 0.024165118113160133, + "learning_rate": 6.734336408800568e-05, + "loss": 0.028092315793037413, + "step": 230160 + }, + { + "epoch": 32.67139815471966, + "grad_norm": 1.7018033266067505, + "learning_rate": 6.734194464158978e-05, + "loss": 0.0046866275370121, + "step": 230170 + }, + { + "epoch": 32.672817601135556, + "grad_norm": 12.519664764404297, + "learning_rate": 6.734052519517389e-05, + "loss": 0.013973727822303772, + "step": 230180 + }, + { + "epoch": 32.67423704755146, + "grad_norm": 0.4195597767829895, + "learning_rate": 6.733910574875799e-05, + "loss": 0.011637437343597411, + "step": 230190 + }, + { + "epoch": 32.67565649396735, + "grad_norm": 0.1746259331703186, + "learning_rate": 6.733768630234209e-05, + "loss": 0.012811291217803954, + "step": 230200 + }, + { + "epoch": 32.67707594038325, + "grad_norm": 0.036932483315467834, + "learning_rate": 6.733626685592618e-05, + "loss": 0.028728348016738892, + "step": 230210 + }, + { + "epoch": 32.67849538679915, + "grad_norm": 0.5389435291290283, + "learning_rate": 6.73348474095103e-05, + "loss": 
0.015115834772586823, + "step": 230220 + }, + { + "epoch": 32.679914833215044, + "grad_norm": 3.6993408203125, + "learning_rate": 6.733342796309439e-05, + "loss": 0.018788757920265197, + "step": 230230 + }, + { + "epoch": 32.681334279630946, + "grad_norm": 10.755971908569336, + "learning_rate": 6.733215046132009e-05, + "loss": 0.08036642670631408, + "step": 230240 + }, + { + "epoch": 32.68275372604684, + "grad_norm": 33.43832778930664, + "learning_rate": 6.733073101490419e-05, + "loss": 0.052683568000793456, + "step": 230250 + }, + { + "epoch": 32.68417317246274, + "grad_norm": 0.7054136395454407, + "learning_rate": 6.73293115684883e-05, + "loss": 0.01225418969988823, + "step": 230260 + }, + { + "epoch": 32.68559261887864, + "grad_norm": 0.047475770115852356, + "learning_rate": 6.73278921220724e-05, + "loss": 0.009024527668952943, + "step": 230270 + }, + { + "epoch": 32.68701206529453, + "grad_norm": 3.146012306213379, + "learning_rate": 6.73264726756565e-05, + "loss": 0.01068558096885681, + "step": 230280 + }, + { + "epoch": 32.688431511710434, + "grad_norm": 0.009903008118271828, + "learning_rate": 6.732505322924059e-05, + "loss": 0.008776780962944031, + "step": 230290 + }, + { + "epoch": 32.68985095812633, + "grad_norm": 13.541001319885254, + "learning_rate": 6.73236337828247e-05, + "loss": 0.010999911278486253, + "step": 230300 + }, + { + "epoch": 32.69127040454223, + "grad_norm": 0.032709814608097076, + "learning_rate": 6.73222143364088e-05, + "loss": 0.005047610402107239, + "step": 230310 + }, + { + "epoch": 32.692689850958125, + "grad_norm": 1.428013563156128, + "learning_rate": 6.732079488999291e-05, + "loss": 0.01961361914873123, + "step": 230320 + }, + { + "epoch": 32.69410929737403, + "grad_norm": 0.031771622598171234, + "learning_rate": 6.731937544357701e-05, + "loss": 0.010294514149427414, + "step": 230330 + }, + { + "epoch": 32.69552874378992, + "grad_norm": 3.373060703277588, + "learning_rate": 6.73179559971611e-05, + "loss": 0.018394869565963746, + 
"step": 230340 + }, + { + "epoch": 32.696948190205816, + "grad_norm": 0.05297103151679039, + "learning_rate": 6.731653655074522e-05, + "loss": 0.03796628713607788, + "step": 230350 + }, + { + "epoch": 32.69836763662172, + "grad_norm": 0.866274893283844, + "learning_rate": 6.731511710432931e-05, + "loss": 0.04973970651626587, + "step": 230360 + }, + { + "epoch": 32.69978708303761, + "grad_norm": 0.03765305131673813, + "learning_rate": 6.731369765791343e-05, + "loss": 0.0014414940029382705, + "step": 230370 + }, + { + "epoch": 32.701206529453515, + "grad_norm": 0.027956806123256683, + "learning_rate": 6.731227821149751e-05, + "loss": 0.03513614535331726, + "step": 230380 + }, + { + "epoch": 32.70262597586941, + "grad_norm": 0.1128496527671814, + "learning_rate": 6.731085876508162e-05, + "loss": 0.022524484992027284, + "step": 230390 + }, + { + "epoch": 32.70404542228531, + "grad_norm": 0.016960546374320984, + "learning_rate": 6.730943931866572e-05, + "loss": 0.0012403856962919235, + "step": 230400 + }, + { + "epoch": 32.705464868701206, + "grad_norm": 3.487699031829834, + "learning_rate": 6.730801987224983e-05, + "loss": 0.009856456518173217, + "step": 230410 + }, + { + "epoch": 32.7068843151171, + "grad_norm": 2.0815320014953613, + "learning_rate": 6.730660042583393e-05, + "loss": 0.00281062051653862, + "step": 230420 + }, + { + "epoch": 32.708303761533, + "grad_norm": 0.020726002752780914, + "learning_rate": 6.730518097941802e-05, + "loss": 0.0033649589866399767, + "step": 230430 + }, + { + "epoch": 32.7097232079489, + "grad_norm": 0.5696667432785034, + "learning_rate": 6.730376153300213e-05, + "loss": 0.006717415153980255, + "step": 230440 + }, + { + "epoch": 32.7111426543648, + "grad_norm": 0.008767221122980118, + "learning_rate": 6.730234208658623e-05, + "loss": 0.006326006352901458, + "step": 230450 + }, + { + "epoch": 32.712562100780694, + "grad_norm": 0.015940619632601738, + "learning_rate": 6.730092264017034e-05, + "loss": 0.0009957339614629745, + "step": 
230460 + }, + { + "epoch": 32.713981547196596, + "grad_norm": 0.08114396780729294, + "learning_rate": 6.729950319375444e-05, + "loss": 0.02642592489719391, + "step": 230470 + }, + { + "epoch": 32.71540099361249, + "grad_norm": 0.0063024405390024185, + "learning_rate": 6.729808374733854e-05, + "loss": 0.00914572924375534, + "step": 230480 + }, + { + "epoch": 32.716820440028386, + "grad_norm": 0.048442550003528595, + "learning_rate": 6.729666430092263e-05, + "loss": 0.002997415140271187, + "step": 230490 + }, + { + "epoch": 32.71823988644429, + "grad_norm": 0.14661253988742828, + "learning_rate": 6.729524485450675e-05, + "loss": 0.035250693559646606, + "step": 230500 + }, + { + "epoch": 32.71823988644429, + "eval_accuracy": 0.9888090544922744, + "eval_loss": 0.04891662299633026, + "eval_runtime": 34.0055, + "eval_samples_per_second": 462.484, + "eval_steps_per_second": 14.468, + "step": 230500 + }, + { + "epoch": 32.71965933286018, + "grad_norm": 0.0290578231215477, + "learning_rate": 6.729382540809084e-05, + "loss": 0.035941395163536075, + "step": 230510 + }, + { + "epoch": 32.721078779276084, + "grad_norm": 0.045315977185964584, + "learning_rate": 6.729240596167495e-05, + "loss": 0.03346693813800812, + "step": 230520 + }, + { + "epoch": 32.72249822569198, + "grad_norm": 3.518812417984009, + "learning_rate": 6.729098651525905e-05, + "loss": 0.003750159963965416, + "step": 230530 + }, + { + "epoch": 32.72391767210788, + "grad_norm": 0.1635931134223938, + "learning_rate": 6.728956706884315e-05, + "loss": 0.01892370581626892, + "step": 230540 + }, + { + "epoch": 32.725337118523775, + "grad_norm": 0.8122530579566956, + "learning_rate": 6.728814762242726e-05, + "loss": 0.04671438336372376, + "step": 230550 + }, + { + "epoch": 32.72675656493967, + "grad_norm": 0.007584854029119015, + "learning_rate": 6.728672817601136e-05, + "loss": 0.017057327926158904, + "step": 230560 + }, + { + "epoch": 32.72817601135557, + "grad_norm": 0.10802122950553894, + "learning_rate": 
6.728530872959547e-05, + "loss": 0.02184951901435852, + "step": 230570 + }, + { + "epoch": 32.72959545777147, + "grad_norm": 0.6605439782142639, + "learning_rate": 6.728388928317956e-05, + "loss": 0.0060537301003932955, + "step": 230580 + }, + { + "epoch": 32.73101490418737, + "grad_norm": 0.1624131202697754, + "learning_rate": 6.728246983676366e-05, + "loss": 0.00879824236035347, + "step": 230590 + }, + { + "epoch": 32.73243435060326, + "grad_norm": 0.13078251481056213, + "learning_rate": 6.728105039034776e-05, + "loss": 0.012118557095527649, + "step": 230600 + }, + { + "epoch": 32.733853797019165, + "grad_norm": 0.05230562388896942, + "learning_rate": 6.727963094393187e-05, + "loss": 0.004619729891419411, + "step": 230610 + }, + { + "epoch": 32.73527324343506, + "grad_norm": 0.05851571634411812, + "learning_rate": 6.727821149751598e-05, + "loss": 0.003976175189018249, + "step": 230620 + }, + { + "epoch": 32.736692689850955, + "grad_norm": 1.9686379432678223, + "learning_rate": 6.727679205110008e-05, + "loss": 0.020291274785995482, + "step": 230630 + }, + { + "epoch": 32.73811213626686, + "grad_norm": 0.12447971850633621, + "learning_rate": 6.727537260468418e-05, + "loss": 0.04038263261318207, + "step": 230640 + }, + { + "epoch": 32.73953158268275, + "grad_norm": 0.046545252203941345, + "learning_rate": 6.727395315826827e-05, + "loss": 0.008150433003902436, + "step": 230650 + }, + { + "epoch": 32.74095102909865, + "grad_norm": 0.02471901848912239, + "learning_rate": 6.727253371185238e-05, + "loss": 0.03380143642425537, + "step": 230660 + }, + { + "epoch": 32.74237047551455, + "grad_norm": 0.019994737580418587, + "learning_rate": 6.727111426543648e-05, + "loss": 0.008697059750556946, + "step": 230670 + }, + { + "epoch": 32.74378992193045, + "grad_norm": 0.10496027767658234, + "learning_rate": 6.726969481902059e-05, + "loss": 0.0196558341383934, + "step": 230680 + }, + { + "epoch": 32.745209368346345, + "grad_norm": 0.0281540397554636, + "learning_rate": 
6.726827537260468e-05, + "loss": 0.0009372115135192871, + "step": 230690 + }, + { + "epoch": 32.74662881476224, + "grad_norm": 0.06435875594615936, + "learning_rate": 6.726685592618879e-05, + "loss": 0.021498198807239532, + "step": 230700 + }, + { + "epoch": 32.74804826117814, + "grad_norm": 0.06471769511699677, + "learning_rate": 6.72654364797729e-05, + "loss": 0.03661236763000488, + "step": 230710 + }, + { + "epoch": 32.749467707594036, + "grad_norm": 0.2929028868675232, + "learning_rate": 6.7264017033357e-05, + "loss": 0.0010427266359329223, + "step": 230720 + }, + { + "epoch": 32.75088715400994, + "grad_norm": 0.33679839968681335, + "learning_rate": 6.726259758694111e-05, + "loss": 0.001510174199938774, + "step": 230730 + }, + { + "epoch": 32.75230660042583, + "grad_norm": 0.8773136734962463, + "learning_rate": 6.726117814052519e-05, + "loss": 0.0049071390181779865, + "step": 230740 + }, + { + "epoch": 32.753726046841734, + "grad_norm": 0.007670919876545668, + "learning_rate": 6.72597586941093e-05, + "loss": 0.0020812459290027617, + "step": 230750 + }, + { + "epoch": 32.75514549325763, + "grad_norm": 0.04153800010681152, + "learning_rate": 6.72583392476934e-05, + "loss": 0.0018878430128097534, + "step": 230760 + }, + { + "epoch": 32.756564939673524, + "grad_norm": 0.021790411323308945, + "learning_rate": 6.725691980127751e-05, + "loss": 0.010397710651159287, + "step": 230770 + }, + { + "epoch": 32.757984386089426, + "grad_norm": 0.43994247913360596, + "learning_rate": 6.725550035486161e-05, + "loss": 0.006249840930104255, + "step": 230780 + }, + { + "epoch": 32.75940383250532, + "grad_norm": 11.3201322555542, + "learning_rate": 6.72540809084457e-05, + "loss": 0.025871825218200684, + "step": 230790 + }, + { + "epoch": 32.76082327892122, + "grad_norm": 0.12535302340984344, + "learning_rate": 6.725266146202982e-05, + "loss": 0.007925930619239806, + "step": 230800 + }, + { + "epoch": 32.76224272533712, + "grad_norm": 0.10669919103384018, + "learning_rate": 
6.725124201561391e-05, + "loss": 0.004126046970486641, + "step": 230810 + }, + { + "epoch": 32.76366217175302, + "grad_norm": 3.0622034072875977, + "learning_rate": 6.724982256919802e-05, + "loss": 0.018352055549621583, + "step": 230820 + }, + { + "epoch": 32.765081618168914, + "grad_norm": 0.03727499023079872, + "learning_rate": 6.724840312278212e-05, + "loss": 0.020802204310894013, + "step": 230830 + }, + { + "epoch": 32.76650106458481, + "grad_norm": 0.17027802765369415, + "learning_rate": 6.724698367636622e-05, + "loss": 0.006650953739881516, + "step": 230840 + }, + { + "epoch": 32.76792051100071, + "grad_norm": 5.131646156311035, + "learning_rate": 6.724556422995032e-05, + "loss": 0.0034897059202194216, + "step": 230850 + }, + { + "epoch": 32.769339957416605, + "grad_norm": 3.352581024169922, + "learning_rate": 6.724414478353443e-05, + "loss": 0.02109636515378952, + "step": 230860 + }, + { + "epoch": 32.77075940383251, + "grad_norm": 5.641472816467285, + "learning_rate": 6.724272533711852e-05, + "loss": 0.004619826003909111, + "step": 230870 + }, + { + "epoch": 32.7721788502484, + "grad_norm": 0.6716129183769226, + "learning_rate": 6.724130589070264e-05, + "loss": 0.015075746178627013, + "step": 230880 + }, + { + "epoch": 32.773598296664304, + "grad_norm": 0.21852178871631622, + "learning_rate": 6.723988644428673e-05, + "loss": 0.00915912389755249, + "step": 230890 + }, + { + "epoch": 32.7750177430802, + "grad_norm": 1.2869906425476074, + "learning_rate": 6.723846699787083e-05, + "loss": 0.03137437105178833, + "step": 230900 + }, + { + "epoch": 32.77643718949609, + "grad_norm": 0.015739547088742256, + "learning_rate": 6.723704755145494e-05, + "loss": 0.01766941249370575, + "step": 230910 + }, + { + "epoch": 32.777856635911995, + "grad_norm": 0.09523604065179825, + "learning_rate": 6.723562810503904e-05, + "loss": 0.013410581648349762, + "step": 230920 + }, + { + "epoch": 32.77927608232789, + "grad_norm": 0.10387518256902695, + "learning_rate": 
6.723420865862315e-05, + "loss": 0.009111431241035462, + "step": 230930 + }, + { + "epoch": 32.78069552874379, + "grad_norm": 0.048273105174303055, + "learning_rate": 6.723278921220725e-05, + "loss": 0.04129899144172668, + "step": 230940 + }, + { + "epoch": 32.782114975159686, + "grad_norm": 0.17375950515270233, + "learning_rate": 6.723136976579134e-05, + "loss": 0.00653654858469963, + "step": 230950 + }, + { + "epoch": 32.78353442157559, + "grad_norm": 10.035643577575684, + "learning_rate": 6.722995031937544e-05, + "loss": 0.046347704529762265, + "step": 230960 + }, + { + "epoch": 32.78495386799148, + "grad_norm": 0.09198124706745148, + "learning_rate": 6.722853087295955e-05, + "loss": 0.014584222435951233, + "step": 230970 + }, + { + "epoch": 32.78637331440738, + "grad_norm": 2.574244260787964, + "learning_rate": 6.722711142654365e-05, + "loss": 0.004319317266345024, + "step": 230980 + }, + { + "epoch": 32.78779276082328, + "grad_norm": 13.228898048400879, + "learning_rate": 6.722569198012776e-05, + "loss": 0.03043754994869232, + "step": 230990 + }, + { + "epoch": 32.789212207239174, + "grad_norm": 1.8371703624725342, + "learning_rate": 6.722427253371186e-05, + "loss": 0.002741202712059021, + "step": 231000 + }, + { + "epoch": 32.789212207239174, + "eval_accuracy": 0.9832135817384117, + "eval_loss": 0.07428185641765594, + "eval_runtime": 33.76, + "eval_samples_per_second": 465.847, + "eval_steps_per_second": 14.573, + "step": 231000 + }, + { + "epoch": 32.790631653655076, + "grad_norm": 0.01925431564450264, + "learning_rate": 6.722285308729596e-05, + "loss": 0.018006950616836548, + "step": 231010 + }, + { + "epoch": 32.79205110007097, + "grad_norm": 0.29841598868370056, + "learning_rate": 6.722143364088007e-05, + "loss": 0.002831881493330002, + "step": 231020 + }, + { + "epoch": 32.79347054648687, + "grad_norm": 0.013378333300352097, + "learning_rate": 6.722001419446416e-05, + "loss": 0.019047008454799653, + "step": 231030 + }, + { + "epoch": 32.79488999290277, + 
"grad_norm": 0.8350253105163574, + "learning_rate": 6.721859474804827e-05, + "loss": 0.018263734877109528, + "step": 231040 + }, + { + "epoch": 32.79630943931866, + "grad_norm": 20.30790901184082, + "learning_rate": 6.721717530163236e-05, + "loss": 0.022602403163909913, + "step": 231050 + }, + { + "epoch": 32.797728885734564, + "grad_norm": 0.2207595407962799, + "learning_rate": 6.721575585521647e-05, + "loss": 0.048073971271514894, + "step": 231060 + }, + { + "epoch": 32.79914833215046, + "grad_norm": 3.4685189723968506, + "learning_rate": 6.721433640880057e-05, + "loss": 0.008565272390842437, + "step": 231070 + }, + { + "epoch": 32.80056777856636, + "grad_norm": 0.11025356501340866, + "learning_rate": 6.721291696238468e-05, + "loss": 0.0023024167865514757, + "step": 231080 + }, + { + "epoch": 32.801987224982255, + "grad_norm": 2.341585874557495, + "learning_rate": 6.721149751596878e-05, + "loss": 0.023181115090847016, + "step": 231090 + }, + { + "epoch": 32.80340667139816, + "grad_norm": 7.1681318283081055, + "learning_rate": 6.721007806955287e-05, + "loss": 0.005585624277591706, + "step": 231100 + }, + { + "epoch": 32.80482611781405, + "grad_norm": 1.2752413749694824, + "learning_rate": 6.720865862313698e-05, + "loss": 0.04518181085586548, + "step": 231110 + }, + { + "epoch": 32.80624556422995, + "grad_norm": 0.14472432434558868, + "learning_rate": 6.720723917672108e-05, + "loss": 0.0008715644478797912, + "step": 231120 + }, + { + "epoch": 32.80766501064585, + "grad_norm": 4.769308090209961, + "learning_rate": 6.720581973030519e-05, + "loss": 0.0036443073302507402, + "step": 231130 + }, + { + "epoch": 32.80908445706174, + "grad_norm": 0.030749961733818054, + "learning_rate": 6.720440028388929e-05, + "loss": 0.025627979636192323, + "step": 231140 + }, + { + "epoch": 32.810503903477645, + "grad_norm": 0.23386546969413757, + "learning_rate": 6.720298083747339e-05, + "loss": 0.019003939628601075, + "step": 231150 + }, + { + "epoch": 32.81192334989354, + "grad_norm": 
0.6540308594703674, + "learning_rate": 6.720156139105748e-05, + "loss": 0.006749765574932098, + "step": 231160 + }, + { + "epoch": 32.81334279630944, + "grad_norm": 0.031811948865652084, + "learning_rate": 6.72001419446416e-05, + "loss": 0.012926410138607024, + "step": 231170 + }, + { + "epoch": 32.81476224272534, + "grad_norm": 0.034818392246961594, + "learning_rate": 6.719872249822569e-05, + "loss": 0.01658569872379303, + "step": 231180 + }, + { + "epoch": 32.81618168914123, + "grad_norm": 1.4626773595809937, + "learning_rate": 6.71973030518098e-05, + "loss": 0.029288163781166075, + "step": 231190 + }, + { + "epoch": 32.81760113555713, + "grad_norm": 1.5658215284347534, + "learning_rate": 6.71958836053939e-05, + "loss": 0.0240283265709877, + "step": 231200 + }, + { + "epoch": 32.81902058197303, + "grad_norm": 0.05823824554681778, + "learning_rate": 6.7194464158978e-05, + "loss": 0.024326929450035097, + "step": 231210 + }, + { + "epoch": 32.82044002838893, + "grad_norm": 0.15625575184822083, + "learning_rate": 6.719304471256211e-05, + "loss": 0.0019854120910167694, + "step": 231220 + }, + { + "epoch": 32.821859474804825, + "grad_norm": 0.738330602645874, + "learning_rate": 6.71916252661462e-05, + "loss": 0.013944508135318756, + "step": 231230 + }, + { + "epoch": 32.82327892122073, + "grad_norm": 3.7863121032714844, + "learning_rate": 6.719020581973032e-05, + "loss": 0.002555343881249428, + "step": 231240 + }, + { + "epoch": 32.82469836763662, + "grad_norm": 0.011311098001897335, + "learning_rate": 6.71887863733144e-05, + "loss": 0.011439505219459533, + "step": 231250 + }, + { + "epoch": 32.826117814052516, + "grad_norm": 0.11863119900226593, + "learning_rate": 6.718736692689851e-05, + "loss": 0.015598432719707489, + "step": 231260 + }, + { + "epoch": 32.82753726046842, + "grad_norm": 0.1548033356666565, + "learning_rate": 6.718594748048261e-05, + "loss": 0.007593175768852234, + "step": 231270 + }, + { + "epoch": 32.82895670688431, + "grad_norm": 
0.26622864603996277, + "learning_rate": 6.718452803406672e-05, + "loss": 0.003058861941099167, + "step": 231280 + }, + { + "epoch": 32.830376153300215, + "grad_norm": 0.007623805198818445, + "learning_rate": 6.718310858765082e-05, + "loss": 0.023908104002475738, + "step": 231290 + }, + { + "epoch": 32.83179559971611, + "grad_norm": 0.039282459765672684, + "learning_rate": 6.718168914123493e-05, + "loss": 0.007355505973100663, + "step": 231300 + }, + { + "epoch": 32.83321504613201, + "grad_norm": 0.19591206312179565, + "learning_rate": 6.718026969481903e-05, + "loss": 0.0014194190502166749, + "step": 231310 + }, + { + "epoch": 32.834634492547906, + "grad_norm": 12.430092811584473, + "learning_rate": 6.717885024840312e-05, + "loss": 0.009993849694728852, + "step": 231320 + }, + { + "epoch": 32.8360539389638, + "grad_norm": 0.029802091419696808, + "learning_rate": 6.717743080198723e-05, + "loss": 0.0052051559090614315, + "step": 231330 + }, + { + "epoch": 32.8374733853797, + "grad_norm": 0.055598825216293335, + "learning_rate": 6.717601135557133e-05, + "loss": 0.008159220218658447, + "step": 231340 + }, + { + "epoch": 32.8388928317956, + "grad_norm": 0.02679130993783474, + "learning_rate": 6.717459190915544e-05, + "loss": 0.0018076658248901366, + "step": 231350 + }, + { + "epoch": 32.8403122782115, + "grad_norm": 0.02063179574906826, + "learning_rate": 6.717317246273953e-05, + "loss": 0.01325775533914566, + "step": 231360 + }, + { + "epoch": 32.841731724627394, + "grad_norm": 0.25618353486061096, + "learning_rate": 6.717175301632364e-05, + "loss": 0.022197124361991883, + "step": 231370 + }, + { + "epoch": 32.843151171043296, + "grad_norm": 3.908083200454712, + "learning_rate": 6.717033356990773e-05, + "loss": 0.013657313585281373, + "step": 231380 + }, + { + "epoch": 32.84457061745919, + "grad_norm": 0.033198848366737366, + "learning_rate": 6.716891412349185e-05, + "loss": 0.005740191414952278, + "step": 231390 + }, + { + "epoch": 32.845990063875085, + "grad_norm": 
3.2797787189483643, + "learning_rate": 6.716749467707594e-05, + "loss": 0.025468868017196656, + "step": 231400 + }, + { + "epoch": 32.84740951029099, + "grad_norm": 3.714743137359619, + "learning_rate": 6.716607523066004e-05, + "loss": 0.008779262751340866, + "step": 231410 + }, + { + "epoch": 32.84882895670688, + "grad_norm": 0.4043198823928833, + "learning_rate": 6.716465578424415e-05, + "loss": 0.0033072389662265776, + "step": 231420 + }, + { + "epoch": 32.850248403122784, + "grad_norm": 0.3662734031677246, + "learning_rate": 6.716323633782825e-05, + "loss": 0.00147789865732193, + "step": 231430 + }, + { + "epoch": 32.85166784953868, + "grad_norm": 4.429003715515137, + "learning_rate": 6.716181689141236e-05, + "loss": 0.0054496128112077715, + "step": 231440 + }, + { + "epoch": 32.85308729595458, + "grad_norm": 0.30776646733283997, + "learning_rate": 6.716039744499646e-05, + "loss": 0.08286625146865845, + "step": 231450 + }, + { + "epoch": 32.854506742370475, + "grad_norm": 0.021724529564380646, + "learning_rate": 6.715897799858055e-05, + "loss": 0.008498325943946838, + "step": 231460 + }, + { + "epoch": 32.85592618878637, + "grad_norm": 0.003040471114218235, + "learning_rate": 6.715755855216465e-05, + "loss": 0.018158800899982452, + "step": 231470 + }, + { + "epoch": 32.85734563520227, + "grad_norm": 0.004658316727727652, + "learning_rate": 6.715613910574876e-05, + "loss": 0.013198831677436828, + "step": 231480 + }, + { + "epoch": 32.85876508161817, + "grad_norm": 0.023386310786008835, + "learning_rate": 6.715471965933286e-05, + "loss": 0.0016544081270694732, + "step": 231490 + }, + { + "epoch": 32.86018452803407, + "grad_norm": 0.1902914047241211, + "learning_rate": 6.715330021291697e-05, + "loss": 0.0033270377665758134, + "step": 231500 + }, + { + "epoch": 32.86018452803407, + "eval_accuracy": 0.9893177338335347, + "eval_loss": 0.03940606489777565, + "eval_runtime": 34.3261, + "eval_samples_per_second": 458.165, + "eval_steps_per_second": 14.333, + "step": 
231500 + }, + { + "epoch": 32.86160397444996, + "grad_norm": 3.041313648223877, + "learning_rate": 6.715188076650107e-05, + "loss": 0.0011204395443201065, + "step": 231510 + }, + { + "epoch": 32.863023420865865, + "grad_norm": 1.801601767539978, + "learning_rate": 6.715046132008517e-05, + "loss": 0.0025904454290866853, + "step": 231520 + }, + { + "epoch": 32.86444286728176, + "grad_norm": 2.6573565006256104, + "learning_rate": 6.714904187366928e-05, + "loss": 0.006695915013551712, + "step": 231530 + }, + { + "epoch": 32.865862313697654, + "grad_norm": 7.459601402282715, + "learning_rate": 6.714762242725337e-05, + "loss": 0.05104820728302002, + "step": 231540 + }, + { + "epoch": 32.867281760113556, + "grad_norm": 0.04584014415740967, + "learning_rate": 6.714620298083748e-05, + "loss": 0.0054659172892570496, + "step": 231550 + }, + { + "epoch": 32.86870120652945, + "grad_norm": 0.005146549083292484, + "learning_rate": 6.714478353442157e-05, + "loss": 0.005311392247676849, + "step": 231560 + }, + { + "epoch": 32.87012065294535, + "grad_norm": 17.119455337524414, + "learning_rate": 6.714336408800568e-05, + "loss": 0.04992564916610718, + "step": 231570 + }, + { + "epoch": 32.87154009936125, + "grad_norm": 0.0393180213868618, + "learning_rate": 6.714194464158978e-05, + "loss": 0.0021539822220802305, + "step": 231580 + }, + { + "epoch": 32.87295954577715, + "grad_norm": 0.2024555802345276, + "learning_rate": 6.714052519517389e-05, + "loss": 0.007630480825901032, + "step": 231590 + }, + { + "epoch": 32.874378992193044, + "grad_norm": 0.03942874073982239, + "learning_rate": 6.713910574875799e-05, + "loss": 0.009272660315036773, + "step": 231600 + }, + { + "epoch": 32.87579843860894, + "grad_norm": 8.144450187683105, + "learning_rate": 6.713768630234208e-05, + "loss": 0.0055247493088245395, + "step": 231610 + }, + { + "epoch": 32.87721788502484, + "grad_norm": 0.38919520378112793, + "learning_rate": 6.71362668559262e-05, + "loss": 0.0011010777205228805, + "step": 231620 + }, 
+ { + "epoch": 32.878637331440736, + "grad_norm": 1.017945408821106, + "learning_rate": 6.713484740951029e-05, + "loss": 0.00895301327109337, + "step": 231630 + }, + { + "epoch": 32.88005677785664, + "grad_norm": 0.1145785003900528, + "learning_rate": 6.71334279630944e-05, + "loss": 0.007628552615642548, + "step": 231640 + }, + { + "epoch": 32.88147622427253, + "grad_norm": 0.18990199267864227, + "learning_rate": 6.71320085166785e-05, + "loss": 0.0012573011219501496, + "step": 231650 + }, + { + "epoch": 32.882895670688434, + "grad_norm": 0.08772388845682144, + "learning_rate": 6.713058907026261e-05, + "loss": 0.0347080647945404, + "step": 231660 + }, + { + "epoch": 32.88431511710433, + "grad_norm": 0.21623870730400085, + "learning_rate": 6.71291696238467e-05, + "loss": 0.026904284954071045, + "step": 231670 + }, + { + "epoch": 32.885734563520224, + "grad_norm": 2.4581193923950195, + "learning_rate": 6.71277501774308e-05, + "loss": 0.011369779706001282, + "step": 231680 + }, + { + "epoch": 32.887154009936125, + "grad_norm": 0.008075900375843048, + "learning_rate": 6.71263307310149e-05, + "loss": 0.004570067301392556, + "step": 231690 + }, + { + "epoch": 32.88857345635202, + "grad_norm": 13.731317520141602, + "learning_rate": 6.712491128459901e-05, + "loss": 0.008460581302642822, + "step": 231700 + }, + { + "epoch": 32.88999290276792, + "grad_norm": 0.06395643949508667, + "learning_rate": 6.712349183818311e-05, + "loss": 0.0025458872318267823, + "step": 231710 + }, + { + "epoch": 32.89141234918382, + "grad_norm": 3.1496829986572266, + "learning_rate": 6.712207239176721e-05, + "loss": 0.027531376481056212, + "step": 231720 + }, + { + "epoch": 32.89283179559972, + "grad_norm": 0.24812306463718414, + "learning_rate": 6.712065294535132e-05, + "loss": 0.013879990577697754, + "step": 231730 + }, + { + "epoch": 32.89425124201561, + "grad_norm": 14.349849700927734, + "learning_rate": 6.711923349893542e-05, + "loss": 0.03282317221164703, + "step": 231740 + }, + { + "epoch": 
32.89567068843151, + "grad_norm": 0.056372374296188354, + "learning_rate": 6.711781405251953e-05, + "loss": 0.02389896661043167, + "step": 231750 + }, + { + "epoch": 32.89709013484741, + "grad_norm": 0.06032063439488411, + "learning_rate": 6.711639460610362e-05, + "loss": 0.0039440695196390155, + "step": 231760 + }, + { + "epoch": 32.898509581263305, + "grad_norm": 0.07783947885036469, + "learning_rate": 6.711497515968772e-05, + "loss": 0.009103375673294067, + "step": 231770 + }, + { + "epoch": 32.89992902767921, + "grad_norm": 3.9782207012176514, + "learning_rate": 6.711355571327182e-05, + "loss": 0.040722066164016725, + "step": 231780 + }, + { + "epoch": 32.9013484740951, + "grad_norm": 0.012061492539942265, + "learning_rate": 6.711213626685593e-05, + "loss": 0.06374676823616028, + "step": 231790 + }, + { + "epoch": 32.902767920511, + "grad_norm": 11.535358428955078, + "learning_rate": 6.711071682044003e-05, + "loss": 0.03359541893005371, + "step": 231800 + }, + { + "epoch": 32.9041873669269, + "grad_norm": 0.4673287868499756, + "learning_rate": 6.710929737402414e-05, + "loss": 0.027708268165588378, + "step": 231810 + }, + { + "epoch": 32.9056068133428, + "grad_norm": 0.4884614944458008, + "learning_rate": 6.710787792760824e-05, + "loss": 0.004563270136713982, + "step": 231820 + }, + { + "epoch": 32.907026259758695, + "grad_norm": 0.036059167236089706, + "learning_rate": 6.710645848119233e-05, + "loss": 0.03675717711448669, + "step": 231830 + }, + { + "epoch": 32.90844570617459, + "grad_norm": 0.19582921266555786, + "learning_rate": 6.710503903477644e-05, + "loss": 0.020689474046230318, + "step": 231840 + }, + { + "epoch": 32.90986515259049, + "grad_norm": 12.544333457946777, + "learning_rate": 6.710361958836054e-05, + "loss": 0.019328832626342773, + "step": 231850 + }, + { + "epoch": 32.911284599006386, + "grad_norm": 16.0892333984375, + "learning_rate": 6.710220014194465e-05, + "loss": 0.03440391719341278, + "step": 231860 + }, + { + "epoch": 32.91270404542229, 
+ "grad_norm": 0.1699782907962799, + "learning_rate": 6.710078069552874e-05, + "loss": 0.020639361441135408, + "step": 231870 + }, + { + "epoch": 32.91412349183818, + "grad_norm": 12.383332252502441, + "learning_rate": 6.709936124911285e-05, + "loss": 0.009761115908622742, + "step": 231880 + }, + { + "epoch": 32.91554293825408, + "grad_norm": 0.017739875242114067, + "learning_rate": 6.709794180269694e-05, + "loss": 0.017158189415931703, + "step": 231890 + }, + { + "epoch": 32.91696238466998, + "grad_norm": 0.8807216882705688, + "learning_rate": 6.709652235628106e-05, + "loss": 0.05336596965789795, + "step": 231900 + }, + { + "epoch": 32.918381831085874, + "grad_norm": 0.2529066801071167, + "learning_rate": 6.709510290986515e-05, + "loss": 0.0033892594277858735, + "step": 231910 + }, + { + "epoch": 32.919801277501776, + "grad_norm": 0.035110339522361755, + "learning_rate": 6.709368346344925e-05, + "loss": 0.002490738406777382, + "step": 231920 + }, + { + "epoch": 32.92122072391767, + "grad_norm": 0.04664776846766472, + "learning_rate": 6.709226401703336e-05, + "loss": 0.008895879983901978, + "step": 231930 + }, + { + "epoch": 32.92264017033357, + "grad_norm": 0.17450641095638275, + "learning_rate": 6.709084457061746e-05, + "loss": 0.017239540815353394, + "step": 231940 + }, + { + "epoch": 32.92405961674947, + "grad_norm": 11.441862106323242, + "learning_rate": 6.708942512420157e-05, + "loss": 0.005307575315237045, + "step": 231950 + }, + { + "epoch": 32.92547906316537, + "grad_norm": 0.3068740963935852, + "learning_rate": 6.708800567778567e-05, + "loss": 0.015283815562725067, + "step": 231960 + }, + { + "epoch": 32.926898509581264, + "grad_norm": 2.3802616596221924, + "learning_rate": 6.708658623136976e-05, + "loss": 0.018784326314926148, + "step": 231970 + }, + { + "epoch": 32.92831795599716, + "grad_norm": 0.22128991782665253, + "learning_rate": 6.708516678495386e-05, + "loss": 0.006721298396587372, + "step": 231980 + }, + { + "epoch": 32.92973740241306, + 
"grad_norm": 1.227864384651184, + "learning_rate": 6.708374733853797e-05, + "loss": 0.008744403719902039, + "step": 231990 + }, + { + "epoch": 32.931156848828955, + "grad_norm": 0.14756132662296295, + "learning_rate": 6.708232789212207e-05, + "loss": 0.011222265660762787, + "step": 232000 + }, + { + "epoch": 32.931156848828955, + "eval_accuracy": 0.9879824505627265, + "eval_loss": 0.04411115124821663, + "eval_runtime": 34.4579, + "eval_samples_per_second": 456.412, + "eval_steps_per_second": 14.278, + "step": 232000 + }, + { + "epoch": 32.93257629524486, + "grad_norm": 0.27643921971321106, + "learning_rate": 6.708090844570618e-05, + "loss": 0.012924249470233917, + "step": 232010 + }, + { + "epoch": 32.93399574166075, + "grad_norm": 0.02039431221783161, + "learning_rate": 6.707948899929029e-05, + "loss": 0.006604589521884918, + "step": 232020 + }, + { + "epoch": 32.93541518807665, + "grad_norm": 0.06880743056535721, + "learning_rate": 6.707806955287438e-05, + "loss": 0.0027513235807418825, + "step": 232030 + }, + { + "epoch": 32.93683463449255, + "grad_norm": 5.663849830627441, + "learning_rate": 6.707665010645849e-05, + "loss": 0.007062958180904388, + "step": 232040 + }, + { + "epoch": 32.93825408090844, + "grad_norm": 0.11515184491872787, + "learning_rate": 6.707523066004258e-05, + "loss": 0.03868401050567627, + "step": 232050 + }, + { + "epoch": 32.939673527324345, + "grad_norm": 5.109851360321045, + "learning_rate": 6.70738112136267e-05, + "loss": 0.007406014204025269, + "step": 232060 + }, + { + "epoch": 32.94109297374024, + "grad_norm": 1.5324277877807617, + "learning_rate": 6.707239176721079e-05, + "loss": 0.01873302608728409, + "step": 232070 + }, + { + "epoch": 32.94251242015614, + "grad_norm": 2.1102709770202637, + "learning_rate": 6.707097232079489e-05, + "loss": 0.03645229935646057, + "step": 232080 + }, + { + "epoch": 32.943931866572036, + "grad_norm": 0.3256604075431824, + "learning_rate": 6.706955287437899e-05, + "loss": 0.01611800491809845, + "step": 
232090 + }, + { + "epoch": 32.94535131298794, + "grad_norm": 0.031053220853209496, + "learning_rate": 6.70681334279631e-05, + "loss": 0.002537635713815689, + "step": 232100 + }, + { + "epoch": 32.94677075940383, + "grad_norm": 2.1605403423309326, + "learning_rate": 6.706671398154721e-05, + "loss": 0.021388383209705354, + "step": 232110 + }, + { + "epoch": 32.94819020581973, + "grad_norm": 0.8946403861045837, + "learning_rate": 6.70652945351313e-05, + "loss": 0.010015501081943512, + "step": 232120 + }, + { + "epoch": 32.94960965223563, + "grad_norm": 23.95484733581543, + "learning_rate": 6.70638750887154e-05, + "loss": 0.015324440598487855, + "step": 232130 + }, + { + "epoch": 32.951029098651524, + "grad_norm": 0.11976815015077591, + "learning_rate": 6.70624556422995e-05, + "loss": 0.006980134546756745, + "step": 232140 + }, + { + "epoch": 32.952448545067426, + "grad_norm": 8.950846672058105, + "learning_rate": 6.706103619588361e-05, + "loss": 0.009636526554822921, + "step": 232150 + }, + { + "epoch": 32.95386799148332, + "grad_norm": 0.33655521273612976, + "learning_rate": 6.705961674946771e-05, + "loss": 0.020722810924053193, + "step": 232160 + }, + { + "epoch": 32.955287437899216, + "grad_norm": 0.01412772387266159, + "learning_rate": 6.705819730305182e-05, + "loss": 0.007164295017719269, + "step": 232170 + }, + { + "epoch": 32.95670688431512, + "grad_norm": 0.004767750855535269, + "learning_rate": 6.70567778566359e-05, + "loss": 0.056897598505020144, + "step": 232180 + }, + { + "epoch": 32.95812633073101, + "grad_norm": 0.06058765575289726, + "learning_rate": 6.705535841022001e-05, + "loss": 0.004940172657370567, + "step": 232190 + }, + { + "epoch": 32.959545777146914, + "grad_norm": 12.3880033493042, + "learning_rate": 6.705393896380413e-05, + "loss": 0.029117286205291748, + "step": 232200 + }, + { + "epoch": 32.96096522356281, + "grad_norm": 0.18718144297599792, + "learning_rate": 6.705251951738822e-05, + "loss": 0.010015229880809783, + "step": 232210 + }, + { 
+ "epoch": 32.96238466997871, + "grad_norm": 2.3876774311065674, + "learning_rate": 6.705110007097233e-05, + "loss": 0.004492121934890747, + "step": 232220 + }, + { + "epoch": 32.963804116394606, + "grad_norm": 0.005972818471491337, + "learning_rate": 6.704968062455642e-05, + "loss": 0.0004711143672466278, + "step": 232230 + }, + { + "epoch": 32.96522356281051, + "grad_norm": 7.932565212249756, + "learning_rate": 6.704826117814053e-05, + "loss": 0.003530152514576912, + "step": 232240 + }, + { + "epoch": 32.9666430092264, + "grad_norm": 5.62136697769165, + "learning_rate": 6.704684173172463e-05, + "loss": 0.005086075142025947, + "step": 232250 + }, + { + "epoch": 32.9680624556423, + "grad_norm": 3.2676095962524414, + "learning_rate": 6.704542228530874e-05, + "loss": 0.011179011315107346, + "step": 232260 + }, + { + "epoch": 32.9694819020582, + "grad_norm": 0.09036093205213547, + "learning_rate": 6.704400283889283e-05, + "loss": 0.0026679933071136475, + "step": 232270 + }, + { + "epoch": 32.970901348474094, + "grad_norm": 0.10060188174247742, + "learning_rate": 6.704258339247693e-05, + "loss": 0.008093390613794327, + "step": 232280 + }, + { + "epoch": 32.972320794889995, + "grad_norm": 3.444136381149292, + "learning_rate": 6.704116394606104e-05, + "loss": 0.021916474401950835, + "step": 232290 + }, + { + "epoch": 32.97374024130589, + "grad_norm": 0.07037210464477539, + "learning_rate": 6.703974449964514e-05, + "loss": 0.013254526257514953, + "step": 232300 + }, + { + "epoch": 32.975159687721785, + "grad_norm": 0.32932987809181213, + "learning_rate": 6.703832505322925e-05, + "loss": 0.015632474422454835, + "step": 232310 + }, + { + "epoch": 32.97657913413769, + "grad_norm": 0.010571050457656384, + "learning_rate": 6.703690560681335e-05, + "loss": 0.016012327373027803, + "step": 232320 + }, + { + "epoch": 32.97799858055358, + "grad_norm": 16.485734939575195, + "learning_rate": 6.703548616039745e-05, + "loss": 0.03403065204620361, + "step": 232330 + }, + { + "epoch": 
32.97941802696948, + "grad_norm": 0.37770694494247437, + "learning_rate": 6.703406671398154e-05, + "loss": 0.006315302103757858, + "step": 232340 + }, + { + "epoch": 32.98083747338538, + "grad_norm": 0.009163899347186089, + "learning_rate": 6.703264726756565e-05, + "loss": 0.03321839272975922, + "step": 232350 + }, + { + "epoch": 32.98225691980128, + "grad_norm": 0.011571010574698448, + "learning_rate": 6.703122782114975e-05, + "loss": 0.0059160348027944565, + "step": 232360 + }, + { + "epoch": 32.983676366217175, + "grad_norm": 0.07013394683599472, + "learning_rate": 6.702980837473386e-05, + "loss": 0.014336442947387696, + "step": 232370 + }, + { + "epoch": 32.98509581263308, + "grad_norm": 0.01363424863666296, + "learning_rate": 6.702853087295955e-05, + "loss": 0.031113147735595703, + "step": 232380 + }, + { + "epoch": 32.98651525904897, + "grad_norm": 5.027544975280762, + "learning_rate": 6.702711142654366e-05, + "loss": 0.014988642930984498, + "step": 232390 + }, + { + "epoch": 32.987934705464866, + "grad_norm": 12.52743911743164, + "learning_rate": 6.702569198012776e-05, + "loss": 0.028587546944618226, + "step": 232400 + }, + { + "epoch": 32.98935415188077, + "grad_norm": 0.035969510674476624, + "learning_rate": 6.702427253371185e-05, + "loss": 0.010097838938236237, + "step": 232410 + }, + { + "epoch": 32.99077359829666, + "grad_norm": 4.378223419189453, + "learning_rate": 6.702285308729595e-05, + "loss": 0.0029059067368507386, + "step": 232420 + }, + { + "epoch": 32.992193044712565, + "grad_norm": 9.89118480682373, + "learning_rate": 6.702143364088006e-05, + "loss": 0.005419235676527023, + "step": 232430 + }, + { + "epoch": 32.99361249112846, + "grad_norm": 3.0281972885131836, + "learning_rate": 6.702001419446416e-05, + "loss": 0.01452578604221344, + "step": 232440 + }, + { + "epoch": 32.995031937544354, + "grad_norm": 1.240073800086975, + "learning_rate": 6.701859474804827e-05, + "loss": 0.011771759390830994, + "step": 232450 + }, + { + "epoch": 
32.996451383960256, + "grad_norm": 0.021093245595693588, + "learning_rate": 6.701717530163237e-05, + "loss": 0.017976436018943786, + "step": 232460 + }, + { + "epoch": 32.99787083037615, + "grad_norm": 0.008404689840972424, + "learning_rate": 6.701575585521646e-05, + "loss": 0.001192772388458252, + "step": 232470 + }, + { + "epoch": 32.99929027679205, + "grad_norm": 13.54447078704834, + "learning_rate": 6.701433640880058e-05, + "loss": 0.01685427725315094, + "step": 232480 + }, + { + "epoch": 33.00070972320795, + "grad_norm": 0.36610502004623413, + "learning_rate": 6.701291696238467e-05, + "loss": 0.0031044039875268935, + "step": 232490 + }, + { + "epoch": 33.00212916962385, + "grad_norm": 0.027562536299228668, + "learning_rate": 6.701149751596878e-05, + "loss": 0.0007439866662025451, + "step": 232500 + }, + { + "epoch": 33.00212916962385, + "eval_accuracy": 0.9909709416926306, + "eval_loss": 0.03813938423991203, + "eval_runtime": 33.765, + "eval_samples_per_second": 465.778, + "eval_steps_per_second": 14.571, + "step": 232500 + }, + { + "epoch": 33.003548616039744, + "grad_norm": 0.6496100425720215, + "learning_rate": 6.701007806955287e-05, + "loss": 0.036458027362823484, + "step": 232510 + }, + { + "epoch": 33.004968062455646, + "grad_norm": 0.014963158406317234, + "learning_rate": 6.700865862313698e-05, + "loss": 0.011810368299484253, + "step": 232520 + }, + { + "epoch": 33.00638750887154, + "grad_norm": 0.07337503880262375, + "learning_rate": 6.700723917672108e-05, + "loss": 0.002793193981051445, + "step": 232530 + }, + { + "epoch": 33.007806955287435, + "grad_norm": 0.014566749334335327, + "learning_rate": 6.700581973030519e-05, + "loss": 0.0034230899065732954, + "step": 232540 + }, + { + "epoch": 33.00922640170334, + "grad_norm": 3.476078510284424, + "learning_rate": 6.700440028388928e-05, + "loss": 0.014581476151943207, + "step": 232550 + }, + { + "epoch": 33.01064584811923, + "grad_norm": 0.17664480209350586, + "learning_rate": 6.700298083747338e-05, + 
"loss": 0.0024559590965509415, + "step": 232560 + }, + { + "epoch": 33.012065294535134, + "grad_norm": 0.022657381370663643, + "learning_rate": 6.700156139105749e-05, + "loss": 0.005892970785498619, + "step": 232570 + }, + { + "epoch": 33.01348474095103, + "grad_norm": 0.1816049963235855, + "learning_rate": 6.700014194464159e-05, + "loss": 0.047564005851745604, + "step": 232580 + }, + { + "epoch": 33.01490418736693, + "grad_norm": 0.2688983082771301, + "learning_rate": 6.69987224982257e-05, + "loss": 0.0035350944846868513, + "step": 232590 + }, + { + "epoch": 33.016323633782825, + "grad_norm": 5.948859214782715, + "learning_rate": 6.69973030518098e-05, + "loss": 0.009034616500139236, + "step": 232600 + }, + { + "epoch": 33.01774308019872, + "grad_norm": 0.09592500329017639, + "learning_rate": 6.69958836053939e-05, + "loss": 0.010737352073192596, + "step": 232610 + }, + { + "epoch": 33.01916252661462, + "grad_norm": 0.11492357403039932, + "learning_rate": 6.699446415897799e-05, + "loss": 0.05328705906867981, + "step": 232620 + }, + { + "epoch": 33.02058197303052, + "grad_norm": 0.10146881639957428, + "learning_rate": 6.69930447125621e-05, + "loss": 0.005699126049876213, + "step": 232630 + }, + { + "epoch": 33.02200141944642, + "grad_norm": 0.0042434027418494225, + "learning_rate": 6.69916252661462e-05, + "loss": 0.0018366482108831406, + "step": 232640 + }, + { + "epoch": 33.02342086586231, + "grad_norm": 0.01908477395772934, + "learning_rate": 6.699020581973031e-05, + "loss": 0.013897585868835449, + "step": 232650 + }, + { + "epoch": 33.024840312278215, + "grad_norm": 0.04993263632059097, + "learning_rate": 6.698878637331441e-05, + "loss": 0.005027785524725914, + "step": 232660 + }, + { + "epoch": 33.02625975869411, + "grad_norm": 0.023937663063406944, + "learning_rate": 6.698736692689851e-05, + "loss": 0.0019275680184364318, + "step": 232670 + }, + { + "epoch": 33.027679205110005, + "grad_norm": 0.1582077592611313, + "learning_rate": 6.698594748048262e-05, + 
"loss": 0.003883858770132065, + "step": 232680 + }, + { + "epoch": 33.029098651525906, + "grad_norm": 0.004634209908545017, + "learning_rate": 6.698452803406672e-05, + "loss": 0.008256744593381882, + "step": 232690 + }, + { + "epoch": 33.0305180979418, + "grad_norm": 0.02267731912434101, + "learning_rate": 6.698310858765083e-05, + "loss": 0.015025709569454194, + "step": 232700 + }, + { + "epoch": 33.0319375443577, + "grad_norm": 3.224761724472046, + "learning_rate": 6.698168914123492e-05, + "loss": 0.009513667225837708, + "step": 232710 + }, + { + "epoch": 33.0333569907736, + "grad_norm": 0.007782425731420517, + "learning_rate": 6.698026969481902e-05, + "loss": 0.010710550844669342, + "step": 232720 + }, + { + "epoch": 33.0347764371895, + "grad_norm": 0.06294766813516617, + "learning_rate": 6.697885024840312e-05, + "loss": 0.008526723086833953, + "step": 232730 + }, + { + "epoch": 33.036195883605394, + "grad_norm": 0.20215864479541779, + "learning_rate": 6.697743080198723e-05, + "loss": 0.002528071030974388, + "step": 232740 + }, + { + "epoch": 33.03761533002129, + "grad_norm": 0.004148895852267742, + "learning_rate": 6.697601135557133e-05, + "loss": 0.005145163089036941, + "step": 232750 + }, + { + "epoch": 33.03903477643719, + "grad_norm": 0.01698174886405468, + "learning_rate": 6.697459190915544e-05, + "loss": 0.014062893390655518, + "step": 232760 + }, + { + "epoch": 33.040454222853086, + "grad_norm": 0.012025895528495312, + "learning_rate": 6.697317246273953e-05, + "loss": 0.005314980819821358, + "step": 232770 + }, + { + "epoch": 33.04187366926899, + "grad_norm": 0.801257848739624, + "learning_rate": 6.697175301632363e-05, + "loss": 0.005766784027218819, + "step": 232780 + }, + { + "epoch": 33.04329311568488, + "grad_norm": 6.4183759689331055, + "learning_rate": 6.697033356990774e-05, + "loss": 0.028923073410987855, + "step": 232790 + }, + { + "epoch": 33.044712562100784, + "grad_norm": 0.31630003452301025, + "learning_rate": 6.696891412349184e-05, + "loss": 
0.0024191752076148988, + "step": 232800 + }, + { + "epoch": 33.04613200851668, + "grad_norm": 0.10296811908483505, + "learning_rate": 6.696749467707595e-05, + "loss": 0.014398099482059478, + "step": 232810 + }, + { + "epoch": 33.047551454932574, + "grad_norm": 0.012554096058011055, + "learning_rate": 6.696607523066004e-05, + "loss": 0.027711886167526244, + "step": 232820 + }, + { + "epoch": 33.048970901348476, + "grad_norm": 18.79640769958496, + "learning_rate": 6.696465578424415e-05, + "loss": 0.02449519783258438, + "step": 232830 + }, + { + "epoch": 33.05039034776437, + "grad_norm": 1.0111384391784668, + "learning_rate": 6.696323633782824e-05, + "loss": 0.0016225460916757584, + "step": 232840 + }, + { + "epoch": 33.05180979418027, + "grad_norm": 8.156940460205078, + "learning_rate": 6.696181689141235e-05, + "loss": 0.014043767750263215, + "step": 232850 + }, + { + "epoch": 33.05322924059617, + "grad_norm": 0.048435505479574203, + "learning_rate": 6.696039744499647e-05, + "loss": 0.02610374093055725, + "step": 232860 + }, + { + "epoch": 33.05464868701207, + "grad_norm": 2.3942348957061768, + "learning_rate": 6.695897799858055e-05, + "loss": 0.029642096161842345, + "step": 232870 + }, + { + "epoch": 33.056068133427964, + "grad_norm": 0.08680371940135956, + "learning_rate": 6.695755855216466e-05, + "loss": 0.004501764476299286, + "step": 232880 + }, + { + "epoch": 33.05748757984386, + "grad_norm": 0.06177636235952377, + "learning_rate": 6.695613910574876e-05, + "loss": 0.014505928754806519, + "step": 232890 + }, + { + "epoch": 33.05890702625976, + "grad_norm": 0.08930642902851105, + "learning_rate": 6.695471965933287e-05, + "loss": 0.0080267034471035, + "step": 232900 + }, + { + "epoch": 33.060326472675655, + "grad_norm": 1.669140338897705, + "learning_rate": 6.695330021291697e-05, + "loss": 0.03357402682304382, + "step": 232910 + }, + { + "epoch": 33.06174591909156, + "grad_norm": 0.06884250789880753, + "learning_rate": 6.695188076650106e-05, + "loss": 
0.036909133195877075, + "step": 232920 + }, + { + "epoch": 33.06316536550745, + "grad_norm": 0.33558300137519836, + "learning_rate": 6.695046132008516e-05, + "loss": 0.012088485062122345, + "step": 232930 + }, + { + "epoch": 33.06458481192335, + "grad_norm": 0.2721827030181885, + "learning_rate": 6.694904187366927e-05, + "loss": 0.03767330348491669, + "step": 232940 + }, + { + "epoch": 33.06600425833925, + "grad_norm": 3.2938904762268066, + "learning_rate": 6.694762242725338e-05, + "loss": 0.006995049118995666, + "step": 232950 + }, + { + "epoch": 33.06742370475514, + "grad_norm": 15.198688507080078, + "learning_rate": 6.694620298083748e-05, + "loss": 0.019055862724781037, + "step": 232960 + }, + { + "epoch": 33.068843151171045, + "grad_norm": 0.10339218378067017, + "learning_rate": 6.694478353442158e-05, + "loss": 0.014005209505558013, + "step": 232970 + }, + { + "epoch": 33.07026259758694, + "grad_norm": 0.14587794244289398, + "learning_rate": 6.694336408800567e-05, + "loss": 0.010410657525062561, + "step": 232980 + }, + { + "epoch": 33.07168204400284, + "grad_norm": 0.008789008483290672, + "learning_rate": 6.694194464158979e-05, + "loss": 0.015039154887199402, + "step": 232990 + }, + { + "epoch": 33.073101490418736, + "grad_norm": 0.09059564769268036, + "learning_rate": 6.694052519517388e-05, + "loss": 0.003294055536389351, + "step": 233000 + }, + { + "epoch": 33.073101490418736, + "eval_accuracy": 0.9884911299039868, + "eval_loss": 0.04424584284424782, + "eval_runtime": 33.6783, + "eval_samples_per_second": 466.978, + "eval_steps_per_second": 14.609, + "step": 233000 + }, + { + "epoch": 33.07452093683464, + "grad_norm": 0.008037124760448933, + "learning_rate": 6.6939105748758e-05, + "loss": 0.0005681019276380539, + "step": 233010 + }, + { + "epoch": 33.07594038325053, + "grad_norm": 0.041609026491642, + "learning_rate": 6.693768630234209e-05, + "loss": 0.01185714676976204, + "step": 233020 + }, + { + "epoch": 33.07735982966643, + "grad_norm": 
0.03921293839812279, + "learning_rate": 6.693626685592619e-05, + "loss": 0.006963609158992768, + "step": 233030 + }, + { + "epoch": 33.07877927608233, + "grad_norm": 0.06588096171617508, + "learning_rate": 6.69348474095103e-05, + "loss": 0.006099595502018928, + "step": 233040 + }, + { + "epoch": 33.080198722498224, + "grad_norm": 0.33578309416770935, + "learning_rate": 6.69334279630944e-05, + "loss": 0.00404360294342041, + "step": 233050 + }, + { + "epoch": 33.081618168914126, + "grad_norm": 0.012730725109577179, + "learning_rate": 6.693200851667851e-05, + "loss": 0.002884392440319061, + "step": 233060 + }, + { + "epoch": 33.08303761533002, + "grad_norm": 0.1294029802083969, + "learning_rate": 6.69305890702626e-05, + "loss": 0.021287019550800323, + "step": 233070 + }, + { + "epoch": 33.08445706174592, + "grad_norm": 1.1463992595672607, + "learning_rate": 6.69291696238467e-05, + "loss": 0.010997149348258971, + "step": 233080 + }, + { + "epoch": 33.08587650816182, + "grad_norm": 0.03240277245640755, + "learning_rate": 6.69277501774308e-05, + "loss": 0.0004938151687383652, + "step": 233090 + }, + { + "epoch": 33.08729595457771, + "grad_norm": 11.319725036621094, + "learning_rate": 6.692633073101491e-05, + "loss": 0.020416563749313353, + "step": 233100 + }, + { + "epoch": 33.088715400993614, + "grad_norm": 0.15019521117210388, + "learning_rate": 6.692491128459901e-05, + "loss": 0.018285247683525085, + "step": 233110 + }, + { + "epoch": 33.09013484740951, + "grad_norm": 0.03332260996103287, + "learning_rate": 6.692349183818312e-05, + "loss": 0.021844398975372315, + "step": 233120 + }, + { + "epoch": 33.09155429382541, + "grad_norm": 13.027524948120117, + "learning_rate": 6.692207239176722e-05, + "loss": 0.01533067673444748, + "step": 233130 + }, + { + "epoch": 33.092973740241305, + "grad_norm": 0.016743667423725128, + "learning_rate": 6.692065294535131e-05, + "loss": 0.020993357896804808, + "step": 233140 + }, + { + "epoch": 33.09439318665721, + "grad_norm": 
0.009188542142510414, + "learning_rate": 6.691923349893542e-05, + "loss": 0.011552165448665618, + "step": 233150 + }, + { + "epoch": 33.0958126330731, + "grad_norm": 0.03161291033029556, + "learning_rate": 6.691781405251952e-05, + "loss": 0.001324000582098961, + "step": 233160 + }, + { + "epoch": 33.097232079489, + "grad_norm": 1.7931348085403442, + "learning_rate": 6.691639460610363e-05, + "loss": 0.009678155183792114, + "step": 233170 + }, + { + "epoch": 33.0986515259049, + "grad_norm": 11.032681465148926, + "learning_rate": 6.691497515968772e-05, + "loss": 0.01582941710948944, + "step": 233180 + }, + { + "epoch": 33.10007097232079, + "grad_norm": 2.957695960998535, + "learning_rate": 6.691355571327183e-05, + "loss": 0.02472063899040222, + "step": 233190 + }, + { + "epoch": 33.101490418736695, + "grad_norm": 0.014576147310435772, + "learning_rate": 6.691213626685593e-05, + "loss": 0.004825562238693237, + "step": 233200 + }, + { + "epoch": 33.10290986515259, + "grad_norm": 0.08827441185712814, + "learning_rate": 6.691071682044004e-05, + "loss": 0.017442087829113006, + "step": 233210 + }, + { + "epoch": 33.10432931156849, + "grad_norm": 0.14966894686222076, + "learning_rate": 6.690929737402413e-05, + "loss": 0.02420864999294281, + "step": 233220 + }, + { + "epoch": 33.10574875798439, + "grad_norm": 18.940500259399414, + "learning_rate": 6.690787792760823e-05, + "loss": 0.06303948163986206, + "step": 233230 + }, + { + "epoch": 33.10716820440028, + "grad_norm": 0.05172613635659218, + "learning_rate": 6.690645848119234e-05, + "loss": 0.017350469529628754, + "step": 233240 + }, + { + "epoch": 33.10858765081618, + "grad_norm": 0.7404413819313049, + "learning_rate": 6.690503903477644e-05, + "loss": 0.003830821439623833, + "step": 233250 + }, + { + "epoch": 33.11000709723208, + "grad_norm": 6.709632873535156, + "learning_rate": 6.690361958836055e-05, + "loss": 0.024368323385715485, + "step": 233260 + }, + { + "epoch": 33.11142654364798, + "grad_norm": 8.958331108093262, + 
"learning_rate": 6.690220014194465e-05, + "loss": 0.02602173089981079, + "step": 233270 + }, + { + "epoch": 33.112845990063875, + "grad_norm": 0.012217581272125244, + "learning_rate": 6.690078069552874e-05, + "loss": 0.0009043179452419281, + "step": 233280 + }, + { + "epoch": 33.114265436479776, + "grad_norm": 0.035470329225063324, + "learning_rate": 6.689936124911284e-05, + "loss": 0.004645437747240066, + "step": 233290 + }, + { + "epoch": 33.11568488289567, + "grad_norm": 3.1749613285064697, + "learning_rate": 6.689794180269695e-05, + "loss": 0.008248934149742126, + "step": 233300 + }, + { + "epoch": 33.117104329311566, + "grad_norm": 5.524974822998047, + "learning_rate": 6.689652235628105e-05, + "loss": 0.00760193020105362, + "step": 233310 + }, + { + "epoch": 33.11852377572747, + "grad_norm": 0.6162898540496826, + "learning_rate": 6.689510290986516e-05, + "loss": 0.01911797821521759, + "step": 233320 + }, + { + "epoch": 33.11994322214336, + "grad_norm": 0.0829072818160057, + "learning_rate": 6.689368346344926e-05, + "loss": 0.025186258554458617, + "step": 233330 + }, + { + "epoch": 33.121362668559264, + "grad_norm": 26.86383819580078, + "learning_rate": 6.689226401703336e-05, + "loss": 0.025918185710906982, + "step": 233340 + }, + { + "epoch": 33.12278211497516, + "grad_norm": 0.04101794213056564, + "learning_rate": 6.689084457061747e-05, + "loss": 0.03154298663139343, + "step": 233350 + }, + { + "epoch": 33.12420156139106, + "grad_norm": 0.08597413450479507, + "learning_rate": 6.688942512420156e-05, + "loss": 0.001614866778254509, + "step": 233360 + }, + { + "epoch": 33.125621007806956, + "grad_norm": 2.6257076263427734, + "learning_rate": 6.688814762242725e-05, + "loss": 0.043822288513183594, + "step": 233370 + }, + { + "epoch": 33.12704045422285, + "grad_norm": 0.16938471794128418, + "learning_rate": 6.688672817601136e-05, + "loss": 0.007029750943183899, + "step": 233380 + }, + { + "epoch": 33.12845990063875, + "grad_norm": 0.5439189076423645, + 
"learning_rate": 6.688530872959546e-05, + "loss": 0.03674255907535553, + "step": 233390 + }, + { + "epoch": 33.12987934705465, + "grad_norm": 0.011006061919033527, + "learning_rate": 6.688388928317957e-05, + "loss": 0.034467703104019164, + "step": 233400 + }, + { + "epoch": 33.13129879347055, + "grad_norm": 12.013583183288574, + "learning_rate": 6.688246983676367e-05, + "loss": 0.06046155095100403, + "step": 233410 + }, + { + "epoch": 33.132718239886444, + "grad_norm": 1.6158959865570068, + "learning_rate": 6.688105039034776e-05, + "loss": 0.018071942031383514, + "step": 233420 + }, + { + "epoch": 33.134137686302346, + "grad_norm": 2.619174003601074, + "learning_rate": 6.687963094393187e-05, + "loss": 0.011493847519159318, + "step": 233430 + }, + { + "epoch": 33.13555713271824, + "grad_norm": 7.562828540802002, + "learning_rate": 6.687821149751597e-05, + "loss": 0.008856996893882751, + "step": 233440 + }, + { + "epoch": 33.136976579134135, + "grad_norm": 0.607689380645752, + "learning_rate": 6.687679205110008e-05, + "loss": 0.003968733549118042, + "step": 233450 + }, + { + "epoch": 33.13839602555004, + "grad_norm": 0.041713688522577286, + "learning_rate": 6.687537260468417e-05, + "loss": 0.01523936092853546, + "step": 233460 + }, + { + "epoch": 33.13981547196593, + "grad_norm": 0.07218188047409058, + "learning_rate": 6.687395315826828e-05, + "loss": 0.02639442980289459, + "step": 233470 + }, + { + "epoch": 33.14123491838183, + "grad_norm": 0.012482138350605965, + "learning_rate": 6.687253371185237e-05, + "loss": 0.00729006826877594, + "step": 233480 + }, + { + "epoch": 33.14265436479773, + "grad_norm": 4.322803020477295, + "learning_rate": 6.687111426543649e-05, + "loss": 0.007799869775772095, + "step": 233490 + }, + { + "epoch": 33.14407381121363, + "grad_norm": 0.9652037024497986, + "learning_rate": 6.686969481902058e-05, + "loss": 0.0010501332581043243, + "step": 233500 + }, + { + "epoch": 33.14407381121363, + "eval_accuracy": 0.988046035480384, + "eval_loss": 
0.0511661060154438, + "eval_runtime": 33.4137, + "eval_samples_per_second": 470.675, + "eval_steps_per_second": 14.724, + "step": 233500 + }, + { + "epoch": 33.145493257629525, + "grad_norm": 0.11278227716684341, + "learning_rate": 6.686827537260468e-05, + "loss": 0.010985912382602691, + "step": 233510 + }, + { + "epoch": 33.14691270404542, + "grad_norm": 1.0396829843521118, + "learning_rate": 6.686685592618879e-05, + "loss": 0.025447699427604675, + "step": 233520 + }, + { + "epoch": 33.14833215046132, + "grad_norm": 0.024442046880722046, + "learning_rate": 6.686543647977289e-05, + "loss": 0.05104362368583679, + "step": 233530 + }, + { + "epoch": 33.149751596877216, + "grad_norm": 17.73453140258789, + "learning_rate": 6.6864017033357e-05, + "loss": 0.024703022837638856, + "step": 233540 + }, + { + "epoch": 33.15117104329312, + "grad_norm": 6.855377674102783, + "learning_rate": 6.68625975869411e-05, + "loss": 0.06453539133071899, + "step": 233550 + }, + { + "epoch": 33.15259048970901, + "grad_norm": 0.020894097164273262, + "learning_rate": 6.68611781405252e-05, + "loss": 0.04764627814292908, + "step": 233560 + }, + { + "epoch": 33.154009936124915, + "grad_norm": 0.02138455957174301, + "learning_rate": 6.685975869410929e-05, + "loss": 0.03184522092342377, + "step": 233570 + }, + { + "epoch": 33.15542938254081, + "grad_norm": 0.03723510727286339, + "learning_rate": 6.68583392476934e-05, + "loss": 0.005528325214982033, + "step": 233580 + }, + { + "epoch": 33.156848828956704, + "grad_norm": 1.0821117162704468, + "learning_rate": 6.68569198012775e-05, + "loss": 0.0021553833037614824, + "step": 233590 + }, + { + "epoch": 33.158268275372606, + "grad_norm": 0.025012128055095673, + "learning_rate": 6.685550035486161e-05, + "loss": 0.003945128247141838, + "step": 233600 + }, + { + "epoch": 33.1596877217885, + "grad_norm": 0.08692723512649536, + "learning_rate": 6.685408090844571e-05, + "loss": 0.0015285030007362367, + "step": 233610 + }, + { + "epoch": 33.1611071682044, + 
"grad_norm": 0.40331217646598816, + "learning_rate": 6.68526614620298e-05, + "loss": 0.004440701007843018, + "step": 233620 + }, + { + "epoch": 33.1625266146203, + "grad_norm": 10.579977989196777, + "learning_rate": 6.685124201561392e-05, + "loss": 0.003727759420871735, + "step": 233630 + }, + { + "epoch": 33.1639460610362, + "grad_norm": 0.038057733327150345, + "learning_rate": 6.684982256919801e-05, + "loss": 0.023101110756397248, + "step": 233640 + }, + { + "epoch": 33.165365507452094, + "grad_norm": 0.3885143995285034, + "learning_rate": 6.684840312278213e-05, + "loss": 0.006836455315351486, + "step": 233650 + }, + { + "epoch": 33.16678495386799, + "grad_norm": 0.2088741809129715, + "learning_rate": 6.684698367636621e-05, + "loss": 0.0029083773493766784, + "step": 233660 + }, + { + "epoch": 33.16820440028389, + "grad_norm": 0.03473229706287384, + "learning_rate": 6.684556422995032e-05, + "loss": 0.01826951503753662, + "step": 233670 + }, + { + "epoch": 33.169623846699785, + "grad_norm": 9.644742965698242, + "learning_rate": 6.684414478353442e-05, + "loss": 0.03616454601287842, + "step": 233680 + }, + { + "epoch": 33.17104329311569, + "grad_norm": 0.03992383927106857, + "learning_rate": 6.684272533711853e-05, + "loss": 0.0005787279456853867, + "step": 233690 + }, + { + "epoch": 33.17246273953158, + "grad_norm": 0.013497031293809414, + "learning_rate": 6.684130589070264e-05, + "loss": 0.010306386649608612, + "step": 233700 + }, + { + "epoch": 33.173882185947484, + "grad_norm": 0.02980031631886959, + "learning_rate": 6.683988644428674e-05, + "loss": 0.002165204659104347, + "step": 233710 + }, + { + "epoch": 33.17530163236338, + "grad_norm": 0.0604572668671608, + "learning_rate": 6.683846699787083e-05, + "loss": 0.007351689040660858, + "step": 233720 + }, + { + "epoch": 33.17672107877927, + "grad_norm": 0.03832433745265007, + "learning_rate": 6.683704755145493e-05, + "loss": 0.005150881037116051, + "step": 233730 + }, + { + "epoch": 33.178140525195175, + 
"grad_norm": 0.0468791164457798, + "learning_rate": 6.683562810503904e-05, + "loss": 0.005359580367803573, + "step": 233740 + }, + { + "epoch": 33.17955997161107, + "grad_norm": 0.015428449958562851, + "learning_rate": 6.683420865862314e-05, + "loss": 0.00745561346411705, + "step": 233750 + }, + { + "epoch": 33.18097941802697, + "grad_norm": 0.3669448494911194, + "learning_rate": 6.683278921220725e-05, + "loss": 0.0029941268265247347, + "step": 233760 + }, + { + "epoch": 33.18239886444287, + "grad_norm": 0.23417966067790985, + "learning_rate": 6.683136976579133e-05, + "loss": 0.014048345386981964, + "step": 233770 + }, + { + "epoch": 33.18381831085877, + "grad_norm": 0.4174904227256775, + "learning_rate": 6.682995031937545e-05, + "loss": 0.004466451704502106, + "step": 233780 + }, + { + "epoch": 33.18523775727466, + "grad_norm": 22.998750686645508, + "learning_rate": 6.682853087295956e-05, + "loss": 0.04613946676254273, + "step": 233790 + }, + { + "epoch": 33.18665720369056, + "grad_norm": 4.055075645446777, + "learning_rate": 6.682711142654365e-05, + "loss": 0.020310547947883607, + "step": 233800 + }, + { + "epoch": 33.18807665010646, + "grad_norm": 0.009953886270523071, + "learning_rate": 6.682569198012776e-05, + "loss": 0.013074669241905212, + "step": 233810 + }, + { + "epoch": 33.189496096522355, + "grad_norm": 0.04905260354280472, + "learning_rate": 6.682427253371185e-05, + "loss": 0.005987417325377465, + "step": 233820 + }, + { + "epoch": 33.19091554293826, + "grad_norm": 0.004849367309361696, + "learning_rate": 6.682285308729596e-05, + "loss": 0.0008703749626874924, + "step": 233830 + }, + { + "epoch": 33.19233498935415, + "grad_norm": 2.3980891704559326, + "learning_rate": 6.682143364088006e-05, + "loss": 0.007647818326950074, + "step": 233840 + }, + { + "epoch": 33.19375443577005, + "grad_norm": 0.051546886563301086, + "learning_rate": 6.682001419446417e-05, + "loss": 0.05345403552055359, + "step": 233850 + }, + { + "epoch": 33.19517388218595, + 
"grad_norm": 9.360692024230957, + "learning_rate": 6.681859474804826e-05, + "loss": 0.02700009346008301, + "step": 233860 + }, + { + "epoch": 33.19659332860184, + "grad_norm": 0.029519930481910706, + "learning_rate": 6.681717530163236e-05, + "loss": 0.002945105358958244, + "step": 233870 + }, + { + "epoch": 33.198012775017745, + "grad_norm": 0.04676840454339981, + "learning_rate": 6.681575585521647e-05, + "loss": 0.007978560030460357, + "step": 233880 + }, + { + "epoch": 33.19943222143364, + "grad_norm": 9.016703605651855, + "learning_rate": 6.681433640880057e-05, + "loss": 0.05397295951843262, + "step": 233890 + }, + { + "epoch": 33.20085166784954, + "grad_norm": 0.04345522075891495, + "learning_rate": 6.681291696238468e-05, + "loss": 0.001767541468143463, + "step": 233900 + }, + { + "epoch": 33.202271114265436, + "grad_norm": 0.1434599757194519, + "learning_rate": 6.681149751596878e-05, + "loss": 0.015801993012428284, + "step": 233910 + }, + { + "epoch": 33.20369056068134, + "grad_norm": 0.15655562281608582, + "learning_rate": 6.681007806955288e-05, + "loss": 0.029455965757369994, + "step": 233920 + }, + { + "epoch": 33.20511000709723, + "grad_norm": 3.6737663745880127, + "learning_rate": 6.680865862313697e-05, + "loss": 0.0032779380679130556, + "step": 233930 + }, + { + "epoch": 33.20652945351313, + "grad_norm": 5.022531986236572, + "learning_rate": 6.680723917672108e-05, + "loss": 0.005018160492181778, + "step": 233940 + }, + { + "epoch": 33.20794889992903, + "grad_norm": 0.05431528016924858, + "learning_rate": 6.680581973030518e-05, + "loss": 0.037546944618225095, + "step": 233950 + }, + { + "epoch": 33.209368346344924, + "grad_norm": 3.3834187984466553, + "learning_rate": 6.680440028388929e-05, + "loss": 0.00802500694990158, + "step": 233960 + }, + { + "epoch": 33.210787792760826, + "grad_norm": 0.12480318546295166, + "learning_rate": 6.680298083747339e-05, + "loss": 0.00783214271068573, + "step": 233970 + }, + { + "epoch": 33.21220723917672, + "grad_norm": 
6.7063775062561035, + "learning_rate": 6.680156139105749e-05, + "loss": 0.03108794093132019, + "step": 233980 + }, + { + "epoch": 33.21362668559262, + "grad_norm": 0.051691945642232895, + "learning_rate": 6.68001419446416e-05, + "loss": 0.004519025608897209, + "step": 233990 + }, + { + "epoch": 33.21504613200852, + "grad_norm": 0.08351609855890274, + "learning_rate": 6.67987224982257e-05, + "loss": 0.009213033318519592, + "step": 234000 + }, + { + "epoch": 33.21504613200852, + "eval_accuracy": 0.9885547148216443, + "eval_loss": 0.048911549150943756, + "eval_runtime": 33.494, + "eval_samples_per_second": 469.546, + "eval_steps_per_second": 14.689, + "step": 234000 + }, + { + "epoch": 33.21646557842441, + "grad_norm": 12.847097396850586, + "learning_rate": 6.679730305180981e-05, + "loss": 0.041031491756439206, + "step": 234010 + }, + { + "epoch": 33.217885024840314, + "grad_norm": 0.05760093405842781, + "learning_rate": 6.679588360539389e-05, + "loss": 0.018675903975963592, + "step": 234020 + }, + { + "epoch": 33.21930447125621, + "grad_norm": 1.339172124862671, + "learning_rate": 6.6794464158978e-05, + "loss": 0.0027301646769046783, + "step": 234030 + }, + { + "epoch": 33.22072391767211, + "grad_norm": 0.22511158883571625, + "learning_rate": 6.67930447125621e-05, + "loss": 0.004426179453730583, + "step": 234040 + }, + { + "epoch": 33.222143364088005, + "grad_norm": 0.052577629685401917, + "learning_rate": 6.679162526614621e-05, + "loss": 0.0009278155863285065, + "step": 234050 + }, + { + "epoch": 33.22356281050391, + "grad_norm": 0.01491778064519167, + "learning_rate": 6.679020581973031e-05, + "loss": 0.0027528423815965652, + "step": 234060 + }, + { + "epoch": 33.2249822569198, + "grad_norm": 3.006554365158081, + "learning_rate": 6.678878637331442e-05, + "loss": 0.003032988682389259, + "step": 234070 + }, + { + "epoch": 33.226401703335696, + "grad_norm": 0.39298203587532043, + "learning_rate": 6.678736692689852e-05, + "loss": 0.023786969482898712, + "step": 234080 + 
}, + { + "epoch": 33.2278211497516, + "grad_norm": 0.011240835301578045, + "learning_rate": 6.678594748048261e-05, + "loss": 0.006125143170356751, + "step": 234090 + }, + { + "epoch": 33.22924059616749, + "grad_norm": 0.06871847063302994, + "learning_rate": 6.678452803406672e-05, + "loss": 0.005964962393045425, + "step": 234100 + }, + { + "epoch": 33.230660042583395, + "grad_norm": 0.03394101560115814, + "learning_rate": 6.678310858765082e-05, + "loss": 0.0071018412709236145, + "step": 234110 + }, + { + "epoch": 33.23207948899929, + "grad_norm": 0.2580115497112274, + "learning_rate": 6.678168914123493e-05, + "loss": 0.0012913040816783905, + "step": 234120 + }, + { + "epoch": 33.23349893541519, + "grad_norm": 0.3244493901729584, + "learning_rate": 6.678026969481902e-05, + "loss": 0.009963646531105042, + "step": 234130 + }, + { + "epoch": 33.234918381831086, + "grad_norm": 0.01086276862770319, + "learning_rate": 6.677885024840313e-05, + "loss": 0.0037088338285684586, + "step": 234140 + }, + { + "epoch": 33.23633782824698, + "grad_norm": 0.7692731022834778, + "learning_rate": 6.677743080198722e-05, + "loss": 0.006198544055223465, + "step": 234150 + }, + { + "epoch": 33.23775727466288, + "grad_norm": 0.22529275715351105, + "learning_rate": 6.677601135557134e-05, + "loss": 0.0010684389621019364, + "step": 234160 + }, + { + "epoch": 33.23917672107878, + "grad_norm": 0.028435086831450462, + "learning_rate": 6.677459190915543e-05, + "loss": 0.002316346764564514, + "step": 234170 + }, + { + "epoch": 33.24059616749468, + "grad_norm": 3.410317897796631, + "learning_rate": 6.677317246273953e-05, + "loss": 0.005131001770496369, + "step": 234180 + }, + { + "epoch": 33.242015613910574, + "grad_norm": 9.984674453735352, + "learning_rate": 6.677175301632364e-05, + "loss": 0.017441952228546144, + "step": 234190 + }, + { + "epoch": 33.243435060326476, + "grad_norm": 0.054334547370672226, + "learning_rate": 6.677033356990774e-05, + "loss": 0.011415466666221619, + "step": 234200 + }, + 
{ + "epoch": 33.24485450674237, + "grad_norm": 0.2871350049972534, + "learning_rate": 6.676891412349185e-05, + "loss": 0.032577088475227355, + "step": 234210 + }, + { + "epoch": 33.246273953158266, + "grad_norm": 0.015596027486026287, + "learning_rate": 6.676749467707595e-05, + "loss": 0.006145032495260239, + "step": 234220 + }, + { + "epoch": 33.24769339957417, + "grad_norm": 0.011379465460777283, + "learning_rate": 6.676607523066004e-05, + "loss": 0.002318020910024643, + "step": 234230 + }, + { + "epoch": 33.24911284599006, + "grad_norm": 10.3837890625, + "learning_rate": 6.676465578424414e-05, + "loss": 0.02048136293888092, + "step": 234240 + }, + { + "epoch": 33.250532292405964, + "grad_norm": 0.18506494164466858, + "learning_rate": 6.676323633782825e-05, + "loss": 0.0013022135943174362, + "step": 234250 + }, + { + "epoch": 33.25195173882186, + "grad_norm": 1.0906410217285156, + "learning_rate": 6.676181689141235e-05, + "loss": 0.0033903051167726516, + "step": 234260 + }, + { + "epoch": 33.25337118523776, + "grad_norm": 9.639078140258789, + "learning_rate": 6.676039744499646e-05, + "loss": 0.011625054478645324, + "step": 234270 + }, + { + "epoch": 33.254790631653655, + "grad_norm": 0.19092154502868652, + "learning_rate": 6.675897799858056e-05, + "loss": 0.012114310264587402, + "step": 234280 + }, + { + "epoch": 33.25621007806955, + "grad_norm": 0.005104308947920799, + "learning_rate": 6.675755855216466e-05, + "loss": 0.004198534786701203, + "step": 234290 + }, + { + "epoch": 33.25762952448545, + "grad_norm": 0.02708045393228531, + "learning_rate": 6.675613910574877e-05, + "loss": 0.024370075762271882, + "step": 234300 + }, + { + "epoch": 33.25904897090135, + "grad_norm": 0.23471218347549438, + "learning_rate": 6.675471965933286e-05, + "loss": 0.0009724624454975128, + "step": 234310 + }, + { + "epoch": 33.26046841731725, + "grad_norm": 0.5575664639472961, + "learning_rate": 6.675330021291697e-05, + "loss": 0.001916193589568138, + "step": 234320 + }, + { + 
"epoch": 33.26188786373314, + "grad_norm": 0.004176279995590448, + "learning_rate": 6.675188076650106e-05, + "loss": 0.0007734742015600204, + "step": 234330 + }, + { + "epoch": 33.263307310149045, + "grad_norm": 2.920609712600708, + "learning_rate": 6.675046132008517e-05, + "loss": 0.008864916115999221, + "step": 234340 + }, + { + "epoch": 33.26472675656494, + "grad_norm": 5.787831783294678, + "learning_rate": 6.674904187366927e-05, + "loss": 0.028482720255851746, + "step": 234350 + }, + { + "epoch": 33.266146202980835, + "grad_norm": 0.06801189482212067, + "learning_rate": 6.674762242725338e-05, + "loss": 0.0021121140569448473, + "step": 234360 + }, + { + "epoch": 33.26756564939674, + "grad_norm": 3.312286853790283, + "learning_rate": 6.674620298083747e-05, + "loss": 0.007769698649644852, + "step": 234370 + }, + { + "epoch": 33.26898509581263, + "grad_norm": 10.785375595092773, + "learning_rate": 6.674478353442157e-05, + "loss": 0.008757569640874863, + "step": 234380 + }, + { + "epoch": 33.27040454222853, + "grad_norm": 1.9413939714431763, + "learning_rate": 6.674336408800568e-05, + "loss": 0.026570558547973633, + "step": 234390 + }, + { + "epoch": 33.27182398864443, + "grad_norm": 0.05666542425751686, + "learning_rate": 6.674194464158978e-05, + "loss": 0.004696536436676979, + "step": 234400 + }, + { + "epoch": 33.27324343506033, + "grad_norm": 10.54587459564209, + "learning_rate": 6.674052519517389e-05, + "loss": 0.02693682014942169, + "step": 234410 + }, + { + "epoch": 33.274662881476225, + "grad_norm": 0.1945808380842209, + "learning_rate": 6.673910574875799e-05, + "loss": 0.006191116198897362, + "step": 234420 + }, + { + "epoch": 33.27608232789212, + "grad_norm": 0.04081287980079651, + "learning_rate": 6.67376863023421e-05, + "loss": 0.021575236320495607, + "step": 234430 + }, + { + "epoch": 33.27750177430802, + "grad_norm": 0.8260237574577332, + "learning_rate": 6.673626685592618e-05, + "loss": 0.014854776859283447, + "step": 234440 + }, + { + "epoch": 
33.278921220723916, + "grad_norm": 1.5600595474243164, + "learning_rate": 6.67348474095103e-05, + "loss": 0.0251138836145401, + "step": 234450 + }, + { + "epoch": 33.28034066713982, + "grad_norm": 0.14217649400234222, + "learning_rate": 6.673342796309439e-05, + "loss": 0.02960202991962433, + "step": 234460 + }, + { + "epoch": 33.28176011355571, + "grad_norm": 0.016632072627544403, + "learning_rate": 6.67320085166785e-05, + "loss": 0.0009741503745317459, + "step": 234470 + }, + { + "epoch": 33.283179559971614, + "grad_norm": 0.08310196548700333, + "learning_rate": 6.67305890702626e-05, + "loss": 0.0028669312596321106, + "step": 234480 + }, + { + "epoch": 33.28459900638751, + "grad_norm": 0.8343585133552551, + "learning_rate": 6.67291696238467e-05, + "loss": 0.004730924218893051, + "step": 234490 + }, + { + "epoch": 33.286018452803404, + "grad_norm": 8.698060035705566, + "learning_rate": 6.672775017743081e-05, + "loss": 0.015089142322540283, + "step": 234500 + }, + { + "epoch": 33.286018452803404, + "eval_accuracy": 0.9874737712214663, + "eval_loss": 0.05357368662953377, + "eval_runtime": 34.072, + "eval_samples_per_second": 461.582, + "eval_steps_per_second": 14.44, + "step": 234500 + }, + { + "epoch": 33.287437899219306, + "grad_norm": 10.680930137634277, + "learning_rate": 6.67263307310149e-05, + "loss": 0.04065114855766296, + "step": 234510 + }, + { + "epoch": 33.2888573456352, + "grad_norm": 0.1185414269566536, + "learning_rate": 6.672491128459902e-05, + "loss": 0.013488034904003143, + "step": 234520 + }, + { + "epoch": 33.2902767920511, + "grad_norm": 0.06110299378633499, + "learning_rate": 6.672349183818311e-05, + "loss": 0.010936358571052551, + "step": 234530 + }, + { + "epoch": 33.291696238467, + "grad_norm": 9.963912963867188, + "learning_rate": 6.672207239176721e-05, + "loss": 0.009226620942354203, + "step": 234540 + }, + { + "epoch": 33.2931156848829, + "grad_norm": 0.04855835437774658, + "learning_rate": 6.672065294535131e-05, + "loss": 
0.02042834460735321, + "step": 234550 + }, + { + "epoch": 33.294535131298794, + "grad_norm": 0.0368192084133625, + "learning_rate": 6.671923349893542e-05, + "loss": 0.022215184569358826, + "step": 234560 + }, + { + "epoch": 33.29595457771469, + "grad_norm": 0.07107844948768616, + "learning_rate": 6.671781405251952e-05, + "loss": 0.028411248326301576, + "step": 234570 + }, + { + "epoch": 33.29737402413059, + "grad_norm": 0.7129390239715576, + "learning_rate": 6.671639460610363e-05, + "loss": 0.029192206263542176, + "step": 234580 + }, + { + "epoch": 33.298793470546485, + "grad_norm": 2.24050235748291, + "learning_rate": 6.671497515968773e-05, + "loss": 0.012563689053058625, + "step": 234590 + }, + { + "epoch": 33.30021291696239, + "grad_norm": 0.26438552141189575, + "learning_rate": 6.671355571327182e-05, + "loss": 0.006303469836711884, + "step": 234600 + }, + { + "epoch": 33.30163236337828, + "grad_norm": 0.1668052077293396, + "learning_rate": 6.671213626685593e-05, + "loss": 0.017075373232364653, + "step": 234610 + }, + { + "epoch": 33.303051809794184, + "grad_norm": 0.12075202912092209, + "learning_rate": 6.671071682044003e-05, + "loss": 0.0013048507273197175, + "step": 234620 + }, + { + "epoch": 33.30447125621008, + "grad_norm": 0.45734259486198425, + "learning_rate": 6.670929737402414e-05, + "loss": 0.007796604186296463, + "step": 234630 + }, + { + "epoch": 33.30589070262597, + "grad_norm": 0.677655816078186, + "learning_rate": 6.670787792760823e-05, + "loss": 0.034337544441223146, + "step": 234640 + }, + { + "epoch": 33.307310149041875, + "grad_norm": 0.9228860139846802, + "learning_rate": 6.670645848119234e-05, + "loss": 0.014959685504436493, + "step": 234650 + }, + { + "epoch": 33.30872959545777, + "grad_norm": 7.429572105407715, + "learning_rate": 6.670503903477643e-05, + "loss": 0.0030174747109413145, + "step": 234660 + }, + { + "epoch": 33.31014904187367, + "grad_norm": 0.8459559082984924, + "learning_rate": 6.670361958836055e-05, + "loss": 
0.017225393652915956, + "step": 234670 + }, + { + "epoch": 33.311568488289566, + "grad_norm": 0.019635699689388275, + "learning_rate": 6.670220014194464e-05, + "loss": 0.0026556309312582016, + "step": 234680 + }, + { + "epoch": 33.31298793470547, + "grad_norm": 0.02224583551287651, + "learning_rate": 6.670078069552874e-05, + "loss": 0.020078234374523163, + "step": 234690 + }, + { + "epoch": 33.31440738112136, + "grad_norm": 0.034853409975767136, + "learning_rate": 6.669936124911285e-05, + "loss": 0.000836276262998581, + "step": 234700 + }, + { + "epoch": 33.31582682753726, + "grad_norm": 0.07765503227710724, + "learning_rate": 6.669794180269695e-05, + "loss": 0.020959998667240142, + "step": 234710 + }, + { + "epoch": 33.31724627395316, + "grad_norm": 0.06944771856069565, + "learning_rate": 6.669652235628106e-05, + "loss": 0.0027199208736419676, + "step": 234720 + }, + { + "epoch": 33.318665720369054, + "grad_norm": 3.006577491760254, + "learning_rate": 6.669510290986516e-05, + "loss": 0.004440510272979736, + "step": 234730 + }, + { + "epoch": 33.320085166784956, + "grad_norm": 0.007389269303530455, + "learning_rate": 6.669368346344925e-05, + "loss": 0.018882551789283754, + "step": 234740 + }, + { + "epoch": 33.32150461320085, + "grad_norm": 2.5920827388763428, + "learning_rate": 6.669226401703335e-05, + "loss": 0.01302606463432312, + "step": 234750 + }, + { + "epoch": 33.32292405961675, + "grad_norm": 7.698230743408203, + "learning_rate": 6.669084457061746e-05, + "loss": 0.013394021987915039, + "step": 234760 + }, + { + "epoch": 33.32434350603265, + "grad_norm": 2.619337320327759, + "learning_rate": 6.668942512420156e-05, + "loss": 0.01170523539185524, + "step": 234770 + }, + { + "epoch": 33.32576295244854, + "grad_norm": 0.012867116369307041, + "learning_rate": 6.668800567778567e-05, + "loss": 0.03431047201156616, + "step": 234780 + }, + { + "epoch": 33.327182398864444, + "grad_norm": 1.1649588346481323, + "learning_rate": 6.668658623136977e-05, + "loss": 
0.03159071505069733, + "step": 234790 + }, + { + "epoch": 33.32860184528034, + "grad_norm": 21.671737670898438, + "learning_rate": 6.668516678495387e-05, + "loss": 0.011280842870473862, + "step": 234800 + }, + { + "epoch": 33.33002129169624, + "grad_norm": 0.5960906147956848, + "learning_rate": 6.668374733853798e-05, + "loss": 0.024617476761341094, + "step": 234810 + }, + { + "epoch": 33.331440738112136, + "grad_norm": 8.573455810546875, + "learning_rate": 6.668232789212207e-05, + "loss": 0.009478040784597398, + "step": 234820 + }, + { + "epoch": 33.33286018452804, + "grad_norm": 8.772199630737305, + "learning_rate": 6.668090844570618e-05, + "loss": 0.014904661476612091, + "step": 234830 + }, + { + "epoch": 33.33427963094393, + "grad_norm": 13.281238555908203, + "learning_rate": 6.667948899929028e-05, + "loss": 0.03531968891620636, + "step": 234840 + }, + { + "epoch": 33.33569907735983, + "grad_norm": 0.1687241941690445, + "learning_rate": 6.667806955287438e-05, + "loss": 0.05764603018760681, + "step": 234850 + }, + { + "epoch": 33.33711852377573, + "grad_norm": 5.705677509307861, + "learning_rate": 6.667665010645848e-05, + "loss": 0.0064090371131896974, + "step": 234860 + }, + { + "epoch": 33.338537970191624, + "grad_norm": 2.3543097972869873, + "learning_rate": 6.667523066004259e-05, + "loss": 0.015577907860279083, + "step": 234870 + }, + { + "epoch": 33.339957416607525, + "grad_norm": 5.354547023773193, + "learning_rate": 6.667381121362669e-05, + "loss": 0.01168162226676941, + "step": 234880 + }, + { + "epoch": 33.34137686302342, + "grad_norm": 16.74074935913086, + "learning_rate": 6.66723917672108e-05, + "loss": 0.030881047248840332, + "step": 234890 + }, + { + "epoch": 33.34279630943932, + "grad_norm": 0.9549186825752258, + "learning_rate": 6.66709723207949e-05, + "loss": 0.002226455882191658, + "step": 234900 + }, + { + "epoch": 33.34421575585522, + "grad_norm": 0.1678696721792221, + "learning_rate": 6.666955287437899e-05, + "loss": 0.01232762485742569, + 
"step": 234910 + }, + { + "epoch": 33.34563520227111, + "grad_norm": 0.030697051435709, + "learning_rate": 6.66681334279631e-05, + "loss": 0.010964766144752502, + "step": 234920 + }, + { + "epoch": 33.34705464868701, + "grad_norm": 0.05631287768483162, + "learning_rate": 6.66667139815472e-05, + "loss": 0.0023579921573400496, + "step": 234930 + }, + { + "epoch": 33.34847409510291, + "grad_norm": 0.07416084408760071, + "learning_rate": 6.666529453513131e-05, + "loss": 0.01415758728981018, + "step": 234940 + }, + { + "epoch": 33.34989354151881, + "grad_norm": 0.08388140052556992, + "learning_rate": 6.66638750887154e-05, + "loss": 0.007225978374481201, + "step": 234950 + }, + { + "epoch": 33.351312987934705, + "grad_norm": 29.034101486206055, + "learning_rate": 6.66624556422995e-05, + "loss": 0.009948694705963134, + "step": 234960 + }, + { + "epoch": 33.35273243435061, + "grad_norm": 10.172396659851074, + "learning_rate": 6.66610361958836e-05, + "loss": 0.01689884066581726, + "step": 234970 + }, + { + "epoch": 33.3541518807665, + "grad_norm": 0.14728648960590363, + "learning_rate": 6.665961674946771e-05, + "loss": 0.0019043687731027602, + "step": 234980 + }, + { + "epoch": 33.355571327182396, + "grad_norm": 0.005285155028104782, + "learning_rate": 6.665819730305181e-05, + "loss": 0.004151477664709091, + "step": 234990 + }, + { + "epoch": 33.3569907735983, + "grad_norm": 0.04198514297604561, + "learning_rate": 6.665677785663591e-05, + "loss": 0.005835907906293869, + "step": 235000 + }, + { + "epoch": 33.3569907735983, + "eval_accuracy": 0.9816239587969734, + "eval_loss": 0.08126246184110641, + "eval_runtime": 34.4903, + "eval_samples_per_second": 455.983, + "eval_steps_per_second": 14.265, + "step": 235000 + }, + { + "epoch": 33.35841022001419, + "grad_norm": 0.2823791205883026, + "learning_rate": 6.665535841022002e-05, + "loss": 0.005201838165521622, + "step": 235010 + }, + { + "epoch": 33.359829666430095, + "grad_norm": 9.188587188720703, + "learning_rate": 
6.665393896380412e-05, + "loss": 0.048169839382171634, + "step": 235020 + }, + { + "epoch": 33.36124911284599, + "grad_norm": 3.401510238647461, + "learning_rate": 6.665251951738823e-05, + "loss": 0.02359810471534729, + "step": 235030 + }, + { + "epoch": 33.36266855926189, + "grad_norm": 2.4992940425872803, + "learning_rate": 6.665110007097232e-05, + "loss": 0.01188446506857872, + "step": 235040 + }, + { + "epoch": 33.364088005677786, + "grad_norm": 1.5170930624008179, + "learning_rate": 6.664968062455642e-05, + "loss": 0.005945419892668724, + "step": 235050 + }, + { + "epoch": 33.36550745209368, + "grad_norm": 0.804684042930603, + "learning_rate": 6.664826117814052e-05, + "loss": 0.007629917562007904, + "step": 235060 + }, + { + "epoch": 33.36692689850958, + "grad_norm": 0.3214358985424042, + "learning_rate": 6.664684173172463e-05, + "loss": 0.010605528950691223, + "step": 235070 + }, + { + "epoch": 33.36834634492548, + "grad_norm": 0.6096239686012268, + "learning_rate": 6.664542228530873e-05, + "loss": 0.017958980798721314, + "step": 235080 + }, + { + "epoch": 33.36976579134138, + "grad_norm": 15.330238342285156, + "learning_rate": 6.664400283889284e-05, + "loss": 0.03129295706748962, + "step": 235090 + }, + { + "epoch": 33.371185237757274, + "grad_norm": 0.08993805944919586, + "learning_rate": 6.664258339247695e-05, + "loss": 0.010407954454421997, + "step": 235100 + }, + { + "epoch": 33.372604684173176, + "grad_norm": 0.029439205303788185, + "learning_rate": 6.664116394606103e-05, + "loss": 0.0035878725349903105, + "step": 235110 + }, + { + "epoch": 33.37402413058907, + "grad_norm": 5.231251239776611, + "learning_rate": 6.663974449964514e-05, + "loss": 0.00774899274110794, + "step": 235120 + }, + { + "epoch": 33.375443577004965, + "grad_norm": 1.6018730401992798, + "learning_rate": 6.663832505322924e-05, + "loss": 0.014872407913208008, + "step": 235130 + }, + { + "epoch": 33.37686302342087, + "grad_norm": 0.1530383974313736, + "learning_rate": 
6.663690560681335e-05, + "loss": 0.011966295540332794, + "step": 235140 + }, + { + "epoch": 33.37828246983676, + "grad_norm": 0.9631804823875427, + "learning_rate": 6.663548616039745e-05, + "loss": 0.018679648637771606, + "step": 235150 + }, + { + "epoch": 33.379701916252664, + "grad_norm": 0.2622833251953125, + "learning_rate": 6.663406671398155e-05, + "loss": 0.010934543609619141, + "step": 235160 + }, + { + "epoch": 33.38112136266856, + "grad_norm": 0.1507440209388733, + "learning_rate": 6.663264726756564e-05, + "loss": 0.003171612694859505, + "step": 235170 + }, + { + "epoch": 33.38254080908446, + "grad_norm": 0.2600557506084442, + "learning_rate": 6.663122782114976e-05, + "loss": 0.0016430046409368515, + "step": 235180 + }, + { + "epoch": 33.383960255500355, + "grad_norm": 3.079826593399048, + "learning_rate": 6.662980837473387e-05, + "loss": 0.006145862489938736, + "step": 235190 + }, + { + "epoch": 33.38537970191625, + "grad_norm": 1.827941656112671, + "learning_rate": 6.662838892831796e-05, + "loss": 0.0014503873884677888, + "step": 235200 + }, + { + "epoch": 33.38679914833215, + "grad_norm": 0.0069921850226819515, + "learning_rate": 6.662696948190206e-05, + "loss": 0.017330604791641235, + "step": 235210 + }, + { + "epoch": 33.38821859474805, + "grad_norm": 0.04101432114839554, + "learning_rate": 6.662555003548616e-05, + "loss": 0.0010550927370786666, + "step": 235220 + }, + { + "epoch": 33.38963804116395, + "grad_norm": 0.47972235083580017, + "learning_rate": 6.662413058907027e-05, + "loss": 0.033364680409431455, + "step": 235230 + }, + { + "epoch": 33.39105748757984, + "grad_norm": 0.0061553954146802425, + "learning_rate": 6.662271114265437e-05, + "loss": 0.001547236368060112, + "step": 235240 + }, + { + "epoch": 33.392476933995745, + "grad_norm": 0.18937939405441284, + "learning_rate": 6.662129169623848e-05, + "loss": 0.0069836899638175964, + "step": 235250 + }, + { + "epoch": 33.39389638041164, + "grad_norm": 3.5285048484802246, + "learning_rate": 
6.661987224982256e-05, + "loss": 0.006449513137340546, + "step": 235260 + }, + { + "epoch": 33.395315826827535, + "grad_norm": 0.004784604534506798, + "learning_rate": 6.661845280340667e-05, + "loss": 0.001292988657951355, + "step": 235270 + }, + { + "epoch": 33.396735273243436, + "grad_norm": 0.34084323048591614, + "learning_rate": 6.661703335699078e-05, + "loss": 0.014249762892723084, + "step": 235280 + }, + { + "epoch": 33.39815471965933, + "grad_norm": 0.0389975979924202, + "learning_rate": 6.661561391057488e-05, + "loss": 0.002713264897465706, + "step": 235290 + }, + { + "epoch": 33.39957416607523, + "grad_norm": 0.7087458968162537, + "learning_rate": 6.661419446415899e-05, + "loss": 0.02759576141834259, + "step": 235300 + }, + { + "epoch": 33.40099361249113, + "grad_norm": 0.00935293734073639, + "learning_rate": 6.661277501774308e-05, + "loss": 0.03171307146549225, + "step": 235310 + }, + { + "epoch": 33.40241305890703, + "grad_norm": 1.1015084981918335, + "learning_rate": 6.661135557132719e-05, + "loss": 0.0028567012399435044, + "step": 235320 + }, + { + "epoch": 33.403832505322924, + "grad_norm": 0.20881448686122894, + "learning_rate": 6.660993612491128e-05, + "loss": 0.005067836865782738, + "step": 235330 + }, + { + "epoch": 33.40525195173882, + "grad_norm": 11.618258476257324, + "learning_rate": 6.66085166784954e-05, + "loss": 0.013276898860931396, + "step": 235340 + }, + { + "epoch": 33.40667139815472, + "grad_norm": 0.04803112521767616, + "learning_rate": 6.660709723207949e-05, + "loss": 0.010163235664367675, + "step": 235350 + }, + { + "epoch": 33.408090844570616, + "grad_norm": 3.6424291133880615, + "learning_rate": 6.660567778566359e-05, + "loss": 0.0017462756484746933, + "step": 235360 + }, + { + "epoch": 33.40951029098652, + "grad_norm": 0.07144174724817276, + "learning_rate": 6.66042583392477e-05, + "loss": 0.006791225075721741, + "step": 235370 + }, + { + "epoch": 33.41092973740241, + "grad_norm": 0.03227861225605011, + "learning_rate": 
6.66028388928318e-05, + "loss": 0.000998152419924736, + "step": 235380 + }, + { + "epoch": 33.412349183818314, + "grad_norm": 0.00404343381524086, + "learning_rate": 6.660141944641591e-05, + "loss": 0.014389073848724366, + "step": 235390 + }, + { + "epoch": 33.41376863023421, + "grad_norm": 9.240031242370605, + "learning_rate": 6.66e-05, + "loss": 0.028122138977050782, + "step": 235400 + }, + { + "epoch": 33.415188076650104, + "grad_norm": 0.7892340421676636, + "learning_rate": 6.65985805535841e-05, + "loss": 0.006256423890590668, + "step": 235410 + }, + { + "epoch": 33.416607523066006, + "grad_norm": 1.3016993999481201, + "learning_rate": 6.65971611071682e-05, + "loss": 0.009946727752685547, + "step": 235420 + }, + { + "epoch": 33.4180269694819, + "grad_norm": 0.6303293108940125, + "learning_rate": 6.659574166075231e-05, + "loss": 0.002230331301689148, + "step": 235430 + }, + { + "epoch": 33.4194464158978, + "grad_norm": 9.869156837463379, + "learning_rate": 6.659432221433641e-05, + "loss": 0.0055646166205406185, + "step": 235440 + }, + { + "epoch": 33.4208658623137, + "grad_norm": 10.80003547668457, + "learning_rate": 6.659290276792052e-05, + "loss": 0.027459606528282166, + "step": 235450 + }, + { + "epoch": 33.4222853087296, + "grad_norm": 0.02536005899310112, + "learning_rate": 6.659148332150462e-05, + "loss": 0.0030848663300275803, + "step": 235460 + }, + { + "epoch": 33.42370475514549, + "grad_norm": 0.3687489330768585, + "learning_rate": 6.659006387508871e-05, + "loss": 0.026397505402565004, + "step": 235470 + }, + { + "epoch": 33.42512420156139, + "grad_norm": 0.1944843977689743, + "learning_rate": 6.658864442867283e-05, + "loss": 0.013749566674232484, + "step": 235480 + }, + { + "epoch": 33.42654364797729, + "grad_norm": 0.014424562454223633, + "learning_rate": 6.658722498225692e-05, + "loss": 0.024265463650226592, + "step": 235490 + }, + { + "epoch": 33.427963094393185, + "grad_norm": 0.044132430106401443, + "learning_rate": 6.658580553584103e-05, + 
"loss": 0.009567026048898697, + "step": 235500 + }, + { + "epoch": 33.427963094393185, + "eval_accuracy": 0.987918865645069, + "eval_loss": 0.05181927978992462, + "eval_runtime": 33.9827, + "eval_samples_per_second": 462.794, + "eval_steps_per_second": 14.478, + "step": 235500 + }, + { + "epoch": 33.42938254080909, + "grad_norm": 3.449162721633911, + "learning_rate": 6.658438608942513e-05, + "loss": 0.029227519035339357, + "step": 235510 + }, + { + "epoch": 33.43080198722498, + "grad_norm": 0.09443685412406921, + "learning_rate": 6.658296664300923e-05, + "loss": 0.018623116612434387, + "step": 235520 + }, + { + "epoch": 33.43222143364088, + "grad_norm": 0.04425282031297684, + "learning_rate": 6.658154719659333e-05, + "loss": 0.00335221104323864, + "step": 235530 + }, + { + "epoch": 33.43364088005678, + "grad_norm": 0.032668713480234146, + "learning_rate": 6.658012775017744e-05, + "loss": 0.0014457322657108306, + "step": 235540 + }, + { + "epoch": 33.43506032647267, + "grad_norm": 5.195737838745117, + "learning_rate": 6.657870830376153e-05, + "loss": 0.03636741042137146, + "step": 235550 + }, + { + "epoch": 33.436479772888575, + "grad_norm": 18.16716957092285, + "learning_rate": 6.657728885734565e-05, + "loss": 0.018603785336017607, + "step": 235560 + }, + { + "epoch": 33.43789921930447, + "grad_norm": 0.041110165417194366, + "learning_rate": 6.657586941092974e-05, + "loss": 0.01847650408744812, + "step": 235570 + }, + { + "epoch": 33.43931866572037, + "grad_norm": 2.1264376640319824, + "learning_rate": 6.657444996451384e-05, + "loss": 0.0029576711356639864, + "step": 235580 + }, + { + "epoch": 33.440738112136266, + "grad_norm": 0.15681371092796326, + "learning_rate": 6.657303051809795e-05, + "loss": 0.0037828199565410614, + "step": 235590 + }, + { + "epoch": 33.44215755855217, + "grad_norm": 0.17200833559036255, + "learning_rate": 6.657161107168205e-05, + "loss": 0.004662205651402474, + "step": 235600 + }, + { + "epoch": 33.44357700496806, + "grad_norm": 
0.07213917374610901, + "learning_rate": 6.657019162526616e-05, + "loss": 0.0075795017182827, + "step": 235610 + }, + { + "epoch": 33.44499645138396, + "grad_norm": 0.04245593026280403, + "learning_rate": 6.656877217885024e-05, + "loss": 0.0015060491859912872, + "step": 235620 + }, + { + "epoch": 33.44641589779986, + "grad_norm": 0.048839300870895386, + "learning_rate": 6.656735273243435e-05, + "loss": 0.030129536986351013, + "step": 235630 + }, + { + "epoch": 33.447835344215754, + "grad_norm": 0.027552437037229538, + "learning_rate": 6.656593328601845e-05, + "loss": 0.06833714246749878, + "step": 235640 + }, + { + "epoch": 33.449254790631656, + "grad_norm": 0.022810988128185272, + "learning_rate": 6.656451383960256e-05, + "loss": 0.0013104446232318878, + "step": 235650 + }, + { + "epoch": 33.45067423704755, + "grad_norm": 0.025451157242059708, + "learning_rate": 6.656309439318666e-05, + "loss": 0.0006534405052661896, + "step": 235660 + }, + { + "epoch": 33.45209368346345, + "grad_norm": 0.19485986232757568, + "learning_rate": 6.656167494677076e-05, + "loss": 0.009499014914035797, + "step": 235670 + }, + { + "epoch": 33.45351312987935, + "grad_norm": 0.047403182834386826, + "learning_rate": 6.656025550035487e-05, + "loss": 0.002471959590911865, + "step": 235680 + }, + { + "epoch": 33.45493257629524, + "grad_norm": 0.07094839960336685, + "learning_rate": 6.655883605393897e-05, + "loss": 0.0076077252626419066, + "step": 235690 + }, + { + "epoch": 33.456352022711144, + "grad_norm": 0.014107485301792622, + "learning_rate": 6.655741660752308e-05, + "loss": 0.010783177614212037, + "step": 235700 + }, + { + "epoch": 33.45777146912704, + "grad_norm": 0.48276394605636597, + "learning_rate": 6.655599716110717e-05, + "loss": 0.0408993661403656, + "step": 235710 + }, + { + "epoch": 33.45919091554294, + "grad_norm": 0.1214965209364891, + "learning_rate": 6.655457771469127e-05, + "loss": 0.02500196695327759, + "step": 235720 + }, + { + "epoch": 33.460610361958835, + "grad_norm": 
0.022731337696313858, + "learning_rate": 6.655315826827537e-05, + "loss": 0.0009846098721027375, + "step": 235730 + }, + { + "epoch": 33.46202980837474, + "grad_norm": 3.4646096229553223, + "learning_rate": 6.655173882185948e-05, + "loss": 0.0030526302754878996, + "step": 235740 + }, + { + "epoch": 33.46344925479063, + "grad_norm": 0.0010678601684048772, + "learning_rate": 6.655031937544358e-05, + "loss": 0.018603216111660003, + "step": 235750 + }, + { + "epoch": 33.46486870120653, + "grad_norm": 18.439258575439453, + "learning_rate": 6.654889992902769e-05, + "loss": 0.03088049292564392, + "step": 235760 + }, + { + "epoch": 33.46628814762243, + "grad_norm": 1.8647218942642212, + "learning_rate": 6.654748048261179e-05, + "loss": 0.0023313678801059725, + "step": 235770 + }, + { + "epoch": 33.46770759403832, + "grad_norm": 0.022777894511818886, + "learning_rate": 6.654606103619588e-05, + "loss": 0.022899892926216126, + "step": 235780 + }, + { + "epoch": 33.469127040454225, + "grad_norm": 3.182232618331909, + "learning_rate": 6.654464158978e-05, + "loss": 0.004709577187895775, + "step": 235790 + }, + { + "epoch": 33.47054648687012, + "grad_norm": 0.007122524082660675, + "learning_rate": 6.654322214336409e-05, + "loss": 0.00163588747382164, + "step": 235800 + }, + { + "epoch": 33.47196593328602, + "grad_norm": 3.9915223121643066, + "learning_rate": 6.65418026969482e-05, + "loss": 0.0022245321422815325, + "step": 235810 + }, + { + "epoch": 33.47338537970192, + "grad_norm": 1.2849191427230835, + "learning_rate": 6.65403832505323e-05, + "loss": 0.04009699821472168, + "step": 235820 + }, + { + "epoch": 33.47480482611781, + "grad_norm": 0.011669970117509365, + "learning_rate": 6.65389638041164e-05, + "loss": 0.009413519501686096, + "step": 235830 + }, + { + "epoch": 33.47622427253371, + "grad_norm": 13.86462688446045, + "learning_rate": 6.65375443577005e-05, + "loss": 0.009452444314956666, + "step": 235840 + }, + { + "epoch": 33.47764371894961, + "grad_norm": 
0.6683921813964844, + "learning_rate": 6.65361249112846e-05, + "loss": 0.01467156559228897, + "step": 235850 + }, + { + "epoch": 33.47906316536551, + "grad_norm": 5.290279865264893, + "learning_rate": 6.65347054648687e-05, + "loss": 0.03377980589866638, + "step": 235860 + }, + { + "epoch": 33.480482611781405, + "grad_norm": 0.1595839411020279, + "learning_rate": 6.653328601845281e-05, + "loss": 0.0016872625797986983, + "step": 235870 + }, + { + "epoch": 33.481902058197306, + "grad_norm": 0.17075879871845245, + "learning_rate": 6.653186657203691e-05, + "loss": 0.0026704508811235426, + "step": 235880 + }, + { + "epoch": 33.4833215046132, + "grad_norm": 5.776748180389404, + "learning_rate": 6.653044712562101e-05, + "loss": 0.024500134587287902, + "step": 235890 + }, + { + "epoch": 33.484740951029096, + "grad_norm": 1.0520236492156982, + "learning_rate": 6.652902767920512e-05, + "loss": 0.005768214538693428, + "step": 235900 + }, + { + "epoch": 33.486160397445, + "grad_norm": 0.237184077501297, + "learning_rate": 6.652760823278922e-05, + "loss": 0.07140228748321534, + "step": 235910 + }, + { + "epoch": 33.48757984386089, + "grad_norm": 15.787062644958496, + "learning_rate": 6.652618878637333e-05, + "loss": 0.020879819989204407, + "step": 235920 + }, + { + "epoch": 33.488999290276794, + "grad_norm": 1.7011421918869019, + "learning_rate": 6.652476933995741e-05, + "loss": 0.0037321679294109343, + "step": 235930 + }, + { + "epoch": 33.49041873669269, + "grad_norm": 6.780256271362305, + "learning_rate": 6.652334989354152e-05, + "loss": 0.003699200227856636, + "step": 235940 + }, + { + "epoch": 33.49183818310859, + "grad_norm": 0.5206728577613831, + "learning_rate": 6.652193044712562e-05, + "loss": 0.0232962965965271, + "step": 235950 + }, + { + "epoch": 33.493257629524486, + "grad_norm": 0.1160406544804573, + "learning_rate": 6.652051100070973e-05, + "loss": 0.00516592338681221, + "step": 235960 + }, + { + "epoch": 33.49467707594038, + "grad_norm": 5.341609954833984, + 
"learning_rate": 6.651909155429383e-05, + "loss": 0.021324607729911804, + "step": 235970 + }, + { + "epoch": 33.49609652235628, + "grad_norm": 16.140546798706055, + "learning_rate": 6.651767210787792e-05, + "loss": 0.013221198320388794, + "step": 235980 + }, + { + "epoch": 33.49751596877218, + "grad_norm": 0.13565504550933838, + "learning_rate": 6.651625266146204e-05, + "loss": 0.04967617392539978, + "step": 235990 + }, + { + "epoch": 33.49893541518808, + "grad_norm": 0.12654809653759003, + "learning_rate": 6.651483321504613e-05, + "loss": 0.034380877017974855, + "step": 236000 + }, + { + "epoch": 33.49893541518808, + "eval_accuracy": 0.9860749030330006, + "eval_loss": 0.06354205310344696, + "eval_runtime": 33.2876, + "eval_samples_per_second": 472.458, + "eval_steps_per_second": 14.78, + "step": 236000 + }, + { + "epoch": 33.500354861603974, + "grad_norm": 19.420103073120117, + "learning_rate": 6.651341376863024e-05, + "loss": 0.007565267384052277, + "step": 236010 + }, + { + "epoch": 33.501774308019876, + "grad_norm": 6.376337051391602, + "learning_rate": 6.651199432221434e-05, + "loss": 0.007062363624572754, + "step": 236020 + }, + { + "epoch": 33.50319375443577, + "grad_norm": 0.020643705502152443, + "learning_rate": 6.651057487579844e-05, + "loss": 0.01391189992427826, + "step": 236030 + }, + { + "epoch": 33.504613200851665, + "grad_norm": 0.193635031580925, + "learning_rate": 6.650915542938254e-05, + "loss": 0.005590712279081344, + "step": 236040 + }, + { + "epoch": 33.50603264726757, + "grad_norm": 0.3622765839099884, + "learning_rate": 6.650773598296665e-05, + "loss": 0.01452556997537613, + "step": 236050 + }, + { + "epoch": 33.50745209368346, + "grad_norm": 0.011631938628852367, + "learning_rate": 6.650631653655074e-05, + "loss": 0.015374015271663665, + "step": 236060 + }, + { + "epoch": 33.50887154009936, + "grad_norm": 0.556368887424469, + "learning_rate": 6.650489709013486e-05, + "loss": 0.006302888691425324, + "step": 236070 + }, + { + "epoch": 
33.51029098651526, + "grad_norm": 0.02873803675174713, + "learning_rate": 6.650347764371895e-05, + "loss": 0.01258976012468338, + "step": 236080 + }, + { + "epoch": 33.51171043293116, + "grad_norm": 2.368380308151245, + "learning_rate": 6.650205819730305e-05, + "loss": 0.0014314144849777223, + "step": 236090 + }, + { + "epoch": 33.513129879347055, + "grad_norm": 6.239690780639648, + "learning_rate": 6.650063875088716e-05, + "loss": 0.006194013357162476, + "step": 236100 + }, + { + "epoch": 33.51454932576295, + "grad_norm": 1.027497410774231, + "learning_rate": 6.649921930447126e-05, + "loss": 0.004256626963615418, + "step": 236110 + }, + { + "epoch": 33.51596877217885, + "grad_norm": 0.36154264211654663, + "learning_rate": 6.649779985805537e-05, + "loss": 0.011199262738227845, + "step": 236120 + }, + { + "epoch": 33.517388218594746, + "grad_norm": 0.4511148929595947, + "learning_rate": 6.649638041163945e-05, + "loss": 0.014057667553424835, + "step": 236130 + }, + { + "epoch": 33.51880766501065, + "grad_norm": 2.0565364360809326, + "learning_rate": 6.649496096522356e-05, + "loss": 0.005125245079398155, + "step": 236140 + }, + { + "epoch": 33.52022711142654, + "grad_norm": 0.06335282325744629, + "learning_rate": 6.649354151880766e-05, + "loss": 0.0011022675782442093, + "step": 236150 + }, + { + "epoch": 33.521646557842445, + "grad_norm": 0.013669699430465698, + "learning_rate": 6.649212207239177e-05, + "loss": 0.008982424437999726, + "step": 236160 + }, + { + "epoch": 33.52306600425834, + "grad_norm": 0.5645576119422913, + "learning_rate": 6.649070262597587e-05, + "loss": 0.0017159886658191681, + "step": 236170 + }, + { + "epoch": 33.524485450674234, + "grad_norm": 0.018745195120573044, + "learning_rate": 6.648928317955998e-05, + "loss": 0.007560199499130249, + "step": 236180 + }, + { + "epoch": 33.525904897090136, + "grad_norm": 0.6842513084411621, + "learning_rate": 6.648786373314408e-05, + "loss": 0.01748439818620682, + "step": 236190 + }, + { + "epoch": 
33.52732434350603, + "grad_norm": 0.08590345829725266, + "learning_rate": 6.648644428672818e-05, + "loss": 0.014878953993320464, + "step": 236200 + }, + { + "epoch": 33.52874378992193, + "grad_norm": 1.1169626712799072, + "learning_rate": 6.648502484031229e-05, + "loss": 0.01270223706960678, + "step": 236210 + }, + { + "epoch": 33.53016323633783, + "grad_norm": 0.12039110064506531, + "learning_rate": 6.648360539389638e-05, + "loss": 0.003787514567375183, + "step": 236220 + }, + { + "epoch": 33.53158268275373, + "grad_norm": 0.18446625769138336, + "learning_rate": 6.64821859474805e-05, + "loss": 0.021686600148677827, + "step": 236230 + }, + { + "epoch": 33.533002129169624, + "grad_norm": 1.7765637636184692, + "learning_rate": 6.648076650106458e-05, + "loss": 0.011064283549785614, + "step": 236240 + }, + { + "epoch": 33.53442157558552, + "grad_norm": 0.014579021371901035, + "learning_rate": 6.647934705464869e-05, + "loss": 0.0008182946592569351, + "step": 236250 + }, + { + "epoch": 33.53584102200142, + "grad_norm": 0.26481446623802185, + "learning_rate": 6.647792760823279e-05, + "loss": 0.0027527812868356704, + "step": 236260 + }, + { + "epoch": 33.537260468417315, + "grad_norm": 0.011365306563675404, + "learning_rate": 6.64765081618169e-05, + "loss": 0.0009671632200479507, + "step": 236270 + }, + { + "epoch": 33.53867991483322, + "grad_norm": 0.9495236277580261, + "learning_rate": 6.6475088715401e-05, + "loss": 0.0007413089275360108, + "step": 236280 + }, + { + "epoch": 33.54009936124911, + "grad_norm": 0.0399358756840229, + "learning_rate": 6.647366926898509e-05, + "loss": 0.005436543002724647, + "step": 236290 + }, + { + "epoch": 33.541518807665014, + "grad_norm": 0.17820541560649872, + "learning_rate": 6.64722498225692e-05, + "loss": 0.003376581147313118, + "step": 236300 + }, + { + "epoch": 33.54293825408091, + "grad_norm": 0.08540186285972595, + "learning_rate": 6.64708303761533e-05, + "loss": 0.00275021567940712, + "step": 236310 + }, + { + "epoch": 
33.5443577004968, + "grad_norm": 9.0049409866333, + "learning_rate": 6.646941092973741e-05, + "loss": 0.005267953500151634, + "step": 236320 + }, + { + "epoch": 33.545777146912705, + "grad_norm": 0.4982687532901764, + "learning_rate": 6.646799148332151e-05, + "loss": 0.005479089915752411, + "step": 236330 + }, + { + "epoch": 33.5471965933286, + "grad_norm": 9.083754539489746, + "learning_rate": 6.64665720369056e-05, + "loss": 0.004898447915911675, + "step": 236340 + }, + { + "epoch": 33.5486160397445, + "grad_norm": 0.003461139742285013, + "learning_rate": 6.64651525904897e-05, + "loss": 0.007118852436542511, + "step": 236350 + }, + { + "epoch": 33.5500354861604, + "grad_norm": 0.0302155539393425, + "learning_rate": 6.646373314407381e-05, + "loss": 0.002137655019760132, + "step": 236360 + }, + { + "epoch": 33.5514549325763, + "grad_norm": 0.00435618543997407, + "learning_rate": 6.646231369765791e-05, + "loss": 0.03833278715610504, + "step": 236370 + }, + { + "epoch": 33.55287437899219, + "grad_norm": 1.2776954174041748, + "learning_rate": 6.646089425124202e-05, + "loss": 0.011963607370853424, + "step": 236380 + }, + { + "epoch": 33.55429382540809, + "grad_norm": 24.601215362548828, + "learning_rate": 6.645947480482612e-05, + "loss": 0.023995471000671387, + "step": 236390 + }, + { + "epoch": 33.55571327182399, + "grad_norm": 0.040073294192552567, + "learning_rate": 6.645805535841022e-05, + "loss": 0.021477241814136506, + "step": 236400 + }, + { + "epoch": 33.557132718239885, + "grad_norm": 1.0081900358200073, + "learning_rate": 6.645663591199433e-05, + "loss": 0.08554401993751526, + "step": 236410 + }, + { + "epoch": 33.55855216465579, + "grad_norm": 0.011840836144983768, + "learning_rate": 6.645521646557843e-05, + "loss": 0.013283661007881165, + "step": 236420 + }, + { + "epoch": 33.55997161107168, + "grad_norm": 0.03918057680130005, + "learning_rate": 6.645379701916254e-05, + "loss": 0.008303044736385346, + "step": 236430 + }, + { + "epoch": 33.56139105748758, + 
"grad_norm": 0.23121967911720276, + "learning_rate": 6.645237757274662e-05, + "loss": 0.016514645516872407, + "step": 236440 + }, + { + "epoch": 33.56281050390348, + "grad_norm": 0.015227931551635265, + "learning_rate": 6.645095812633073e-05, + "loss": 0.0037471286952495573, + "step": 236450 + }, + { + "epoch": 33.56422995031937, + "grad_norm": 0.046524375677108765, + "learning_rate": 6.644953867991483e-05, + "loss": 0.0026313338428735735, + "step": 236460 + }, + { + "epoch": 33.565649396735274, + "grad_norm": 0.022758420556783676, + "learning_rate": 6.644811923349894e-05, + "loss": 0.007217519730329513, + "step": 236470 + }, + { + "epoch": 33.56706884315117, + "grad_norm": 0.021831806749105453, + "learning_rate": 6.644669978708304e-05, + "loss": 0.0007819216698408127, + "step": 236480 + }, + { + "epoch": 33.56848828956707, + "grad_norm": 0.13722175359725952, + "learning_rate": 6.644528034066714e-05, + "loss": 0.03399091958999634, + "step": 236490 + }, + { + "epoch": 33.569907735982966, + "grad_norm": 3.25241756439209, + "learning_rate": 6.644386089425125e-05, + "loss": 0.011285433173179626, + "step": 236500 + }, + { + "epoch": 33.569907735982966, + "eval_accuracy": 0.9886182997393018, + "eval_loss": 0.045976631343364716, + "eval_runtime": 34.7511, + "eval_samples_per_second": 452.561, + "eval_steps_per_second": 14.158, + "step": 236500 + }, + { + "epoch": 33.57132718239887, + "grad_norm": 2.08339524269104, + "learning_rate": 6.644244144783534e-05, + "loss": 0.002354852482676506, + "step": 236510 + }, + { + "epoch": 33.57274662881476, + "grad_norm": 2.5385875701904297, + "learning_rate": 6.644102200141945e-05, + "loss": 0.003114531934261322, + "step": 236520 + }, + { + "epoch": 33.57416607523066, + "grad_norm": 0.07088449597358704, + "learning_rate": 6.643960255500355e-05, + "loss": 0.005603602156043053, + "step": 236530 + }, + { + "epoch": 33.57558552164656, + "grad_norm": 0.018828442320227623, + "learning_rate": 6.643818310858766e-05, + "loss": 
0.010539069771766663, + "step": 236540 + }, + { + "epoch": 33.577004968062454, + "grad_norm": 0.24582193791866302, + "learning_rate": 6.643676366217175e-05, + "loss": 0.019606320559978484, + "step": 236550 + }, + { + "epoch": 33.578424414478356, + "grad_norm": 0.03349224478006363, + "learning_rate": 6.643534421575586e-05, + "loss": 0.0009034741669893265, + "step": 236560 + }, + { + "epoch": 33.57984386089425, + "grad_norm": 0.1840941458940506, + "learning_rate": 6.643392476933995e-05, + "loss": 0.003731156513094902, + "step": 236570 + }, + { + "epoch": 33.58126330731015, + "grad_norm": 0.0032088211737573147, + "learning_rate": 6.643250532292407e-05, + "loss": 0.024331894516944886, + "step": 236580 + }, + { + "epoch": 33.58268275372605, + "grad_norm": 0.1330462247133255, + "learning_rate": 6.643108587650818e-05, + "loss": 0.028975272178649904, + "step": 236590 + }, + { + "epoch": 33.58410220014194, + "grad_norm": 2.338381052017212, + "learning_rate": 6.642966643009226e-05, + "loss": 0.013097062706947327, + "step": 236600 + }, + { + "epoch": 33.585521646557844, + "grad_norm": 0.2813742160797119, + "learning_rate": 6.642824698367637e-05, + "loss": 0.03513563573360443, + "step": 236610 + }, + { + "epoch": 33.58694109297374, + "grad_norm": 17.224775314331055, + "learning_rate": 6.642682753726047e-05, + "loss": 0.006093912944197655, + "step": 236620 + }, + { + "epoch": 33.58836053938964, + "grad_norm": 5.648160457611084, + "learning_rate": 6.642540809084458e-05, + "loss": 0.007199600338935852, + "step": 236630 + }, + { + "epoch": 33.589779985805535, + "grad_norm": 0.60243159532547, + "learning_rate": 6.642398864442868e-05, + "loss": 0.022020699083805086, + "step": 236640 + }, + { + "epoch": 33.59119943222144, + "grad_norm": 4.475730895996094, + "learning_rate": 6.642256919801277e-05, + "loss": 0.006486359238624573, + "step": 236650 + }, + { + "epoch": 33.59261887863733, + "grad_norm": 0.21232713758945465, + "learning_rate": 6.642114975159687e-05, + "loss": 
0.03700805008411408, + "step": 236660 + }, + { + "epoch": 33.594038325053226, + "grad_norm": 0.7617281079292297, + "learning_rate": 6.641973030518098e-05, + "loss": 0.008928616344928742, + "step": 236670 + }, + { + "epoch": 33.59545777146913, + "grad_norm": 0.5175411701202393, + "learning_rate": 6.64183108587651e-05, + "loss": 0.02087359130382538, + "step": 236680 + }, + { + "epoch": 33.59687721788502, + "grad_norm": 0.01709691621363163, + "learning_rate": 6.641689141234919e-05, + "loss": 0.03176991939544678, + "step": 236690 + }, + { + "epoch": 33.598296664300925, + "grad_norm": 19.185970306396484, + "learning_rate": 6.641547196593329e-05, + "loss": 0.06384724378585815, + "step": 236700 + }, + { + "epoch": 33.59971611071682, + "grad_norm": 1.3626805543899536, + "learning_rate": 6.641405251951739e-05, + "loss": 0.0052518840879201886, + "step": 236710 + }, + { + "epoch": 33.60113555713272, + "grad_norm": 0.02667025476694107, + "learning_rate": 6.64126330731015e-05, + "loss": 0.0008886933326721191, + "step": 236720 + }, + { + "epoch": 33.602555003548616, + "grad_norm": 0.05405401065945625, + "learning_rate": 6.64112136266856e-05, + "loss": 0.023034927248954774, + "step": 236730 + }, + { + "epoch": 33.60397444996451, + "grad_norm": 0.04472708702087402, + "learning_rate": 6.64097941802697e-05, + "loss": 0.005884993076324463, + "step": 236740 + }, + { + "epoch": 33.60539389638041, + "grad_norm": 0.48857685923576355, + "learning_rate": 6.640837473385379e-05, + "loss": 0.048942452669143675, + "step": 236750 + }, + { + "epoch": 33.60681334279631, + "grad_norm": 0.1200292780995369, + "learning_rate": 6.64069552874379e-05, + "loss": 0.03967183530330658, + "step": 236760 + }, + { + "epoch": 33.60823278921221, + "grad_norm": 1.0115933418273926, + "learning_rate": 6.640553584102201e-05, + "loss": 0.01583397835493088, + "step": 236770 + }, + { + "epoch": 33.609652235628104, + "grad_norm": 0.013855859637260437, + "learning_rate": 6.640411639460611e-05, + "loss": 
0.026114878058433533, + "step": 236780 + }, + { + "epoch": 33.611071682044006, + "grad_norm": 0.18910335004329681, + "learning_rate": 6.640269694819022e-05, + "loss": 0.0236398309469223, + "step": 236790 + }, + { + "epoch": 33.6124911284599, + "grad_norm": 0.41339996457099915, + "learning_rate": 6.64012775017743e-05, + "loss": 0.026897454261779787, + "step": 236800 + }, + { + "epoch": 33.613910574875796, + "grad_norm": 0.7661712169647217, + "learning_rate": 6.639985805535841e-05, + "loss": 0.034416279196739195, + "step": 236810 + }, + { + "epoch": 33.6153300212917, + "grad_norm": 0.031739648431539536, + "learning_rate": 6.639843860894251e-05, + "loss": 0.013792920112609863, + "step": 236820 + }, + { + "epoch": 33.61674946770759, + "grad_norm": 30.670103073120117, + "learning_rate": 6.639701916252662e-05, + "loss": 0.014661300182342529, + "step": 236830 + }, + { + "epoch": 33.618168914123494, + "grad_norm": 0.024655727669596672, + "learning_rate": 6.639559971611072e-05, + "loss": 0.0035085514187812804, + "step": 236840 + }, + { + "epoch": 33.61958836053939, + "grad_norm": 0.4393582344055176, + "learning_rate": 6.639418026969482e-05, + "loss": 0.003168940171599388, + "step": 236850 + }, + { + "epoch": 33.62100780695529, + "grad_norm": 0.06497247517108917, + "learning_rate": 6.639276082327893e-05, + "loss": 0.01722613275051117, + "step": 236860 + }, + { + "epoch": 33.622427253371185, + "grad_norm": 1.8670403957366943, + "learning_rate": 6.639134137686303e-05, + "loss": 0.01887476146221161, + "step": 236870 + }, + { + "epoch": 33.62384669978708, + "grad_norm": 0.002441816497594118, + "learning_rate": 6.638992193044714e-05, + "loss": 0.015341357886791229, + "step": 236880 + }, + { + "epoch": 33.62526614620298, + "grad_norm": 0.2543332874774933, + "learning_rate": 6.638850248403123e-05, + "loss": 0.03128589391708374, + "step": 236890 + }, + { + "epoch": 33.62668559261888, + "grad_norm": 1.2103768587112427, + "learning_rate": 6.638708303761534e-05, + "loss": 
0.039283165335655214, + "step": 236900 + }, + { + "epoch": 33.62810503903478, + "grad_norm": 1.0677180290222168, + "learning_rate": 6.638566359119943e-05, + "loss": 0.04233380854129791, + "step": 236910 + }, + { + "epoch": 33.62952448545067, + "grad_norm": 0.06582970917224884, + "learning_rate": 6.638424414478354e-05, + "loss": 0.013340990245342254, + "step": 236920 + }, + { + "epoch": 33.630943931866575, + "grad_norm": 0.28778210282325745, + "learning_rate": 6.638282469836764e-05, + "loss": 0.012894253432750701, + "step": 236930 + }, + { + "epoch": 33.63236337828247, + "grad_norm": 0.07647979259490967, + "learning_rate": 6.638140525195175e-05, + "loss": 0.017018085718154906, + "step": 236940 + }, + { + "epoch": 33.633782824698365, + "grad_norm": 0.44766396284103394, + "learning_rate": 6.637998580553584e-05, + "loss": 0.0007906127721071243, + "step": 236950 + }, + { + "epoch": 33.63520227111427, + "grad_norm": 0.016751157119870186, + "learning_rate": 6.637856635911994e-05, + "loss": 0.0008848458528518677, + "step": 236960 + }, + { + "epoch": 33.63662171753016, + "grad_norm": 0.3203131854534149, + "learning_rate": 6.637714691270405e-05, + "loss": 0.0035949133336544035, + "step": 236970 + }, + { + "epoch": 33.63804116394606, + "grad_norm": 0.08986318111419678, + "learning_rate": 6.637572746628815e-05, + "loss": 0.03398029208183288, + "step": 236980 + }, + { + "epoch": 33.63946061036196, + "grad_norm": 19.360506057739258, + "learning_rate": 6.637430801987226e-05, + "loss": 0.014412401616573334, + "step": 236990 + }, + { + "epoch": 33.64088005677786, + "grad_norm": 0.14479483664035797, + "learning_rate": 6.637288857345636e-05, + "loss": 0.013692456483840942, + "step": 237000 + }, + { + "epoch": 33.64088005677786, + "eval_accuracy": 0.9877916958097539, + "eval_loss": 0.05581839755177498, + "eval_runtime": 33.1109, + "eval_samples_per_second": 474.98, + "eval_steps_per_second": 14.859, + "step": 237000 + } + ], + "logging_steps": 10, + "max_steps": 704500, + 
"num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}