diff --git "a/resnet50/checkpoint-24500/trainer_state.json" "b/resnet50/checkpoint-24500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/resnet50/checkpoint-24500/trainer_state.json" @@ -0,0 +1,17625 @@ +{ + "best_global_step": 23000, + "best_metric": 0.9688433903478095, + "best_model_checkpoint": "/workspace/output/resnet50/checkpoint-23000", + "epoch": 3.4776437189496097, + "eval_steps": 500, + "global_step": 24500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014194464158978, + "grad_norm": 3.2342276573181152, + "learning_rate": 9.999872249822569e-05, + "loss": 5.98863525390625, + "step": 10 + }, + { + "epoch": 0.0028388928317956, + "grad_norm": 3.3994972705841064, + "learning_rate": 9.99973030518098e-05, + "loss": 5.97633056640625, + "step": 20 + }, + { + "epoch": 0.0042583392476933995, + "grad_norm": 3.3180341720581055, + "learning_rate": 9.99958836053939e-05, + "loss": 5.97711181640625, + "step": 30 + }, + { + "epoch": 0.0056777856635912, + "grad_norm": 2.9379143714904785, + "learning_rate": 9.999446415897801e-05, + "loss": 5.9991455078125, + "step": 40 + }, + { + "epoch": 0.007097232079488999, + "grad_norm": 2.2698018550872803, + "learning_rate": 9.99930447125621e-05, + "loss": 5.96363525390625, + "step": 50 + }, + { + "epoch": 0.008516678495386799, + "grad_norm": 2.0626659393310547, + "learning_rate": 9.99916252661462e-05, + "loss": 5.96995849609375, + "step": 60 + }, + { + "epoch": 0.0099361249112846, + "grad_norm": 2.814460277557373, + "learning_rate": 9.999020581973031e-05, + "loss": 5.9493408203125, + "step": 70 + }, + { + "epoch": 0.0113555713271824, + "grad_norm": 2.871051788330078, + "learning_rate": 9.998878637331441e-05, + "loss": 5.9510498046875, + "step": 80 + }, + { + "epoch": 0.0127750177430802, + "grad_norm": 2.3897151947021484, + "learning_rate": 9.998736692689852e-05, + "loss": 5.94254150390625, + "step": 90 + }, + { + "epoch": 0.014194464158977998, + "grad_norm": 2.9910531044006348, + "learning_rate": 9.99859474804826e-05, + "loss": 5.9062255859375, + "step": 100 + }, + { + "epoch": 0.015613910574875798, + "grad_norm": 3.137518882751465, + "learning_rate": 9.998452803406672e-05, + "loss": 5.9070068359375, + "step": 110 + }, + { + "epoch": 0.017033356990773598, + "grad_norm": 3.021024703979492, + "learning_rate": 9.998310858765082e-05, + "loss": 5.87197265625, + "step": 120 + }, + { + "epoch": 0.018452803406671398, + "grad_norm": 3.499450445175171, + "learning_rate": 9.998168914123493e-05, + "loss": 5.8237548828125, + "step": 130 + }, + { + "epoch": 0.0198722498225692, + "grad_norm": 3.87576961517334, + "learning_rate": 9.998026969481902e-05, + "loss": 5.754150390625, + "step": 140 + }, + { + "epoch": 0.021291696238467, + "grad_norm": 3.9846458435058594, + "learning_rate": 9.997885024840313e-05, + "loss": 5.697198486328125, + "step": 150 + }, + { + "epoch": 0.0227111426543648, + "grad_norm": 4.339130878448486, + "learning_rate": 9.997743080198723e-05, + "loss": 5.63760986328125, + "step": 160 + }, + { + "epoch": 0.0241305890702626, + "grad_norm": 4.891483783721924, + "learning_rate": 9.997601135557133e-05, + "loss": 5.5271728515625, + "step": 170 + }, + { + "epoch": 0.0255500354861604, + "grad_norm": 5.147222995758057, + "learning_rate": 9.997459190915544e-05, + "loss": 5.45938720703125, + "step": 180 + }, + { + "epoch": 0.0269694819020582, + "grad_norm": 5.365755558013916, + "learning_rate": 9.997317246273954e-05, + "loss": 5.355255126953125, + "step": 190 + }, + { + "epoch": 0.028388928317955996, + "grad_norm": 5.888001918792725, + "learning_rate": 9.997175301632365e-05, + "loss": 5.1554931640625, + "step": 200 + }, + { + "epoch": 0.029808374733853796, + "grad_norm": 6.100172996520996, + "learning_rate": 9.997033356990773e-05, + "loss": 5.035284423828125, + "step": 210 + }, + { + "epoch": 0.031227821149751596, + "grad_norm": 6.491486549377441, + "learning_rate": 9.996891412349184e-05, + "loss": 4.899530029296875, + "step": 220 + }, + { + "epoch": 0.032647267565649396, + "grad_norm": 6.916806697845459, + "learning_rate": 9.996749467707594e-05, + "loss": 4.851350402832031, + "step": 230 + }, + { + "epoch": 0.034066713981547196, + "grad_norm": 6.837950706481934, + "learning_rate": 9.996607523066005e-05, + "loss": 4.726431274414063, + "step": 240 + }, + { + "epoch": 0.035486160397444996, + "grad_norm": 7.554074287414551, + "learning_rate": 9.996465578424415e-05, + "loss": 4.4839630126953125, + "step": 250 + }, + { + "epoch": 0.036905606813342796, + "grad_norm": 7.574995994567871, + "learning_rate": 9.996323633782825e-05, + "loss": 4.506732177734375, + "step": 260 + }, + { + "epoch": 0.0383250532292406, + "grad_norm": 7.498238563537598, + "learning_rate": 9.996181689141236e-05, + "loss": 4.319998168945313, + "step": 270 + }, + { + "epoch": 0.0397444996451384, + "grad_norm": 7.978142261505127, + "learning_rate": 9.996039744499645e-05, + "loss": 4.214613342285157, + "step": 280 + }, + { + "epoch": 0.0411639460610362, + "grad_norm": 8.194511413574219, + "learning_rate": 9.995897799858057e-05, + "loss": 4.212762451171875, + "step": 290 + }, + { + "epoch": 0.042583392476934, + "grad_norm": 8.136639595031738, + "learning_rate": 9.995755855216466e-05, + "loss": 4.009028625488281, + "step": 300 + }, + { + "epoch": 0.0440028388928318, + "grad_norm": 8.684012413024902, + "learning_rate": 9.995613910574876e-05, + "loss": 3.9817459106445314, + "step": 310 + }, + { + "epoch": 0.0454222853087296, + "grad_norm": 8.888952255249023, + "learning_rate": 9.995471965933286e-05, + "loss": 3.94019775390625, + "step": 320 + }, + { + "epoch": 0.0468417317246274, + "grad_norm": 8.79919719696045, + "learning_rate": 9.995330021291697e-05, + "loss": 3.9265777587890627, + "step": 330 + }, + { + "epoch": 0.0482611781405252, + "grad_norm": 8.571785926818848, + "learning_rate": 9.995188076650107e-05, + "loss": 3.7262115478515625, + "step": 340 + }, + { + "epoch": 0.049680624556423, + "grad_norm": 8.640142440795898, + "learning_rate": 9.995046132008518e-05, + "loss": 3.644915771484375, + "step": 350 + }, + { + "epoch": 0.0511000709723208, + "grad_norm": 9.322779655456543, + "learning_rate": 9.994904187366927e-05, + "loss": 3.644049072265625, + "step": 360 + }, + { + "epoch": 0.0525195173882186, + "grad_norm": 8.790424346923828, + "learning_rate": 9.994762242725337e-05, + "loss": 3.4869285583496095, + "step": 370 + }, + { + "epoch": 0.0539389638041164, + "grad_norm": 9.344154357910156, + "learning_rate": 9.994620298083748e-05, + "loss": 3.55142822265625, + "step": 380 + }, + { + "epoch": 0.05535841022001419, + "grad_norm": 8.807840347290039, + "learning_rate": 9.994478353442158e-05, + "loss": 3.4293190002441407, + "step": 390 + }, + { + "epoch": 0.05677785663591199, + "grad_norm": 9.36971378326416, + "learning_rate": 9.994336408800569e-05, + "loss": 3.429082489013672, + "step": 400 + }, + { + "epoch": 0.05819730305180979, + "grad_norm": 9.73521900177002, + "learning_rate": 9.994194464158977e-05, + "loss": 3.408639907836914, + "step": 410 + }, + { + "epoch": 0.05961674946770759, + "grad_norm": 9.646844863891602, + "learning_rate": 9.994052519517389e-05, + "loss": 3.1950119018554686, + "step": 420 + }, + { + "epoch": 0.06103619588360539, + "grad_norm": 9.722207069396973, + "learning_rate": 9.993910574875798e-05, + "loss": 3.4140243530273438, + "step": 430 + }, + { + "epoch": 0.06245564229950319, + "grad_norm": 10.609601020812988, + "learning_rate": 9.99376863023421e-05, + "loss": 3.320109558105469, + "step": 440 + }, + { + "epoch": 0.063875088715401, + "grad_norm": 10.271575927734375, + "learning_rate": 9.993626685592619e-05, + "loss": 3.232251739501953, + "step": 450 + }, + { + "epoch": 0.06529453513129879, + "grad_norm": 9.766585350036621, + "learning_rate": 9.993484740951029e-05, + "loss": 3.149517059326172, + "step": 460 + }, + { + "epoch": 0.0667139815471966, + "grad_norm": 10.358244895935059, + "learning_rate": 9.99334279630944e-05, + "loss": 3.1863967895507814, + "step": 470 + }, + { + "epoch": 0.06813342796309439, + "grad_norm": 10.473136901855469, + "learning_rate": 9.99320085166785e-05, + "loss": 3.222390365600586, + "step": 480 + }, + { + "epoch": 0.0695528743789922, + "grad_norm": 9.905110359191895, + "learning_rate": 9.993058907026261e-05, + "loss": 3.1823768615722656, + "step": 490 + }, + { + "epoch": 0.07097232079488999, + "grad_norm": 9.858973503112793, + "learning_rate": 9.99291696238467e-05, + "loss": 2.9202560424804687, + "step": 500 + }, + { + "epoch": 0.07097232079488999, + "eval_accuracy": 0.1867489031601704, + "eval_loss": 3.0744524002075195, + "eval_runtime": 31.2289, + "eval_samples_per_second": 503.605, + "eval_steps_per_second": 15.755, + "step": 500 + }, + { + "epoch": 0.0723917672107878, + "grad_norm": 10.224215507507324, + "learning_rate": 9.992775017743082e-05, + "loss": 3.0410499572753906, + "step": 510 + }, + { + "epoch": 0.07381121362668559, + "grad_norm": 9.867650032043457, + "learning_rate": 9.99263307310149e-05, + "loss": 3.116912078857422, + "step": 520 + }, + { + "epoch": 0.07523066004258339, + "grad_norm": 10.343064308166504, + "learning_rate": 9.992491128459901e-05, + "loss": 3.06390266418457, + "step": 530 + }, + { + "epoch": 0.0766501064584812, + "grad_norm": 10.38116455078125, + "learning_rate": 9.992349183818311e-05, + "loss": 2.973680114746094, + "step": 540 + }, + { + "epoch": 0.07806955287437899, + "grad_norm": 10.979643821716309, + "learning_rate": 9.992207239176722e-05, + "loss": 3.0906436920166014, + "step": 550 + }, + { + "epoch": 0.0794889992902768, + "grad_norm": 10.06657886505127, + "learning_rate": 9.992065294535132e-05, + "loss": 3.0091484069824217, + "step": 560 + }, + { + "epoch": 0.08090844570617459, + "grad_norm": 10.663322448730469, + "learning_rate": 9.991923349893541e-05, + "loss": 2.862255859375, + "step": 570 + }, + { + "epoch": 0.0823278921220724, + "grad_norm": 9.277785301208496, + "learning_rate": 9.991781405251952e-05, + "loss": 2.8638259887695314, + "step": 580 + }, + { + "epoch": 0.08374733853797019, + "grad_norm": 10.807332038879395, + "learning_rate": 9.991639460610362e-05, + "loss": 2.732352066040039, + "step": 590 + }, + { + "epoch": 0.085166784953868, + "grad_norm": 9.970373153686523, + "learning_rate": 9.991497515968773e-05, + "loss": 2.736968231201172, + "step": 600 + }, + { + "epoch": 0.08658623136976579, + "grad_norm": 11.008269309997559, + "learning_rate": 9.991355571327183e-05, + "loss": 2.7735246658325194, + "step": 610 + }, + { + "epoch": 0.0880056777856636, + "grad_norm": 8.758193969726562, + "learning_rate": 9.991213626685593e-05, + "loss": 2.5436214447021483, + "step": 620 + }, + { + "epoch": 0.08942512420156139, + "grad_norm": 11.253259658813477, + "learning_rate": 9.991071682044003e-05, + "loss": 2.748835563659668, + "step": 630 + }, + { + "epoch": 0.0908445706174592, + "grad_norm": 10.979547500610352, + "learning_rate": 9.990929737402414e-05, + "loss": 2.7314834594726562, + "step": 640 + }, + { + "epoch": 0.09226401703335699, + "grad_norm": 11.182887077331543, + "learning_rate": 9.990787792760823e-05, + "loss": 2.645678901672363, + "step": 650 + }, + { + "epoch": 0.0936834634492548, + "grad_norm": 10.636208534240723, + "learning_rate": 9.990645848119234e-05, + "loss": 2.5704013824462892, + "step": 660 + }, + { + "epoch": 0.09510290986515259, + "grad_norm": 10.351170539855957, + "learning_rate": 9.990503903477644e-05, + "loss": 2.5628406524658205, + "step": 670 + }, + { + "epoch": 0.0965223562810504, + "grad_norm": 9.914809226989746, + "learning_rate": 9.990361958836054e-05, + "loss": 2.5872230529785156, + "step": 680 + }, + { + "epoch": 0.09794180269694819, + "grad_norm": 10.839837074279785, + "learning_rate": 9.990220014194465e-05, + "loss": 2.490940475463867, + "step": 690 + }, + { + "epoch": 0.099361249112846, + "grad_norm": 11.259613990783691, + "learning_rate": 9.990078069552875e-05, + "loss": 2.64483585357666, + "step": 700 + }, + { + "epoch": 0.10078069552874379, + "grad_norm": 11.213078498840332, + "learning_rate": 9.989936124911286e-05, + "loss": 2.5397150039672853, + "step": 710 + }, + { + "epoch": 0.1022001419446416, + "grad_norm": 10.366206169128418, + "learning_rate": 9.989794180269694e-05, + "loss": 2.457781219482422, + "step": 720 + }, + { + "epoch": 0.10361958836053939, + "grad_norm": 11.44458293914795, + "learning_rate": 9.989652235628105e-05, + "loss": 2.5090484619140625, + "step": 730 + }, + { + "epoch": 0.1050390347764372, + "grad_norm": 11.689805030822754, + "learning_rate": 9.989510290986515e-05, + "loss": 2.409171485900879, + "step": 740 + }, + { + "epoch": 0.10645848119233499, + "grad_norm": 10.568279266357422, + "learning_rate": 9.989368346344926e-05, + "loss": 2.3308380126953123, + "step": 750 + }, + { + "epoch": 0.1078779276082328, + "grad_norm": 11.917696952819824, + "learning_rate": 9.989226401703337e-05, + "loss": 2.3733493804931642, + "step": 760 + }, + { + "epoch": 0.10929737402413059, + "grad_norm": 9.960722923278809, + "learning_rate": 9.989098651525906e-05, + "loss": 2.4058095932006838, + "step": 770 + }, + { + "epoch": 0.11071682044002838, + "grad_norm": 11.068999290466309, + "learning_rate": 9.988956706884315e-05, + "loss": 2.4371658325195313, + "step": 780 + }, + { + "epoch": 0.11213626685592619, + "grad_norm": 10.340009689331055, + "learning_rate": 9.988814762242725e-05, + "loss": 2.2587520599365236, + "step": 790 + }, + { + "epoch": 0.11355571327182398, + "grad_norm": 9.941303253173828, + "learning_rate": 9.988672817601136e-05, + "loss": 2.268446350097656, + "step": 800 + }, + { + "epoch": 0.11497515968772179, + "grad_norm": 11.490272521972656, + "learning_rate": 9.988530872959546e-05, + "loss": 2.471067428588867, + "step": 810 + }, + { + "epoch": 0.11639460610361958, + "grad_norm": 10.67241382598877, + "learning_rate": 9.988388928317957e-05, + "loss": 2.3497791290283203, + "step": 820 + }, + { + "epoch": 0.11781405251951739, + "grad_norm": 10.710894584655762, + "learning_rate": 9.988246983676367e-05, + "loss": 2.1724626541137697, + "step": 830 + }, + { + "epoch": 0.11923349893541518, + "grad_norm": 10.985452651977539, + "learning_rate": 9.988105039034778e-05, + "loss": 2.1848114013671873, + "step": 840 + }, + { + "epoch": 0.12065294535131299, + "grad_norm": 10.063145637512207, + "learning_rate": 9.987963094393186e-05, + "loss": 2.180558776855469, + "step": 850 + }, + { + "epoch": 0.12207239176721078, + "grad_norm": 11.236614227294922, + "learning_rate": 9.987821149751597e-05, + "loss": 2.282668876647949, + "step": 860 + }, + { + "epoch": 0.12349183818310859, + "grad_norm": 10.98898983001709, + "learning_rate": 9.987679205110007e-05, + "loss": 2.235186767578125, + "step": 870 + }, + { + "epoch": 0.12491128459900638, + "grad_norm": 11.805492401123047, + "learning_rate": 9.987537260468418e-05, + "loss": 2.2264921188354494, + "step": 880 + }, + { + "epoch": 0.1263307310149042, + "grad_norm": 10.717041015625, + "learning_rate": 9.987395315826828e-05, + "loss": 2.1385255813598634, + "step": 890 + }, + { + "epoch": 0.127750177430802, + "grad_norm": 9.613192558288574, + "learning_rate": 9.987253371185238e-05, + "loss": 2.1964336395263673, + "step": 900 + }, + { + "epoch": 0.12916962384669978, + "grad_norm": 10.594833374023438, + "learning_rate": 9.987111426543649e-05, + "loss": 2.050688362121582, + "step": 910 + }, + { + "epoch": 0.13058907026259758, + "grad_norm": 11.596671104431152, + "learning_rate": 9.986969481902059e-05, + "loss": 2.077385139465332, + "step": 920 + }, + { + "epoch": 0.1320085166784954, + "grad_norm": 10.779032707214355, + "learning_rate": 9.98682753726047e-05, + "loss": 2.0280479431152343, + "step": 930 + }, + { + "epoch": 0.1334279630943932, + "grad_norm": 10.522924423217773, + "learning_rate": 9.98668559261888e-05, + "loss": 1.9384689331054688, + "step": 940 + }, + { + "epoch": 0.13484740951029098, + "grad_norm": 9.86844539642334, + "learning_rate": 9.986543647977289e-05, + "loss": 2.0612548828125, + "step": 950 + }, + { + "epoch": 0.13626685592618878, + "grad_norm": 12.521405220031738, + "learning_rate": 9.986401703335699e-05, + "loss": 2.139466094970703, + "step": 960 + }, + { + "epoch": 0.1376863023420866, + "grad_norm": 11.292656898498535, + "learning_rate": 9.98625975869411e-05, + "loss": 2.077956199645996, + "step": 970 + }, + { + "epoch": 0.1391057487579844, + "grad_norm": 11.186986923217773, + "learning_rate": 9.98611781405252e-05, + "loss": 2.028730010986328, + "step": 980 + }, + { + "epoch": 0.14052519517388218, + "grad_norm": 10.553022384643555, + "learning_rate": 9.985975869410931e-05, + "loss": 1.9375551223754883, + "step": 990 + }, + { + "epoch": 0.14194464158977999, + "grad_norm": 11.089204788208008, + "learning_rate": 9.98583392476934e-05, + "loss": 2.0689823150634767, + "step": 1000 + }, + { + "epoch": 0.14194464158977999, + "eval_accuracy": 0.42239460799898265, + "eval_loss": 1.9010688066482544, + "eval_runtime": 31.4593, + "eval_samples_per_second": 499.916, + "eval_steps_per_second": 15.639, + "step": 1000 + }, + { + "epoch": 0.1433640880056778, + "grad_norm": 10.988676071166992, + "learning_rate": 9.98569198012775e-05, + "loss": 1.9830604553222657, + "step": 1010 + }, + { + "epoch": 0.1447835344215756, + "grad_norm": 11.2459077835083, + "learning_rate": 9.985550035486161e-05, + "loss": 1.9190074920654296, + "step": 1020 + }, + { + "epoch": 0.14620298083747338, + "grad_norm": 10.437894821166992, + "learning_rate": 9.985408090844571e-05, + "loss": 1.8999460220336915, + "step": 1030 + }, + { + "epoch": 0.14762242725337119, + "grad_norm": 10.94793701171875, + "learning_rate": 9.985266146202982e-05, + "loss": 1.8579456329345703, + "step": 1040 + }, + { + "epoch": 0.149041873669269, + "grad_norm": 11.168233871459961, + "learning_rate": 9.98512420156139e-05, + "loss": 1.8979732513427734, + "step": 1050 + }, + { + "epoch": 0.15046132008516677, + "grad_norm": 10.14195728302002, + "learning_rate": 9.984982256919802e-05, + "loss": 1.7833553314208985, + "step": 1060 + }, + { + "epoch": 0.15188076650106458, + "grad_norm": 9.160737991333008, + "learning_rate": 9.984840312278211e-05, + "loss": 1.8624576568603515, + "step": 1070 + }, + { + "epoch": 0.1533002129169624, + "grad_norm": 11.151049613952637, + "learning_rate": 9.984698367636623e-05, + "loss": 1.8210905075073243, + "step": 1080 + }, + { + "epoch": 0.1547196593328602, + "grad_norm": 10.053725242614746, + "learning_rate": 9.984556422995032e-05, + "loss": 1.7738643646240235, + "step": 1090 + }, + { + "epoch": 0.15613910574875797, + "grad_norm": 10.97727108001709, + "learning_rate": 9.984414478353442e-05, + "loss": 1.866429328918457, + "step": 1100 + }, + { + "epoch": 0.15755855216465578, + "grad_norm": 12.384384155273438, + "learning_rate": 9.984272533711853e-05, + "loss": 1.8680984497070312, + "step": 1110 + }, + { + "epoch": 0.1589779985805536, + "grad_norm": 11.387879371643066, + "learning_rate": 9.984130589070263e-05, + "loss": 1.8034194946289062, + "step": 1120 + }, + { + "epoch": 0.1603974449964514, + "grad_norm": 10.6587495803833, + "learning_rate": 9.983988644428674e-05, + "loss": 1.772690773010254, + "step": 1130 + }, + { + "epoch": 0.16181689141234917, + "grad_norm": 12.721858024597168, + "learning_rate": 9.983846699787084e-05, + "loss": 1.7724496841430664, + "step": 1140 + }, + { + "epoch": 0.16323633782824698, + "grad_norm": 11.116838455200195, + "learning_rate": 9.983704755145493e-05, + "loss": 1.7527042388916017, + "step": 1150 + }, + { + "epoch": 0.1646557842441448, + "grad_norm": 10.033406257629395, + "learning_rate": 9.983562810503903e-05, + "loss": 1.674898338317871, + "step": 1160 + }, + { + "epoch": 0.1660752306600426, + "grad_norm": 11.121773719787598, + "learning_rate": 9.983420865862314e-05, + "loss": 1.741505241394043, + "step": 1170 + }, + { + "epoch": 0.16749467707594037, + "grad_norm": 11.052094459533691, + "learning_rate": 9.983278921220724e-05, + "loss": 1.7749841690063477, + "step": 1180 + }, + { + "epoch": 0.16891412349183818, + "grad_norm": 10.183452606201172, + "learning_rate": 9.983136976579135e-05, + "loss": 1.6881484985351562, + "step": 1190 + }, + { + "epoch": 0.170333569907736, + "grad_norm": 11.106999397277832, + "learning_rate": 9.982995031937545e-05, + "loss": 1.814961051940918, + "step": 1200 + }, + { + "epoch": 0.1717530163236338, + "grad_norm": 12.08647632598877, + "learning_rate": 9.982853087295955e-05, + "loss": 1.682515525817871, + "step": 1210 + }, + { + "epoch": 0.17317246273953157, + "grad_norm": 13.744584083557129, + "learning_rate": 9.982711142654366e-05, + "loss": 1.6713733673095703, + "step": 1220 + }, + { + "epoch": 0.17459190915542938, + "grad_norm": 9.970173835754395, + "learning_rate": 9.982569198012775e-05, + "loss": 1.711156463623047, + "step": 1230 + }, + { + "epoch": 0.1760113555713272, + "grad_norm": 11.027495384216309, + "learning_rate": 9.982427253371186e-05, + "loss": 1.759619140625, + "step": 1240 + }, + { + "epoch": 0.177430801987225, + "grad_norm": 10.876315116882324, + "learning_rate": 9.982285308729596e-05, + "loss": 1.618482780456543, + "step": 1250 + }, + { + "epoch": 0.17885024840312277, + "grad_norm": 10.26490592956543, + "learning_rate": 9.982143364088006e-05, + "loss": 1.6674427032470702, + "step": 1260 + }, + { + "epoch": 0.18026969481902058, + "grad_norm": 11.872292518615723, + "learning_rate": 9.982001419446416e-05, + "loss": 1.6325908660888673, + "step": 1270 + }, + { + "epoch": 0.1816891412349184, + "grad_norm": 9.946234703063965, + "learning_rate": 9.981859474804827e-05, + "loss": 1.5453743934631348, + "step": 1280 + }, + { + "epoch": 0.18310858765081617, + "grad_norm": 11.03128719329834, + "learning_rate": 9.981717530163236e-05, + "loss": 1.658684539794922, + "step": 1290 + }, + { + "epoch": 0.18452803406671398, + "grad_norm": 12.145915031433105, + "learning_rate": 9.981575585521648e-05, + "loss": 1.5792274475097656, + "step": 1300 + }, + { + "epoch": 0.18594748048261178, + "grad_norm": 11.820379257202148, + "learning_rate": 9.981433640880057e-05, + "loss": 1.5301803588867187, + "step": 1310 + }, + { + "epoch": 0.1873669268985096, + "grad_norm": 11.046746253967285, + "learning_rate": 9.981291696238467e-05, + "loss": 1.6124080657958983, + "step": 1320 + }, + { + "epoch": 0.18878637331440737, + "grad_norm": 9.545868873596191, + "learning_rate": 9.981149751596878e-05, + "loss": 1.5502593994140625, + "step": 1330 + }, + { + "epoch": 0.19020581973030518, + "grad_norm": 11.999979019165039, + "learning_rate": 9.981007806955288e-05, + "loss": 1.5360203742980958, + "step": 1340 + }, + { + "epoch": 0.19162526614620298, + "grad_norm": 9.949675559997559, + "learning_rate": 9.980865862313699e-05, + "loss": 1.353858470916748, + "step": 1350 + }, + { + "epoch": 0.1930447125621008, + "grad_norm": 11.573400497436523, + "learning_rate": 9.980723917672107e-05, + "loss": 1.3946660995483398, + "step": 1360 + }, + { + "epoch": 0.19446415897799857, + "grad_norm": 10.249485969543457, + "learning_rate": 9.980581973030518e-05, + "loss": 1.518262004852295, + "step": 1370 + }, + { + "epoch": 0.19588360539389638, + "grad_norm": 10.011629104614258, + "learning_rate": 9.980440028388928e-05, + "loss": 1.5000194549560546, + "step": 1380 + }, + { + "epoch": 0.19730305180979418, + "grad_norm": 12.186440467834473, + "learning_rate": 9.980298083747339e-05, + "loss": 1.554741382598877, + "step": 1390 + }, + { + "epoch": 0.198722498225692, + "grad_norm": 11.845844268798828, + "learning_rate": 9.980156139105749e-05, + "loss": 1.4599843978881837, + "step": 1400 + }, + { + "epoch": 0.20014194464158977, + "grad_norm": 10.98592472076416, + "learning_rate": 9.980014194464159e-05, + "loss": 1.4062080383300781, + "step": 1410 + }, + { + "epoch": 0.20156139105748758, + "grad_norm": 11.54171371459961, + "learning_rate": 9.97987224982257e-05, + "loss": 1.5128003120422364, + "step": 1420 + }, + { + "epoch": 0.20298083747338538, + "grad_norm": 10.248682022094727, + "learning_rate": 9.97973030518098e-05, + "loss": 1.5022719383239747, + "step": 1430 + }, + { + "epoch": 0.2044002838892832, + "grad_norm": 8.78536319732666, + "learning_rate": 9.97958836053939e-05, + "loss": 1.4118841171264649, + "step": 1440 + }, + { + "epoch": 0.20581973030518097, + "grad_norm": 9.993626594543457, + "learning_rate": 9.9794464158978e-05, + "loss": 1.3945957183837892, + "step": 1450 + }, + { + "epoch": 0.20723917672107878, + "grad_norm": 11.31412124633789, + "learning_rate": 9.97930447125621e-05, + "loss": 1.26229887008667, + "step": 1460 + }, + { + "epoch": 0.20865862313697658, + "grad_norm": 11.182840347290039, + "learning_rate": 9.97916252661462e-05, + "loss": 1.3171740531921388, + "step": 1470 + }, + { + "epoch": 0.2100780695528744, + "grad_norm": 12.25224781036377, + "learning_rate": 9.979020581973031e-05, + "loss": 1.3310781478881837, + "step": 1480 + }, + { + "epoch": 0.21149751596877217, + "grad_norm": 11.81201457977295, + "learning_rate": 9.978878637331441e-05, + "loss": 1.3043070793151856, + "step": 1490 + }, + { + "epoch": 0.21291696238466998, + "grad_norm": 10.484480857849121, + "learning_rate": 9.978736692689852e-05, + "loss": 1.2629288673400878, + "step": 1500 + }, + { + "epoch": 0.21291696238466998, + "eval_accuracy": 0.5395180263241559, + "eval_loss": 1.438815712928772, + "eval_runtime": 32.1456, + "eval_samples_per_second": 489.242, + "eval_steps_per_second": 15.305, + "step": 1500 + }, + { + "epoch": 0.21433640880056778, + "grad_norm": 10.796157836914062, + "learning_rate": 9.978594748048262e-05, + "loss": 1.3752121925354004, + "step": 1510 + }, + { + "epoch": 0.2157558552164656, + "grad_norm": 10.1256742477417, + "learning_rate": 9.978452803406671e-05, + "loss": 1.3005435943603516, + "step": 1520 + }, + { + "epoch": 0.21717530163236337, + "grad_norm": 11.182530403137207, + "learning_rate": 9.978310858765082e-05, + "loss": 1.3048934936523438, + "step": 1530 + }, + { + "epoch": 0.21859474804826118, + "grad_norm": 10.190278053283691, + "learning_rate": 9.978168914123492e-05, + "loss": 1.3993605613708495, + "step": 1540 + }, + { + "epoch": 0.22001419446415899, + "grad_norm": 10.497735977172852, + "learning_rate": 9.978026969481903e-05, + "loss": 1.303945541381836, + "step": 1550 + }, + { + "epoch": 0.22143364088005676, + "grad_norm": 10.535606384277344, + "learning_rate": 9.977885024840313e-05, + "loss": 1.2210904121398927, + "step": 1560 + }, + { + "epoch": 0.22285308729595457, + "grad_norm": 11.385029792785645, + "learning_rate": 9.977743080198723e-05, + "loss": 1.3508376121520995, + "step": 1570 + }, + { + "epoch": 0.22427253371185238, + "grad_norm": 9.528643608093262, + "learning_rate": 9.977601135557132e-05, + "loss": 1.2278815269470216, + "step": 1580 + }, + { + "epoch": 0.22569198012775019, + "grad_norm": 13.161009788513184, + "learning_rate": 9.977459190915544e-05, + "loss": 1.254448413848877, + "step": 1590 + }, + { + "epoch": 0.22711142654364797, + "grad_norm": 11.288809776306152, + "learning_rate": 9.977317246273953e-05, + "loss": 1.271047878265381, + "step": 1600 + }, + { + "epoch": 0.22853087295954577, + "grad_norm": 11.30105209350586, + "learning_rate": 9.977175301632364e-05, + "loss": 1.3242988586425781, + "step": 1610 + }, + { + "epoch": 0.22995031937544358, + "grad_norm": 10.600774765014648, + "learning_rate": 9.977033356990774e-05, + "loss": 1.3170942306518554, + "step": 1620 + }, + { + "epoch": 0.2313697657913414, + "grad_norm": 10.652543067932129, + "learning_rate": 9.976891412349184e-05, + "loss": 1.3998719215393067, + "step": 1630 + }, + { + "epoch": 0.23278921220723917, + "grad_norm": 11.354793548583984, + "learning_rate": 9.976749467707595e-05, + "loss": 1.270443820953369, + "step": 1640 + }, + { + "epoch": 0.23420865862313697, + "grad_norm": 9.926568031311035, + "learning_rate": 9.976607523066005e-05, + "loss": 1.117215347290039, + "step": 1650 + }, + { + "epoch": 0.23562810503903478, + "grad_norm": 11.167335510253906, + "learning_rate": 9.976465578424416e-05, + "loss": 1.348717212677002, + "step": 1660 + }, + { + "epoch": 0.2370475514549326, + "grad_norm": 11.364425659179688, + "learning_rate": 9.976323633782824e-05, + "loss": 1.2113998413085938, + "step": 1670 + }, + { + "epoch": 0.23846699787083037, + "grad_norm": 10.315034866333008, + "learning_rate": 9.976181689141235e-05, + "loss": 1.2621678352355956, + "step": 1680 + }, + { + "epoch": 0.23988644428672817, + "grad_norm": 11.332146644592285, + "learning_rate": 9.976039744499645e-05, + "loss": 1.2919418334960937, + "step": 1690 + }, + { + "epoch": 0.24130589070262598, + "grad_norm": 9.863037109375, + "learning_rate": 9.975897799858056e-05, + "loss": 1.262222957611084, + "step": 1700 + }, + { + "epoch": 0.2427253371185238, + "grad_norm": 13.898163795471191, + "learning_rate": 9.975755855216467e-05, + "loss": 1.349098300933838, + "step": 1710 + }, + { + "epoch": 0.24414478353442157, + "grad_norm": 9.008386611938477, + "learning_rate": 9.975613910574876e-05, + "loss": 1.1653017044067382, + "step": 1720 + }, + { + "epoch": 0.24556422995031937, + "grad_norm": 9.755669593811035, + "learning_rate": 9.975471965933287e-05, + "loss": 1.304057788848877, + "step": 1730 + }, + { + "epoch": 0.24698367636621718, + "grad_norm": 10.742278099060059, + "learning_rate": 9.975330021291696e-05, + "loss": 1.1656038284301757, + "step": 1740 + }, + { + "epoch": 0.248403122782115, + "grad_norm": 11.937880516052246, + "learning_rate": 9.975188076650107e-05, + "loss": 1.2565963745117188, + "step": 1750 + }, + { + "epoch": 0.24982256919801277, + "grad_norm": 9.80545711517334, + "learning_rate": 9.975046132008517e-05, + "loss": 1.1316876411437988, + "step": 1760 + }, + { + "epoch": 0.2512420156139106, + "grad_norm": 11.162557601928711, + "learning_rate": 9.974904187366927e-05, + "loss": 1.2094581604003907, + "step": 1770 + }, + { + "epoch": 0.2526614620298084, + "grad_norm": 12.278450965881348, + "learning_rate": 9.974762242725337e-05, + "loss": 1.2499947547912598, + "step": 1780 + }, + { + "epoch": 0.2540809084457062, + "grad_norm": 10.95953369140625, + "learning_rate": 9.974620298083748e-05, + "loss": 1.1540046691894532, + "step": 1790 + }, + { + "epoch": 0.255500354861604, + "grad_norm": 7.865696430206299, + "learning_rate": 9.974478353442159e-05, + "loss": 1.1665989875793457, + "step": 1800 + }, + { + "epoch": 0.25691980127750175, + "grad_norm": 12.1609468460083, + "learning_rate": 9.974336408800569e-05, + "loss": 1.120746898651123, + "step": 1810 + }, + { + "epoch": 0.25833924769339955, + "grad_norm": 9.554359436035156, + "learning_rate": 9.974194464158978e-05, + "loss": 1.3381189346313476, + "step": 1820 + }, + { + "epoch": 0.25975869410929736, + "grad_norm": 9.497129440307617, + "learning_rate": 9.974052519517388e-05, + "loss": 1.1758546829223633, + "step": 1830 + }, + { + "epoch": 0.26117814052519517, + "grad_norm": 10.584992408752441, + "learning_rate": 9.973910574875799e-05, + "loss": 1.0787659645080567, + "step": 1840 + }, + { + "epoch": 0.262597586941093, + "grad_norm": 9.558980941772461, + "learning_rate": 9.973768630234209e-05, + "loss": 0.9334567070007325, + "step": 1850 + }, + { + "epoch": 0.2640170333569908, + "grad_norm": 9.41112995147705, + "learning_rate": 9.97362668559262e-05, + "loss": 1.1376053810119628, + "step": 1860 + }, + { + "epoch": 0.2654364797728886, + "grad_norm": 11.666831970214844, + "learning_rate": 9.973484740951028e-05, + "loss": 1.207914447784424, + "step": 1870 + }, + { + "epoch": 0.2668559261887864, + "grad_norm": 11.217955589294434, + "learning_rate": 9.97334279630944e-05, + "loss": 1.052849578857422, + "step": 1880 + }, + { + "epoch": 0.26827537260468415, + "grad_norm": 8.3615083694458, + "learning_rate": 9.97320085166785e-05, + "loss": 0.9782976150512696, + "step": 1890 + }, + { + "epoch": 0.26969481902058196, + "grad_norm": 10.69944953918457, + "learning_rate": 9.97305890702626e-05, + "loss": 0.9639101982116699, + "step": 1900 + }, + { + "epoch": 0.27111426543647976, + "grad_norm": 11.15194034576416, + "learning_rate": 9.972916962384671e-05, + "loss": 1.0744239807128906, + "step": 1910 + }, + { + "epoch": 0.27253371185237757, + "grad_norm": 10.363690376281738, + "learning_rate": 9.972775017743081e-05, + "loss": 1.1180108070373536, + "step": 1920 + }, + { + "epoch": 0.2739531582682754, + "grad_norm": 10.816513061523438, + "learning_rate": 9.972633073101491e-05, + "loss": 1.118791103363037, + "step": 1930 + }, + { + "epoch": 0.2753726046841732, + "grad_norm": 8.64388656616211, + "learning_rate": 9.9724911284599e-05, + "loss": 1.1368459701538085, + "step": 1940 + }, + { + "epoch": 0.276792051100071, + "grad_norm": 9.002252578735352, + "learning_rate": 9.972349183818312e-05, + "loss": 1.1344121932983398, + "step": 1950 + }, + { + "epoch": 0.2782114975159688, + "grad_norm": 11.083386421203613, + "learning_rate": 9.972207239176721e-05, + "loss": 1.1827295303344727, + "step": 1960 + }, + { + "epoch": 0.27963094393186655, + "grad_norm": 8.360145568847656, + "learning_rate": 9.972065294535133e-05, + "loss": 0.9954969406127929, + "step": 1970 + }, + { + "epoch": 0.28105039034776436, + "grad_norm": 12.982026100158691, + "learning_rate": 9.971923349893542e-05, + "loss": 0.9865982055664062, + "step": 1980 + }, + { + "epoch": 0.28246983676366216, + "grad_norm": 9.3854341506958, + "learning_rate": 9.971781405251952e-05, + "loss": 0.9238475799560547, + "step": 1990 + }, + { + "epoch": 0.28388928317955997, + "grad_norm": 10.693597793579102, + "learning_rate": 9.971639460610363e-05, + "loss": 0.9660484313964843, + "step": 2000 + }, + { + "epoch": 0.28388928317955997, + "eval_accuracy": 0.6596935206968907, + "eval_loss": 1.0827350616455078, + "eval_runtime": 31.44, + "eval_samples_per_second": 500.222, + "eval_steps_per_second": 15.649, + "step": 2000 + }, + { + "epoch": 0.2853087295954578, + "grad_norm": 11.403952598571777, + "learning_rate": 9.971497515968773e-05, + "loss": 0.987885856628418, + "step": 2010 + }, + { + "epoch": 0.2867281760113556, + "grad_norm": 11.068461418151855, + "learning_rate": 9.971355571327184e-05, + "loss": 1.0180384635925293, + "step": 2020 + }, + { + "epoch": 0.2881476224272534, + "grad_norm": 10.536505699157715, + "learning_rate": 9.971213626685592e-05, + "loss": 1.0148059844970703, + "step": 2030 + }, + { + "epoch": 0.2895670688431512, + "grad_norm": 9.358129501342773, + "learning_rate": 9.971071682044003e-05, + "loss": 0.9920819282531739, + "step": 2040 + }, + { + "epoch": 0.29098651525904895, + "grad_norm": 10.33521842956543, + "learning_rate": 9.970929737402413e-05, + "loss": 1.0010162353515626, + "step": 2050 + }, + { + "epoch": 0.29240596167494676, + "grad_norm": 10.490190505981445, + "learning_rate": 9.970787792760824e-05, + "loss": 0.9781021118164063, + "step": 2060 + }, + { + "epoch": 0.29382540809084456, + "grad_norm": 9.507524490356445, + "learning_rate": 9.970645848119234e-05, + "loss": 0.9722440719604493, + "step": 2070 + }, + { + "epoch": 0.29524485450674237, + "grad_norm": 10.77835464477539, + "learning_rate": 9.970503903477644e-05, + "loss": 0.9851055145263672, + "step": 2080 + }, + { + "epoch": 0.2966643009226402, + "grad_norm": 9.847874641418457, + "learning_rate": 9.970361958836055e-05, + "loss": 0.8958380699157715, + "step": 2090 + }, + { + "epoch": 0.298083747338538, + "grad_norm": 11.703569412231445, + "learning_rate": 9.970220014194465e-05, + "loss": 0.9267073631286621, + "step": 2100 + }, + { + "epoch": 0.2995031937544358, + "grad_norm": 6.974740028381348, + "learning_rate": 9.970078069552876e-05, + "loss": 0.787592887878418, + "step": 2110 + }, + { + "epoch": 0.30092264017033354, + "grad_norm": 5.989770889282227, + "learning_rate": 9.969936124911285e-05, + "loss": 0.8309663772583008, + "step": 2120 + }, + { + "epoch": 0.30234208658623135, + "grad_norm": 8.477362632751465, + "learning_rate": 9.969794180269695e-05, + "loss": 0.8926510810852051, + "step": 2130 + }, + { + "epoch": 0.30376153300212916, + "grad_norm": 8.412622451782227, + "learning_rate": 9.969652235628105e-05, + "loss": 0.8592344284057617, + "step": 2140 + }, + { + "epoch": 0.30518097941802697, + "grad_norm": 10.356178283691406, + "learning_rate": 9.969510290986516e-05, + "loss": 0.8827583312988281, + "step": 2150 + }, + { + "epoch": 0.3066004258339248, + "grad_norm": 7.666086673736572, + "learning_rate": 9.969368346344926e-05, + "loss": 0.9427967071533203, + "step": 2160 + }, + { + "epoch": 0.3080198722498226, + "grad_norm": 11.2577486038208, + "learning_rate": 9.969226401703337e-05, + "loss": 0.8580154418945313, + "step": 2170 + }, + { + "epoch": 0.3094393186657204, + "grad_norm": 10.915003776550293, + "learning_rate": 9.969084457061746e-05, + "loss": 0.9019613265991211, + "step": 2180 + }, + { + "epoch": 0.3108587650816182, + "grad_norm": 9.683639526367188, + "learning_rate": 9.968942512420156e-05, + "loss": 0.8982448577880859, + "step": 2190 + }, + { + "epoch": 0.31227821149751595, + "grad_norm": 8.5520601272583, + "learning_rate": 9.968800567778567e-05, + "loss": 0.7967979431152343, + "step": 2200 + }, + { + "epoch": 0.31369765791341375, + "grad_norm": 11.931614875793457, + "learning_rate": 9.968658623136977e-05, + "loss": 0.9725972175598144, + "step": 2210 + }, + { + "epoch": 0.31511710432931156, + "grad_norm": 11.004504203796387, + "learning_rate": 9.968516678495388e-05, + "loss": 0.9189895629882813, + "step": 2220 + }, + { + "epoch": 0.31653655074520937, + "grad_norm": 9.460184097290039, + "learning_rate": 9.968374733853797e-05, + "loss": 0.9253165245056152, + "step": 2230 + }, + { + "epoch": 0.3179559971611072, + "grad_norm": 9.675958633422852, + "learning_rate": 9.968232789212208e-05, + "loss": 0.7977495670318604, + "step": 2240 + }, + { + "epoch": 0.319375443577005, + "grad_norm": 8.858159065246582, + "learning_rate": 9.968090844570617e-05, + "loss": 0.9056186676025391, + "step": 2250 + }, + { + "epoch": 0.3207948899929028, + "grad_norm": 10.144878387451172, + "learning_rate": 9.967948899929028e-05, + "loss": 0.9273244857788085, + "step": 2260 + }, + { + "epoch": 0.3222143364088006, + "grad_norm": 9.78799819946289, + "learning_rate": 9.967806955287438e-05, + "loss": 0.8192936897277832, + "step": 2270 + }, + { + "epoch": 0.32363378282469835, + "grad_norm": 8.891179084777832, + "learning_rate": 9.967665010645849e-05, + "loss": 0.8823507308959961, + "step": 2280 + }, + { + "epoch": 0.32505322924059615, + "grad_norm": 9.303411483764648, + "learning_rate": 9.967523066004259e-05, + "loss": 0.8374591827392578, + "step": 2290 + }, + { + "epoch": 0.32647267565649396, + "grad_norm": 8.408880233764648, + "learning_rate": 9.967381121362669e-05, + "loss": 0.849891471862793, + "step": 2300 + }, + { + "epoch": 0.32789212207239177, + "grad_norm": 9.384819030761719, + "learning_rate": 9.96723917672108e-05, + "loss": 0.7750972747802735, + "step": 2310 + }, + { + "epoch": 0.3293115684882896, + "grad_norm": 9.170500755310059, + "learning_rate": 9.96709723207949e-05, + "loss": 0.7687624454498291, + "step": 2320 + }, + { + "epoch": 0.3307310149041874, + "grad_norm": 8.488929748535156, + "learning_rate": 9.966955287437901e-05, + "loss": 0.8498885154724121, + "step": 2330 + }, + { + "epoch": 0.3321504613200852, + "grad_norm": 10.291971206665039, + "learning_rate": 9.966813342796309e-05, + "loss": 0.7807302474975586, + "step": 2340 + }, + { + "epoch": 0.33356990773598294, + "grad_norm": 11.644806861877441, + "learning_rate": 9.96667139815472e-05, + "loss": 0.7917065143585205, + "step": 2350 + }, + { + "epoch": 0.33498935415188075, + "grad_norm": 13.938374519348145, + "learning_rate": 9.96652945351313e-05, + "loss": 0.8718063354492187, + "step": 2360 + }, + { + "epoch": 0.33640880056777855, + "grad_norm": 10.399706840515137, + "learning_rate": 9.966387508871541e-05, + "loss": 0.8703582763671875, + "step": 2370 + }, + { + "epoch": 0.33782824698367636, + "grad_norm": 7.870115756988525, + "learning_rate": 9.966245564229951e-05, + "loss": 0.8549924850463867, + "step": 2380 + }, + { + "epoch": 0.33924769339957417, + "grad_norm": 9.777918815612793, + "learning_rate": 9.96610361958836e-05, + "loss": 1.0234166145324708, + "step": 2390 + }, + { + "epoch": 0.340667139815472, + "grad_norm": 10.103452682495117, + "learning_rate": 9.965961674946772e-05, + "loss": 0.9040670394897461, + "step": 2400 + }, + { + "epoch": 0.3420865862313698, + "grad_norm": 10.497400283813477, + "learning_rate": 9.965819730305181e-05, + "loss": 0.7955552577972412, + "step": 2410 + }, + { + "epoch": 0.3435060326472676, + "grad_norm": 8.0149564743042, + "learning_rate": 9.965677785663592e-05, + "loss": 0.856791877746582, + "step": 2420 + }, + { + "epoch": 0.34492547906316534, + "grad_norm": 8.111480712890625, + "learning_rate": 9.965535841022002e-05, + "loss": 0.8129085540771485, + "step": 2430 + }, + { + "epoch": 0.34634492547906315, + "grad_norm": 7.93813419342041, + "learning_rate": 9.965393896380412e-05, + "loss": 0.809941291809082, + "step": 2440 + }, + { + "epoch": 0.34776437189496096, + "grad_norm": 10.88427448272705, + "learning_rate": 9.965251951738822e-05, + "loss": 0.7622882843017578, + "step": 2450 + }, + { + "epoch": 0.34918381831085876, + "grad_norm": 9.509648323059082, + "learning_rate": 9.965110007097233e-05, + "loss": 0.7235064029693603, + "step": 2460 + }, + { + "epoch": 0.35060326472675657, + "grad_norm": 10.343646049499512, + "learning_rate": 9.964968062455642e-05, + "loss": 0.7792426586151123, + "step": 2470 + }, + { + "epoch": 0.3520227111426544, + "grad_norm": 11.936261177062988, + "learning_rate": 9.964826117814054e-05, + "loss": 0.8023401260375976, + "step": 2480 + }, + { + "epoch": 0.3534421575585522, + "grad_norm": 8.382633209228516, + "learning_rate": 9.964684173172463e-05, + "loss": 0.7960898399353027, + "step": 2490 + }, + { + "epoch": 0.35486160397445, + "grad_norm": 11.01586627960205, + "learning_rate": 9.964542228530873e-05, + "loss": 0.7975746631622315, + "step": 2500 + }, + { + "epoch": 0.35486160397445, + "eval_accuracy": 0.730527118967381, + "eval_loss": 0.8166059255599976, + "eval_runtime": 32.3274, + "eval_samples_per_second": 486.491, + "eval_steps_per_second": 15.219, + "step": 2500 + }, + { + "epoch": 0.35628105039034774, + "grad_norm": 8.113981246948242, + "learning_rate": 9.964400283889284e-05, + "loss": 0.7863178253173828, + "step": 2510 + }, + { + "epoch": 0.35770049680624555, + "grad_norm": 9.127975463867188, + "learning_rate": 9.964258339247694e-05, + "loss": 0.8487259864807128, + "step": 2520 + }, + { + "epoch": 0.35911994322214336, + "grad_norm": 8.597822189331055, + "learning_rate": 9.964116394606105e-05, + "loss": 0.8151129722595215, + "step": 2530 + }, + { + "epoch": 0.36053938963804116, + "grad_norm": 8.069273948669434, + "learning_rate": 9.963974449964513e-05, + "loss": 0.6664574623107911, + "step": 2540 + }, + { + "epoch": 0.36195883605393897, + "grad_norm": 8.314419746398926, + "learning_rate": 9.963832505322924e-05, + "loss": 0.8365516662597656, + "step": 2550 + }, + { + "epoch": 0.3633782824698368, + "grad_norm": 9.172304153442383, + "learning_rate": 9.963690560681334e-05, + "loss": 0.7865428924560547, + "step": 2560 + }, + { + "epoch": 0.3647977288857346, + "grad_norm": 9.639200210571289, + "learning_rate": 9.963548616039745e-05, + "loss": 0.7925633430480957, + "step": 2570 + }, + { + "epoch": 0.36621717530163234, + "grad_norm": 8.856132507324219, + "learning_rate": 9.963406671398155e-05, + "loss": 0.7005198955535888, + "step": 2580 + }, + { + "epoch": 0.36763662171753014, + "grad_norm": 7.9700422286987305, + "learning_rate": 9.963264726756566e-05, + "loss": 0.6712905883789062, + "step": 2590 + }, + { + "epoch": 0.36905606813342795, + "grad_norm": 9.465399742126465, + "learning_rate": 9.963122782114976e-05, + "loss": 0.7288703441619873, + "step": 2600 + }, + { + "epoch": 0.37047551454932576, + "grad_norm": 8.769003868103027, + "learning_rate": 9.962980837473386e-05, + "loss": 0.7671696662902832, + "step": 2610 + }, + { + "epoch": 0.37189496096522356, + "grad_norm": 6.981420040130615, + "learning_rate": 9.962838892831797e-05, + "loss": 0.6548487663269043, + "step": 2620 + }, + { + "epoch": 0.37331440738112137, + "grad_norm": 8.440009117126465, + "learning_rate": 9.962696948190206e-05, + "loss": 0.705223274230957, + "step": 2630 + }, + { + "epoch": 0.3747338537970192, + "grad_norm": 12.392814636230469, + "learning_rate": 9.962555003548617e-05, + "loss": 0.8219353675842285, + "step": 2640 + }, + { + "epoch": 0.376153300212917, + "grad_norm": 9.1260404586792, + "learning_rate": 9.962413058907026e-05, + "loss": 0.7202134132385254, + "step": 2650 + }, + { + "epoch": 0.37757274662881474, + "grad_norm": 9.437945365905762, + "learning_rate": 9.962271114265437e-05, + "loss": 0.7196836471557617, + "step": 2660 + }, + { + "epoch": 0.37899219304471254, + "grad_norm": 8.03176212310791, + "learning_rate": 9.962129169623847e-05, + "loss": 0.6017679214477539, + "step": 2670 + }, + { + "epoch": 0.38041163946061035, + "grad_norm": 11.21246337890625, + "learning_rate": 9.961987224982258e-05, + "loss": 0.8073585510253907, + "step": 2680 + }, + { + "epoch": 0.38183108587650816, + "grad_norm": 8.937601089477539, + "learning_rate": 9.961845280340667e-05, + "loss": 0.5765426635742188, + "step": 2690 + }, + { + "epoch": 0.38325053229240597, + "grad_norm": 10.750785827636719, + "learning_rate": 9.961703335699077e-05, + "loss": 0.850700569152832, + "step": 2700 + }, + { + "epoch": 0.3846699787083038, + "grad_norm": 8.476407051086426, + "learning_rate": 9.961561391057488e-05, + "loss": 0.6841172695159912, + "step": 2710 + }, + { + "epoch": 0.3860894251242016, + "grad_norm": 8.174555778503418, + "learning_rate": 9.961419446415898e-05, + "loss": 0.6521795272827149, + "step": 2720 + }, + { + "epoch": 0.3875088715400994, + "grad_norm": 6.744903564453125, + "learning_rate": 9.961277501774309e-05, + "loss": 0.7194175720214844, + "step": 2730 + }, + { + "epoch": 0.38892831795599714, + "grad_norm": 7.107284069061279, + "learning_rate": 9.961135557132719e-05, + "loss": 0.7437104701995849, + "step": 2740 + }, + { + "epoch": 0.39034776437189495, + "grad_norm": 12.026649475097656, + "learning_rate": 9.960993612491129e-05, + "loss": 0.7481307029724121, + "step": 2750 + }, + { + "epoch": 0.39176721078779275, + "grad_norm": 10.131022453308105, + "learning_rate": 9.960851667849538e-05, + "loss": 0.6669661521911621, + "step": 2760 + }, + { + "epoch": 0.39318665720369056, + "grad_norm": 7.589590072631836, + "learning_rate": 9.96070972320795e-05, + "loss": 0.5883037567138671, + "step": 2770 + }, + { + "epoch": 0.39460610361958837, + "grad_norm": 8.32777214050293, + "learning_rate": 9.960567778566359e-05, + "loss": 0.6772464752197266, + "step": 2780 + }, + { + "epoch": 0.3960255500354862, + "grad_norm": 6.111226558685303, + "learning_rate": 9.96042583392477e-05, + "loss": 0.6943521976470948, + "step": 2790 + }, + { + "epoch": 0.397444996451384, + "grad_norm": 10.40073299407959, + "learning_rate": 9.96028388928318e-05, + "loss": 0.6597262382507324, + "step": 2800 + }, + { + "epoch": 0.3988644428672818, + "grad_norm": 11.990081787109375, + "learning_rate": 9.96014194464159e-05, + "loss": 0.6846660614013672, + "step": 2810 + }, + { + "epoch": 0.40028388928317954, + "grad_norm": 7.820896625518799, + "learning_rate": 9.960000000000001e-05, + "loss": 0.5972445487976075, + "step": 2820 + }, + { + "epoch": 0.40170333569907735, + "grad_norm": 9.078740119934082, + "learning_rate": 9.95985805535841e-05, + "loss": 0.7440935611724854, + "step": 2830 + }, + { + "epoch": 0.40312278211497515, + "grad_norm": 8.869423866271973, + "learning_rate": 9.959716110716822e-05, + "loss": 0.7406916141510009, + "step": 2840 + }, + { + "epoch": 0.40454222853087296, + "grad_norm": 9.250556945800781, + "learning_rate": 9.95957416607523e-05, + "loss": 0.6444163799285889, + "step": 2850 + }, + { + "epoch": 0.40596167494677077, + "grad_norm": 12.534906387329102, + "learning_rate": 9.959432221433641e-05, + "loss": 0.7008297920227051, + "step": 2860 + }, + { + "epoch": 0.4073811213626686, + "grad_norm": 10.320120811462402, + "learning_rate": 9.959290276792051e-05, + "loss": 0.6261786460876465, + "step": 2870 + }, + { + "epoch": 0.4088005677785664, + "grad_norm": 7.483973979949951, + "learning_rate": 9.959148332150462e-05, + "loss": 0.6434149742126465, + "step": 2880 + }, + { + "epoch": 0.41022001419446413, + "grad_norm": 9.007946014404297, + "learning_rate": 9.959006387508872e-05, + "loss": 0.6796345233917236, + "step": 2890 + }, + { + "epoch": 0.41163946061036194, + "grad_norm": 8.191641807556152, + "learning_rate": 9.958864442867281e-05, + "loss": 0.5003190994262695, + "step": 2900 + }, + { + "epoch": 0.41305890702625975, + "grad_norm": 9.307744979858398, + "learning_rate": 9.958722498225693e-05, + "loss": 0.6988365173339843, + "step": 2910 + }, + { + "epoch": 0.41447835344215755, + "grad_norm": 6.16031551361084, + "learning_rate": 9.958580553584102e-05, + "loss": 0.6487136840820312, + "step": 2920 + }, + { + "epoch": 0.41589779985805536, + "grad_norm": 9.785910606384277, + "learning_rate": 9.958438608942513e-05, + "loss": 0.6544306755065918, + "step": 2930 + }, + { + "epoch": 0.41731724627395317, + "grad_norm": 12.08917236328125, + "learning_rate": 9.958296664300923e-05, + "loss": 0.6119012832641602, + "step": 2940 + }, + { + "epoch": 0.418736692689851, + "grad_norm": 10.118932723999023, + "learning_rate": 9.958154719659334e-05, + "loss": 0.5515688896179199, + "step": 2950 + }, + { + "epoch": 0.4201561391057488, + "grad_norm": 10.645463943481445, + "learning_rate": 9.958012775017743e-05, + "loss": 0.6795665740966796, + "step": 2960 + }, + { + "epoch": 0.42157558552164653, + "grad_norm": 8.745086669921875, + "learning_rate": 9.957870830376154e-05, + "loss": 0.6612170219421387, + "step": 2970 + }, + { + "epoch": 0.42299503193754434, + "grad_norm": 7.060173511505127, + "learning_rate": 9.957728885734563e-05, + "loss": 0.635819387435913, + "step": 2980 + }, + { + "epoch": 0.42441447835344215, + "grad_norm": 11.630016326904297, + "learning_rate": 9.957586941092975e-05, + "loss": 0.5891122341156005, + "step": 2990 + }, + { + "epoch": 0.42583392476933996, + "grad_norm": 11.667549133300781, + "learning_rate": 9.957444996451386e-05, + "loss": 0.7183985233306884, + "step": 3000 + }, + { + "epoch": 0.42583392476933996, + "eval_accuracy": 0.7175557957652445, + "eval_loss": 0.8312568068504333, + "eval_runtime": 32.7465, + "eval_samples_per_second": 480.265, + "eval_steps_per_second": 15.024, + "step": 3000 + }, + { + "epoch": 0.42725337118523776, + "grad_norm": 10.770739555358887, + "learning_rate": 9.957303051809794e-05, + "loss": 0.606045913696289, + "step": 3010 + }, + { + "epoch": 0.42867281760113557, + "grad_norm": 8.715160369873047, + "learning_rate": 9.957161107168205e-05, + "loss": 0.6968401908874512, + "step": 3020 + }, + { + "epoch": 0.4300922640170334, + "grad_norm": 10.227581977844238, + "learning_rate": 9.957019162526615e-05, + "loss": 0.5089622497558594, + "step": 3030 + }, + { + "epoch": 0.4315117104329312, + "grad_norm": 8.32385540008545, + "learning_rate": 9.956877217885026e-05, + "loss": 0.6402715682983399, + "step": 3040 + }, + { + "epoch": 0.43293115684882894, + "grad_norm": 10.973727226257324, + "learning_rate": 9.956735273243436e-05, + "loss": 0.7282869338989257, + "step": 3050 + }, + { + "epoch": 0.43435060326472674, + "grad_norm": 8.994437217712402, + "learning_rate": 9.956593328601845e-05, + "loss": 0.5776423454284668, + "step": 3060 + }, + { + "epoch": 0.43577004968062455, + "grad_norm": 7.597539901733398, + "learning_rate": 9.956451383960255e-05, + "loss": 0.5537106990814209, + "step": 3070 + }, + { + "epoch": 0.43718949609652236, + "grad_norm": 7.695132732391357, + "learning_rate": 9.956309439318666e-05, + "loss": 0.5561283588409424, + "step": 3080 + }, + { + "epoch": 0.43860894251242016, + "grad_norm": 10.008833885192871, + "learning_rate": 9.956167494677077e-05, + "loss": 0.6571722030639648, + "step": 3090 + }, + { + "epoch": 0.44002838892831797, + "grad_norm": 6.440252304077148, + "learning_rate": 9.956025550035487e-05, + "loss": 0.4972050189971924, + "step": 3100 + }, + { + "epoch": 0.4414478353442158, + "grad_norm": 11.92957878112793, + "learning_rate": 9.955883605393897e-05, + "loss": 0.6483690738677979, + "step": 3110 + }, + { + "epoch": 0.44286728176011353, + "grad_norm": 8.40812873840332, + "learning_rate": 9.955741660752307e-05, + "loss": 0.602755069732666, + "step": 3120 + }, + { + "epoch": 0.44428672817601134, + "grad_norm": 6.782786846160889, + "learning_rate": 9.955599716110718e-05, + "loss": 0.6320923328399658, + "step": 3130 + }, + { + "epoch": 0.44570617459190914, + "grad_norm": 12.326107025146484, + "learning_rate": 9.955457771469127e-05, + "loss": 0.5779653549194336, + "step": 3140 + }, + { + "epoch": 0.44712562100780695, + "grad_norm": 12.876483917236328, + "learning_rate": 9.955315826827538e-05, + "loss": 0.7216415882110596, + "step": 3150 + }, + { + "epoch": 0.44854506742370476, + "grad_norm": 6.984850883483887, + "learning_rate": 9.955173882185947e-05, + "loss": 0.4415611267089844, + "step": 3160 + }, + { + "epoch": 0.44996451383960256, + "grad_norm": 6.711297512054443, + "learning_rate": 9.955031937544358e-05, + "loss": 0.5505913734436035, + "step": 3170 + }, + { + "epoch": 0.45138396025550037, + "grad_norm": 7.127682685852051, + "learning_rate": 9.954889992902769e-05, + "loss": 0.5563027858734131, + "step": 3180 + }, + { + "epoch": 0.4528034066713982, + "grad_norm": 9.826492309570312, + "learning_rate": 9.954748048261179e-05, + "loss": 0.5204686641693115, + "step": 3190 + }, + { + "epoch": 0.45422285308729593, + "grad_norm": 14.011224746704102, + "learning_rate": 9.95460610361959e-05, + "loss": 0.5676139831542969, + "step": 3200 + }, + { + "epoch": 0.45564229950319374, + "grad_norm": 10.502514839172363, + "learning_rate": 9.954464158977998e-05, + "loss": 0.6593122482299805, + "step": 3210 + }, + { + "epoch": 0.45706174591909154, + "grad_norm": 9.966157913208008, + "learning_rate": 9.95432221433641e-05, + "loss": 0.5757305145263671, + "step": 3220 + }, + { + "epoch": 0.45848119233498935, + "grad_norm": 7.551996231079102, + "learning_rate": 9.954180269694819e-05, + "loss": 0.5537711620330811, + "step": 3230 + }, + { + "epoch": 0.45990063875088716, + "grad_norm": 10.630086898803711, + "learning_rate": 9.95403832505323e-05, + "loss": 0.5302771091461181, + "step": 3240 + }, + { + "epoch": 0.46132008516678497, + "grad_norm": 12.471774101257324, + "learning_rate": 9.95389638041164e-05, + "loss": 0.6347667694091796, + "step": 3250 + }, + { + "epoch": 0.4627395315826828, + "grad_norm": 9.668441772460938, + "learning_rate": 9.95375443577005e-05, + "loss": 0.5615960121154785, + "step": 3260 + }, + { + "epoch": 0.4641589779985806, + "grad_norm": 9.092421531677246, + "learning_rate": 9.953612491128461e-05, + "loss": 0.5531889438629151, + "step": 3270 + }, + { + "epoch": 0.46557842441447833, + "grad_norm": 8.55390453338623, + "learning_rate": 9.95347054648687e-05, + "loss": 0.5149998188018798, + "step": 3280 + }, + { + "epoch": 0.46699787083037614, + "grad_norm": 9.092056274414062, + "learning_rate": 9.953328601845282e-05, + "loss": 0.49632701873779295, + "step": 3290 + }, + { + "epoch": 0.46841731724627395, + "grad_norm": 9.66268253326416, + "learning_rate": 9.953186657203691e-05, + "loss": 0.5612505912780762, + "step": 3300 + }, + { + "epoch": 0.46983676366217175, + "grad_norm": 6.583611011505127, + "learning_rate": 9.953044712562102e-05, + "loss": 0.5805669307708741, + "step": 3310 + }, + { + "epoch": 0.47125621007806956, + "grad_norm": 8.160282135009766, + "learning_rate": 9.952902767920511e-05, + "loss": 0.4320365428924561, + "step": 3320 + }, + { + "epoch": 0.47267565649396737, + "grad_norm": 10.05884075164795, + "learning_rate": 9.952760823278922e-05, + "loss": 0.5736487865447998, + "step": 3330 + }, + { + "epoch": 0.4740951029098652, + "grad_norm": 9.000593185424805, + "learning_rate": 9.952618878637332e-05, + "loss": 0.5238205432891846, + "step": 3340 + }, + { + "epoch": 0.4755145493257629, + "grad_norm": 9.076302528381348, + "learning_rate": 9.952476933995743e-05, + "loss": 0.5925283432006836, + "step": 3350 + }, + { + "epoch": 0.47693399574166073, + "grad_norm": 8.275947570800781, + "learning_rate": 9.952334989354152e-05, + "loss": 0.4787450313568115, + "step": 3360 + }, + { + "epoch": 0.47835344215755854, + "grad_norm": 12.550822257995605, + "learning_rate": 9.952193044712562e-05, + "loss": 0.49250407218933107, + "step": 3370 + }, + { + "epoch": 0.47977288857345635, + "grad_norm": 6.8708176612854, + "learning_rate": 9.952051100070973e-05, + "loss": 0.585394811630249, + "step": 3380 + }, + { + "epoch": 0.48119233498935415, + "grad_norm": 6.129304885864258, + "learning_rate": 9.951909155429383e-05, + "loss": 0.5862763881683349, + "step": 3390 + }, + { + "epoch": 0.48261178140525196, + "grad_norm": 7.1515045166015625, + "learning_rate": 9.951767210787794e-05, + "loss": 0.46143798828125, + "step": 3400 + }, + { + "epoch": 0.48403122782114977, + "grad_norm": 5.421439170837402, + "learning_rate": 9.951625266146204e-05, + "loss": 0.6040849685668945, + "step": 3410 + }, + { + "epoch": 0.4854506742370476, + "grad_norm": 10.418113708496094, + "learning_rate": 9.951483321504614e-05, + "loss": 0.55996732711792, + "step": 3420 + }, + { + "epoch": 0.4868701206529453, + "grad_norm": 9.697559356689453, + "learning_rate": 9.951341376863023e-05, + "loss": 0.5332645893096923, + "step": 3430 + }, + { + "epoch": 0.48828956706884313, + "grad_norm": 9.79345703125, + "learning_rate": 9.951199432221434e-05, + "loss": 0.5983724117279052, + "step": 3440 + }, + { + "epoch": 0.48970901348474094, + "grad_norm": 7.977105617523193, + "learning_rate": 9.951057487579844e-05, + "loss": 0.6096511840820312, + "step": 3450 + }, + { + "epoch": 0.49112845990063875, + "grad_norm": 6.851355075836182, + "learning_rate": 9.950915542938255e-05, + "loss": 0.4748993396759033, + "step": 3460 + }, + { + "epoch": 0.49254790631653655, + "grad_norm": 4.706153392791748, + "learning_rate": 9.950773598296665e-05, + "loss": 0.544727087020874, + "step": 3470 + }, + { + "epoch": 0.49396735273243436, + "grad_norm": 9.061712265014648, + "learning_rate": 9.950631653655075e-05, + "loss": 0.5076655387878418, + "step": 3480 + }, + { + "epoch": 0.49538679914833217, + "grad_norm": 7.619383335113525, + "learning_rate": 9.950489709013486e-05, + "loss": 0.5011069297790527, + "step": 3490 + }, + { + "epoch": 0.49680624556423, + "grad_norm": 6.629651069641113, + "learning_rate": 9.950347764371896e-05, + "loss": 0.5038942337036133, + "step": 3500 + }, + { + "epoch": 0.49680624556423, + "eval_accuracy": 0.8245056272652127, + "eval_loss": 0.526730477809906, + "eval_runtime": 32.5263, + "eval_samples_per_second": 483.517, + "eval_steps_per_second": 15.126, + "step": 3500 + }, + { + "epoch": 0.4982256919801277, + "grad_norm": 6.535589694976807, + "learning_rate": 9.950205819730307e-05, + "loss": 0.5255190849304199, + "step": 3510 + }, + { + "epoch": 0.49964513839602553, + "grad_norm": 10.481846809387207, + "learning_rate": 9.950063875088715e-05, + "loss": 0.4977625846862793, + "step": 3520 + }, + { + "epoch": 0.5010645848119234, + "grad_norm": 6.455493450164795, + "learning_rate": 9.949921930447126e-05, + "loss": 0.4624650955200195, + "step": 3530 + }, + { + "epoch": 0.5024840312278211, + "grad_norm": 12.190658569335938, + "learning_rate": 9.949779985805536e-05, + "loss": 0.45445499420166013, + "step": 3540 + }, + { + "epoch": 0.5039034776437189, + "grad_norm": 6.512971878051758, + "learning_rate": 9.949638041163947e-05, + "loss": 0.48822684288024903, + "step": 3550 + }, + { + "epoch": 0.5053229240596168, + "grad_norm": 8.259076118469238, + "learning_rate": 9.949496096522357e-05, + "loss": 0.4896749496459961, + "step": 3560 + }, + { + "epoch": 0.5067423704755145, + "grad_norm": 10.809083938598633, + "learning_rate": 9.949354151880766e-05, + "loss": 0.5267855644226074, + "step": 3570 + }, + { + "epoch": 0.5081618168914124, + "grad_norm": 11.164665222167969, + "learning_rate": 9.949212207239178e-05, + "loss": 0.6478964328765869, + "step": 3580 + }, + { + "epoch": 0.5095812633073101, + "grad_norm": 10.553145408630371, + "learning_rate": 9.949070262597587e-05, + "loss": 0.5322469711303711, + "step": 3590 + }, + { + "epoch": 0.511000709723208, + "grad_norm": 12.578235626220703, + "learning_rate": 9.948928317955998e-05, + "loss": 0.559388542175293, + "step": 3600 + }, + { + "epoch": 0.5124201561391057, + "grad_norm": 7.2467474937438965, + "learning_rate": 9.948786373314408e-05, + "loss": 0.5477664470672607, + "step": 3610 + }, + { + "epoch": 0.5138396025550035, + "grad_norm": 5.959977626800537, + "learning_rate": 9.948644428672818e-05, + "loss": 0.41798744201660154, + "step": 3620 + }, + { + "epoch": 0.5152590489709014, + "grad_norm": 11.72385025024414, + "learning_rate": 9.948502484031228e-05, + "loss": 0.5879819869995118, + "step": 3630 + }, + { + "epoch": 0.5166784953867991, + "grad_norm": 7.881444454193115, + "learning_rate": 9.948360539389639e-05, + "loss": 0.5005061149597168, + "step": 3640 + }, + { + "epoch": 0.518097941802697, + "grad_norm": 7.005399703979492, + "learning_rate": 9.948218594748048e-05, + "loss": 0.5412337303161621, + "step": 3650 + }, + { + "epoch": 0.5195173882185947, + "grad_norm": 13.495038032531738, + "learning_rate": 9.94807665010646e-05, + "loss": 0.5668565273284912, + "step": 3660 + }, + { + "epoch": 0.5209368346344926, + "grad_norm": 8.42395305633545, + "learning_rate": 9.947934705464869e-05, + "loss": 0.6085368633270264, + "step": 3670 + }, + { + "epoch": 0.5223562810503903, + "grad_norm": 8.754134178161621, + "learning_rate": 9.947792760823279e-05, + "loss": 0.5360457420349121, + "step": 3680 + }, + { + "epoch": 0.5237757274662882, + "grad_norm": 5.868712425231934, + "learning_rate": 9.94765081618169e-05, + "loss": 0.6166606903076172, + "step": 3690 + }, + { + "epoch": 0.525195173882186, + "grad_norm": 4.342434883117676, + "learning_rate": 9.9475088715401e-05, + "loss": 0.4890284061431885, + "step": 3700 + }, + { + "epoch": 0.5266146202980837, + "grad_norm": 8.200478553771973, + "learning_rate": 9.947366926898511e-05, + "loss": 0.5135448455810547, + "step": 3710 + }, + { + "epoch": 0.5280340667139816, + "grad_norm": 6.076674938201904, + "learning_rate": 9.94722498225692e-05, + "loss": 0.37685840129852294, + "step": 3720 + }, + { + "epoch": 0.5294535131298793, + "grad_norm": 8.206668853759766, + "learning_rate": 9.94708303761533e-05, + "loss": 0.43741750717163086, + "step": 3730 + }, + { + "epoch": 0.5308729595457772, + "grad_norm": 8.284717559814453, + "learning_rate": 9.94694109297374e-05, + "loss": 0.46701841354370116, + "step": 3740 + }, + { + "epoch": 0.5322924059616749, + "grad_norm": 8.111977577209473, + "learning_rate": 9.946799148332151e-05, + "loss": 0.5564829349517822, + "step": 3750 + }, + { + "epoch": 0.5337118523775728, + "grad_norm": 10.037016868591309, + "learning_rate": 9.946657203690561e-05, + "loss": 0.4543320655822754, + "step": 3760 + }, + { + "epoch": 0.5351312987934705, + "grad_norm": 6.1391191482543945, + "learning_rate": 9.946515259048972e-05, + "loss": 0.43409008979797364, + "step": 3770 + }, + { + "epoch": 0.5365507452093683, + "grad_norm": 9.031709671020508, + "learning_rate": 9.946373314407382e-05, + "loss": 0.45609292984008787, + "step": 3780 + }, + { + "epoch": 0.5379701916252662, + "grad_norm": 10.507880210876465, + "learning_rate": 9.946231369765791e-05, + "loss": 0.49566287994384767, + "step": 3790 + }, + { + "epoch": 0.5393896380411639, + "grad_norm": 7.94572114944458, + "learning_rate": 9.946089425124203e-05, + "loss": 0.43464975357055663, + "step": 3800 + }, + { + "epoch": 0.5408090844570618, + "grad_norm": 11.292725563049316, + "learning_rate": 9.945947480482612e-05, + "loss": 0.4976043224334717, + "step": 3810 + }, + { + "epoch": 0.5422285308729595, + "grad_norm": 9.720746040344238, + "learning_rate": 9.945805535841023e-05, + "loss": 0.44420394897460935, + "step": 3820 + }, + { + "epoch": 0.5436479772888574, + "grad_norm": 10.859402656555176, + "learning_rate": 9.945663591199432e-05, + "loss": 0.47884187698364256, + "step": 3830 + }, + { + "epoch": 0.5450674237047551, + "grad_norm": 10.234602928161621, + "learning_rate": 9.945521646557843e-05, + "loss": 0.4273094654083252, + "step": 3840 + }, + { + "epoch": 0.5464868701206529, + "grad_norm": 10.073461532592773, + "learning_rate": 9.945379701916253e-05, + "loss": 0.4809098243713379, + "step": 3850 + }, + { + "epoch": 0.5479063165365508, + "grad_norm": 8.402386665344238, + "learning_rate": 9.945237757274664e-05, + "loss": 0.5075035572052002, + "step": 3860 + }, + { + "epoch": 0.5493257629524485, + "grad_norm": 8.385801315307617, + "learning_rate": 9.945095812633073e-05, + "loss": 0.42679743766784667, + "step": 3870 + }, + { + "epoch": 0.5507452093683464, + "grad_norm": 8.214275360107422, + "learning_rate": 9.944953867991483e-05, + "loss": 0.42831969261169434, + "step": 3880 + }, + { + "epoch": 0.5521646557842441, + "grad_norm": 6.777364730834961, + "learning_rate": 9.944811923349894e-05, + "loss": 0.37784249782562257, + "step": 3890 + }, + { + "epoch": 0.553584102200142, + "grad_norm": 7.4766011238098145, + "learning_rate": 9.944669978708304e-05, + "loss": 0.5389047622680664, + "step": 3900 + }, + { + "epoch": 0.5550035486160397, + "grad_norm": 7.167613983154297, + "learning_rate": 9.944528034066715e-05, + "loss": 0.4535686492919922, + "step": 3910 + }, + { + "epoch": 0.5564229950319376, + "grad_norm": 3.94936203956604, + "learning_rate": 9.944386089425125e-05, + "loss": 0.392999267578125, + "step": 3920 + }, + { + "epoch": 0.5578424414478353, + "grad_norm": 7.909378528594971, + "learning_rate": 9.944244144783535e-05, + "loss": 0.4675307750701904, + "step": 3930 + }, + { + "epoch": 0.5592618878637331, + "grad_norm": 8.253449440002441, + "learning_rate": 9.944102200141944e-05, + "loss": 0.515011215209961, + "step": 3940 + }, + { + "epoch": 0.560681334279631, + "grad_norm": 5.535346984863281, + "learning_rate": 9.943960255500355e-05, + "loss": 0.35612196922302247, + "step": 3950 + }, + { + "epoch": 0.5621007806955287, + "grad_norm": 5.621975898742676, + "learning_rate": 9.943818310858765e-05, + "loss": 0.3292267322540283, + "step": 3960 + }, + { + "epoch": 0.5635202271114266, + "grad_norm": 8.432771682739258, + "learning_rate": 9.943676366217176e-05, + "loss": 0.44489707946777346, + "step": 3970 + }, + { + "epoch": 0.5649396735273243, + "grad_norm": 5.422188758850098, + "learning_rate": 9.943534421575586e-05, + "loss": 0.4312278270721436, + "step": 3980 + }, + { + "epoch": 0.5663591199432222, + "grad_norm": 6.463229179382324, + "learning_rate": 9.943392476933996e-05, + "loss": 0.4469761371612549, + "step": 3990 + }, + { + "epoch": 0.5677785663591199, + "grad_norm": 12.039133071899414, + "learning_rate": 9.943250532292407e-05, + "loss": 0.5653008937835693, + "step": 4000 + }, + { + "epoch": 0.5677785663591199, + "eval_accuracy": 0.8540726139759649, + "eval_loss": 0.4309617578983307, + "eval_runtime": 32.698, + "eval_samples_per_second": 480.977, + "eval_steps_per_second": 15.047, + "step": 4000 + }, + { + "epoch": 0.5691980127750177, + "grad_norm": 12.185803413391113, + "learning_rate": 9.943108587650817e-05, + "loss": 0.4458905220031738, + "step": 4010 + }, + { + "epoch": 0.5706174591909156, + "grad_norm": 9.691877365112305, + "learning_rate": 9.942966643009228e-05, + "loss": 0.39486720561981203, + "step": 4020 + }, + { + "epoch": 0.5720369056068133, + "grad_norm": 8.106902122497559, + "learning_rate": 9.942824698367637e-05, + "loss": 0.49444093704223635, + "step": 4030 + }, + { + "epoch": 0.5734563520227112, + "grad_norm": 9.14234447479248, + "learning_rate": 9.942682753726047e-05, + "loss": 0.43038039207458495, + "step": 4040 + }, + { + "epoch": 0.5748757984386089, + "grad_norm": 4.6097588539123535, + "learning_rate": 9.942540809084457e-05, + "loss": 0.4118741512298584, + "step": 4050 + }, + { + "epoch": 0.5762952448545068, + "grad_norm": 6.0909881591796875, + "learning_rate": 9.942398864442868e-05, + "loss": 0.4245272159576416, + "step": 4060 + }, + { + "epoch": 0.5777146912704045, + "grad_norm": 10.82681941986084, + "learning_rate": 9.942256919801278e-05, + "loss": 0.43029065132141114, + "step": 4070 + }, + { + "epoch": 0.5791341376863024, + "grad_norm": 7.2398481369018555, + "learning_rate": 9.942114975159689e-05, + "loss": 0.4268779277801514, + "step": 4080 + }, + { + "epoch": 0.5805535841022001, + "grad_norm": 12.160025596618652, + "learning_rate": 9.941973030518099e-05, + "loss": 0.45933380126953127, + "step": 4090 + }, + { + "epoch": 0.5819730305180979, + "grad_norm": 8.116787910461426, + "learning_rate": 9.941831085876508e-05, + "loss": 0.3810285568237305, + "step": 4100 + }, + { + "epoch": 0.5833924769339958, + "grad_norm": 7.5045037269592285, + "learning_rate": 9.94168914123492e-05, + "loss": 0.48673238754272463, + "step": 4110 + }, + { + "epoch": 0.5848119233498935, + "grad_norm": 10.9375581741333, + "learning_rate": 9.941547196593329e-05, + "loss": 0.5465658664703369, + "step": 4120 + }, + { + "epoch": 0.5862313697657914, + "grad_norm": 9.211751937866211, + "learning_rate": 9.94140525195174e-05, + "loss": 0.46831202507019043, + "step": 4130 + }, + { + "epoch": 0.5876508161816891, + "grad_norm": 7.636734485626221, + "learning_rate": 9.941263307310149e-05, + "loss": 0.3928233623504639, + "step": 4140 + }, + { + "epoch": 0.589070262597587, + "grad_norm": 7.125626564025879, + "learning_rate": 9.94112136266856e-05, + "loss": 0.40816364288330076, + "step": 4150 + }, + { + "epoch": 0.5904897090134847, + "grad_norm": 5.0693888664245605, + "learning_rate": 9.94097941802697e-05, + "loss": 0.3931445837020874, + "step": 4160 + }, + { + "epoch": 0.5919091554293825, + "grad_norm": 8.10261058807373, + "learning_rate": 9.94083747338538e-05, + "loss": 0.4498757839202881, + "step": 4170 + }, + { + "epoch": 0.5933286018452804, + "grad_norm": 9.593578338623047, + "learning_rate": 9.94069552874379e-05, + "loss": 0.46671414375305176, + "step": 4180 + }, + { + "epoch": 0.5947480482611781, + "grad_norm": 10.025617599487305, + "learning_rate": 9.9405535841022e-05, + "loss": 0.42932772636413574, + "step": 4190 + }, + { + "epoch": 0.596167494677076, + "grad_norm": 9.828198432922363, + "learning_rate": 9.940411639460611e-05, + "loss": 0.4723196506500244, + "step": 4200 + }, + { + "epoch": 0.5975869410929737, + "grad_norm": 7.570648193359375, + "learning_rate": 9.940269694819021e-05, + "loss": 0.4049358367919922, + "step": 4210 + }, + { + "epoch": 0.5990063875088716, + "grad_norm": 6.280502796173096, + "learning_rate": 9.940127750177432e-05, + "loss": 0.4247574806213379, + "step": 4220 + }, + { + "epoch": 0.6004258339247693, + "grad_norm": 8.619515419006348, + "learning_rate": 9.939985805535842e-05, + "loss": 0.3778993606567383, + "step": 4230 + }, + { + "epoch": 0.6018452803406671, + "grad_norm": 7.0030059814453125, + "learning_rate": 9.939843860894251e-05, + "loss": 0.4360033988952637, + "step": 4240 + }, + { + "epoch": 0.603264726756565, + "grad_norm": 6.206148624420166, + "learning_rate": 9.939701916252661e-05, + "loss": 0.4114119529724121, + "step": 4250 + }, + { + "epoch": 0.6046841731724627, + "grad_norm": 4.982306003570557, + "learning_rate": 9.939559971611072e-05, + "loss": 0.36019244194030764, + "step": 4260 + }, + { + "epoch": 0.6061036195883606, + "grad_norm": 7.193652153015137, + "learning_rate": 9.939418026969482e-05, + "loss": 0.47596259117126466, + "step": 4270 + }, + { + "epoch": 0.6075230660042583, + "grad_norm": 9.371147155761719, + "learning_rate": 9.939276082327893e-05, + "loss": 0.42912769317626953, + "step": 4280 + }, + { + "epoch": 0.6089425124201562, + "grad_norm": 8.962141036987305, + "learning_rate": 9.939134137686303e-05, + "loss": 0.42342243194580076, + "step": 4290 + }, + { + "epoch": 0.6103619588360539, + "grad_norm": 7.575186252593994, + "learning_rate": 9.938992193044712e-05, + "loss": 0.4972747802734375, + "step": 4300 + }, + { + "epoch": 0.6117814052519518, + "grad_norm": 6.965094566345215, + "learning_rate": 9.938850248403124e-05, + "loss": 0.3489841938018799, + "step": 4310 + }, + { + "epoch": 0.6132008516678495, + "grad_norm": 8.466391563415527, + "learning_rate": 9.938708303761533e-05, + "loss": 0.3544389009475708, + "step": 4320 + }, + { + "epoch": 0.6146202980837473, + "grad_norm": 6.5821123123168945, + "learning_rate": 9.938566359119944e-05, + "loss": 0.4732979297637939, + "step": 4330 + }, + { + "epoch": 0.6160397444996452, + "grad_norm": 6.803234100341797, + "learning_rate": 9.938424414478353e-05, + "loss": 0.39069912433624265, + "step": 4340 + }, + { + "epoch": 0.6174591909155429, + "grad_norm": 10.069840431213379, + "learning_rate": 9.938282469836764e-05, + "loss": 0.4592564582824707, + "step": 4350 + }, + { + "epoch": 0.6188786373314408, + "grad_norm": 9.41560173034668, + "learning_rate": 9.938140525195174e-05, + "loss": 0.45508289337158203, + "step": 4360 + }, + { + "epoch": 0.6202980837473385, + "grad_norm": 8.344886779785156, + "learning_rate": 9.937998580553585e-05, + "loss": 0.41198153495788575, + "step": 4370 + }, + { + "epoch": 0.6217175301632364, + "grad_norm": 9.129981994628906, + "learning_rate": 9.937856635911994e-05, + "loss": 0.33535902500152587, + "step": 4380 + }, + { + "epoch": 0.6231369765791341, + "grad_norm": 6.8436455726623535, + "learning_rate": 9.937714691270406e-05, + "loss": 0.3965883493423462, + "step": 4390 + }, + { + "epoch": 0.6245564229950319, + "grad_norm": 6.954466342926025, + "learning_rate": 9.937572746628815e-05, + "loss": 0.3352261304855347, + "step": 4400 + }, + { + "epoch": 0.6259758694109298, + "grad_norm": 8.227835655212402, + "learning_rate": 9.937430801987225e-05, + "loss": 0.4205745220184326, + "step": 4410 + }, + { + "epoch": 0.6273953158268275, + "grad_norm": 8.202418327331543, + "learning_rate": 9.937288857345636e-05, + "loss": 0.3927265405654907, + "step": 4420 + }, + { + "epoch": 0.6288147622427254, + "grad_norm": 9.406537055969238, + "learning_rate": 9.937146912704046e-05, + "loss": 0.4481183052062988, + "step": 4430 + }, + { + "epoch": 0.6302342086586231, + "grad_norm": 8.330412864685059, + "learning_rate": 9.937004968062457e-05, + "loss": 0.37030580043792727, + "step": 4440 + }, + { + "epoch": 0.631653655074521, + "grad_norm": 5.601277828216553, + "learning_rate": 9.936863023420865e-05, + "loss": 0.35557353496551514, + "step": 4450 + }, + { + "epoch": 0.6330731014904187, + "grad_norm": 11.551403999328613, + "learning_rate": 9.936721078779276e-05, + "loss": 0.3718759536743164, + "step": 4460 + }, + { + "epoch": 0.6344925479063165, + "grad_norm": 5.961857318878174, + "learning_rate": 9.936579134137686e-05, + "loss": 0.3828912258148193, + "step": 4470 + }, + { + "epoch": 0.6359119943222143, + "grad_norm": 6.173798561096191, + "learning_rate": 9.936437189496097e-05, + "loss": 0.392284631729126, + "step": 4480 + }, + { + "epoch": 0.6373314407381121, + "grad_norm": 8.952240943908691, + "learning_rate": 9.936295244854508e-05, + "loss": 0.41978960037231444, + "step": 4490 + }, + { + "epoch": 0.63875088715401, + "grad_norm": 9.86811637878418, + "learning_rate": 9.936153300212917e-05, + "loss": 0.42105417251586913, + "step": 4500 + }, + { + "epoch": 0.63875088715401, + "eval_accuracy": 0.8225344948178293, + "eval_loss": 0.5203356146812439, + "eval_runtime": 33.0374, + "eval_samples_per_second": 476.036, + "eval_steps_per_second": 14.892, + "step": 4500 + }, + { + "epoch": 0.6401703335699077, + "grad_norm": 10.036981582641602, + "learning_rate": 9.936011355571328e-05, + "loss": 0.41321401596069335, + "step": 4510 + }, + { + "epoch": 0.6415897799858056, + "grad_norm": 6.618304252624512, + "learning_rate": 9.935869410929738e-05, + "loss": 0.43657841682434084, + "step": 4520 + }, + { + "epoch": 0.6430092264017033, + "grad_norm": 9.975127220153809, + "learning_rate": 9.935727466288149e-05, + "loss": 0.3949880838394165, + "step": 4530 + }, + { + "epoch": 0.6444286728176012, + "grad_norm": 8.210672378540039, + "learning_rate": 9.935585521646558e-05, + "loss": 0.4280043125152588, + "step": 4540 + }, + { + "epoch": 0.6458481192334989, + "grad_norm": 12.055879592895508, + "learning_rate": 9.935443577004968e-05, + "loss": 0.39465947151184083, + "step": 4550 + }, + { + "epoch": 0.6472675656493967, + "grad_norm": 7.540829658508301, + "learning_rate": 9.935301632363378e-05, + "loss": 0.3965680837631226, + "step": 4560 + }, + { + "epoch": 0.6486870120652946, + "grad_norm": 9.717781066894531, + "learning_rate": 9.935159687721789e-05, + "loss": 0.40194106101989746, + "step": 4570 + }, + { + "epoch": 0.6501064584811923, + "grad_norm": 10.271167755126953, + "learning_rate": 9.9350177430802e-05, + "loss": 0.4726293087005615, + "step": 4580 + }, + { + "epoch": 0.6515259048970902, + "grad_norm": 7.158174514770508, + "learning_rate": 9.93487579843861e-05, + "loss": 0.40993413925170896, + "step": 4590 + }, + { + "epoch": 0.6529453513129879, + "grad_norm": 10.536994934082031, + "learning_rate": 9.93473385379702e-05, + "loss": 0.4424222469329834, + "step": 4600 + }, + { + "epoch": 0.6543647977288858, + "grad_norm": 7.256109714508057, + "learning_rate": 9.934591909155429e-05, + "loss": 0.388359522819519, + "step": 4610 + }, + { + "epoch": 0.6557842441447835, + "grad_norm": 8.278726577758789, + "learning_rate": 9.93444996451384e-05, + "loss": 0.3513230085372925, + "step": 4620 + }, + { + "epoch": 0.6572036905606813, + "grad_norm": 7.767818927764893, + "learning_rate": 9.93430801987225e-05, + "loss": 0.42050671577453613, + "step": 4630 + }, + { + "epoch": 0.6586231369765791, + "grad_norm": 3.4903321266174316, + "learning_rate": 9.934166075230661e-05, + "loss": 0.3255154609680176, + "step": 4640 + }, + { + "epoch": 0.6600425833924769, + "grad_norm": 8.193768501281738, + "learning_rate": 9.93402413058907e-05, + "loss": 0.34639596939086914, + "step": 4650 + }, + { + "epoch": 0.6614620298083748, + "grad_norm": 6.168176651000977, + "learning_rate": 9.93388218594748e-05, + "loss": 0.3619822025299072, + "step": 4660 + }, + { + "epoch": 0.6628814762242725, + "grad_norm": 4.793501853942871, + "learning_rate": 9.933740241305892e-05, + "loss": 0.3441330909729004, + "step": 4670 + }, + { + "epoch": 0.6643009226401704, + "grad_norm": 7.100066184997559, + "learning_rate": 9.933598296664301e-05, + "loss": 0.41966400146484373, + "step": 4680 + }, + { + "epoch": 0.6657203690560681, + "grad_norm": 8.032003402709961, + "learning_rate": 9.933456352022713e-05, + "loss": 0.39086959362030027, + "step": 4690 + }, + { + "epoch": 0.6671398154719659, + "grad_norm": 5.533408164978027, + "learning_rate": 9.933314407381121e-05, + "loss": 0.455733060836792, + "step": 4700 + }, + { + "epoch": 0.6685592618878637, + "grad_norm": 6.478943347930908, + "learning_rate": 9.933172462739532e-05, + "loss": 0.3870114326477051, + "step": 4710 + }, + { + "epoch": 0.6699787083037615, + "grad_norm": 8.963722229003906, + "learning_rate": 9.933030518097942e-05, + "loss": 0.4041899681091309, + "step": 4720 + }, + { + "epoch": 0.6713981547196594, + "grad_norm": 4.072963714599609, + "learning_rate": 9.932888573456353e-05, + "loss": 0.35542023181915283, + "step": 4730 + }, + { + "epoch": 0.6728176011355571, + "grad_norm": 6.834389686584473, + "learning_rate": 9.932746628814763e-05, + "loss": 0.34830470085144044, + "step": 4740 + }, + { + "epoch": 0.674237047551455, + "grad_norm": 7.003122329711914, + "learning_rate": 9.932604684173174e-05, + "loss": 0.3465887069702148, + "step": 4750 + }, + { + "epoch": 0.6756564939673527, + "grad_norm": 8.914156913757324, + "learning_rate": 9.932462739531583e-05, + "loss": 0.44321861267089846, + "step": 4760 + }, + { + "epoch": 0.6770759403832506, + "grad_norm": 7.6024627685546875, + "learning_rate": 9.932320794889993e-05, + "loss": 0.40067334175109864, + "step": 4770 + }, + { + "epoch": 0.6784953867991483, + "grad_norm": 8.667821884155273, + "learning_rate": 9.932178850248404e-05, + "loss": 0.371229887008667, + "step": 4780 + }, + { + "epoch": 0.6799148332150461, + "grad_norm": 9.355796813964844, + "learning_rate": 9.932036905606814e-05, + "loss": 0.3920291900634766, + "step": 4790 + }, + { + "epoch": 0.681334279630944, + "grad_norm": 6.767845153808594, + "learning_rate": 9.931894960965225e-05, + "loss": 0.3848612070083618, + "step": 4800 + }, + { + "epoch": 0.6827537260468417, + "grad_norm": 8.195937156677246, + "learning_rate": 9.931767210787794e-05, + "loss": 0.5190616607666015, + "step": 4810 + }, + { + "epoch": 0.6841731724627396, + "grad_norm": 6.033681869506836, + "learning_rate": 9.931625266146203e-05, + "loss": 0.39132606983184814, + "step": 4820 + }, + { + "epoch": 0.6855926188786373, + "grad_norm": 8.469270706176758, + "learning_rate": 9.931483321504613e-05, + "loss": 0.3626258850097656, + "step": 4830 + }, + { + "epoch": 0.6870120652945352, + "grad_norm": 4.255542278289795, + "learning_rate": 9.931341376863024e-05, + "loss": 0.31856842041015626, + "step": 4840 + }, + { + "epoch": 0.6884315117104329, + "grad_norm": 9.191469192504883, + "learning_rate": 9.931199432221434e-05, + "loss": 0.3280362367630005, + "step": 4850 + }, + { + "epoch": 0.6898509581263307, + "grad_norm": 8.94046688079834, + "learning_rate": 9.931057487579845e-05, + "loss": 0.39851620197296145, + "step": 4860 + }, + { + "epoch": 0.6912704045422285, + "grad_norm": 7.770534992218018, + "learning_rate": 9.930915542938255e-05, + "loss": 0.33825528621673584, + "step": 4870 + }, + { + "epoch": 0.6926898509581263, + "grad_norm": 6.560062885284424, + "learning_rate": 9.930773598296664e-05, + "loss": 0.35839481353759767, + "step": 4880 + }, + { + "epoch": 0.6941092973740242, + "grad_norm": 9.24365520477295, + "learning_rate": 9.930631653655074e-05, + "loss": 0.39770119190216063, + "step": 4890 + }, + { + "epoch": 0.6955287437899219, + "grad_norm": 11.744332313537598, + "learning_rate": 9.930489709013485e-05, + "loss": 0.4902297019958496, + "step": 4900 + }, + { + "epoch": 0.6969481902058198, + "grad_norm": 7.251524448394775, + "learning_rate": 9.930347764371895e-05, + "loss": 0.40317511558532715, + "step": 4910 + }, + { + "epoch": 0.6983676366217175, + "grad_norm": 8.896724700927734, + "learning_rate": 9.930205819730306e-05, + "loss": 0.44049978256225586, + "step": 4920 + }, + { + "epoch": 0.6997870830376153, + "grad_norm": 7.477156162261963, + "learning_rate": 9.930063875088716e-05, + "loss": 0.3586245536804199, + "step": 4930 + }, + { + "epoch": 0.7012065294535131, + "grad_norm": 6.159836769104004, + "learning_rate": 9.929921930447126e-05, + "loss": 0.32783629894256594, + "step": 4940 + }, + { + "epoch": 0.7026259758694109, + "grad_norm": 6.85299825668335, + "learning_rate": 9.929779985805537e-05, + "loss": 0.30911822319030763, + "step": 4950 + }, + { + "epoch": 0.7040454222853088, + "grad_norm": 7.820040225982666, + "learning_rate": 9.929638041163946e-05, + "loss": 0.36734838485717775, + "step": 4960 + }, + { + "epoch": 0.7054648687012065, + "grad_norm": 6.66180944442749, + "learning_rate": 9.929496096522358e-05, + "loss": 0.37120904922485354, + "step": 4970 + }, + { + "epoch": 0.7068843151171044, + "grad_norm": 7.3861775398254395, + "learning_rate": 9.929354151880766e-05, + "loss": 0.4064349174499512, + "step": 4980 + }, + { + "epoch": 0.7083037615330021, + "grad_norm": 7.068629741668701, + "learning_rate": 9.929212207239177e-05, + "loss": 0.36406426429748534, + "step": 4990 + }, + { + "epoch": 0.7097232079489, + "grad_norm": 7.482442378997803, + "learning_rate": 9.929070262597587e-05, + "loss": 0.40763154029846194, + "step": 5000 + }, + { + "epoch": 0.7097232079489, + "eval_accuracy": 0.8707954473198957, + "eval_loss": 0.37987253069877625, + "eval_runtime": 33.0642, + "eval_samples_per_second": 475.651, + "eval_steps_per_second": 14.88, + "step": 5000 + }, + { + "epoch": 0.7111426543647977, + "grad_norm": 5.368759632110596, + "learning_rate": 9.928928317955998e-05, + "loss": 0.40792322158813477, + "step": 5010 + }, + { + "epoch": 0.7125621007806955, + "grad_norm": 3.8395280838012695, + "learning_rate": 9.928786373314408e-05, + "loss": 0.45433621406555175, + "step": 5020 + }, + { + "epoch": 0.7139815471965933, + "grad_norm": 7.884678840637207, + "learning_rate": 9.928644428672817e-05, + "loss": 0.3092354774475098, + "step": 5030 + }, + { + "epoch": 0.7154009936124911, + "grad_norm": 9.11925983428955, + "learning_rate": 9.928502484031228e-05, + "loss": 0.3887113094329834, + "step": 5040 + }, + { + "epoch": 0.716820440028389, + "grad_norm": 8.5901517868042, + "learning_rate": 9.928360539389638e-05, + "loss": 0.3938072443008423, + "step": 5050 + }, + { + "epoch": 0.7182398864442867, + "grad_norm": 4.011209011077881, + "learning_rate": 9.928218594748049e-05, + "loss": 0.3140719890594482, + "step": 5060 + }, + { + "epoch": 0.7196593328601846, + "grad_norm": 9.04295825958252, + "learning_rate": 9.928076650106459e-05, + "loss": 0.37023751735687255, + "step": 5070 + }, + { + "epoch": 0.7210787792760823, + "grad_norm": 7.336644649505615, + "learning_rate": 9.92793470546487e-05, + "loss": 0.3326029539108276, + "step": 5080 + }, + { + "epoch": 0.7224982256919801, + "grad_norm": 6.824075698852539, + "learning_rate": 9.927792760823278e-05, + "loss": 0.31377925872802737, + "step": 5090 + }, + { + "epoch": 0.7239176721078779, + "grad_norm": 6.152795314788818, + "learning_rate": 9.92765081618169e-05, + "loss": 0.4362512111663818, + "step": 5100 + }, + { + "epoch": 0.7253371185237757, + "grad_norm": 7.997036457061768, + "learning_rate": 9.927508871540099e-05, + "loss": 0.39910459518432617, + "step": 5110 + }, + { + "epoch": 0.7267565649396736, + "grad_norm": 7.5024309158325195, + "learning_rate": 9.92736692689851e-05, + "loss": 0.3690288305282593, + "step": 5120 + }, + { + "epoch": 0.7281760113555713, + "grad_norm": 9.340811729431152, + "learning_rate": 9.92722498225692e-05, + "loss": 0.28037595748901367, + "step": 5130 + }, + { + "epoch": 0.7295954577714692, + "grad_norm": 6.796107292175293, + "learning_rate": 9.92708303761533e-05, + "loss": 0.2862435817718506, + "step": 5140 + }, + { + "epoch": 0.7310149041873669, + "grad_norm": 6.0283379554748535, + "learning_rate": 9.926941092973741e-05, + "loss": 0.351378345489502, + "step": 5150 + }, + { + "epoch": 0.7324343506032647, + "grad_norm": 6.880161762237549, + "learning_rate": 9.926799148332151e-05, + "loss": 0.3127347230911255, + "step": 5160 + }, + { + "epoch": 0.7338537970191625, + "grad_norm": 7.761416912078857, + "learning_rate": 9.926657203690562e-05, + "loss": 0.3232876777648926, + "step": 5170 + }, + { + "epoch": 0.7352732434350603, + "grad_norm": 8.840635299682617, + "learning_rate": 9.926515259048972e-05, + "loss": 0.36195032596588134, + "step": 5180 + }, + { + "epoch": 0.7366926898509581, + "grad_norm": 10.067350387573242, + "learning_rate": 9.926373314407381e-05, + "loss": 0.33318257331848145, + "step": 5190 + }, + { + "epoch": 0.7381121362668559, + "grad_norm": 4.935089111328125, + "learning_rate": 9.926231369765791e-05, + "loss": 0.3263442039489746, + "step": 5200 + }, + { + "epoch": 0.7395315826827538, + "grad_norm": 6.868301868438721, + "learning_rate": 9.926089425124202e-05, + "loss": 0.4087569236755371, + "step": 5210 + }, + { + "epoch": 0.7409510290986515, + "grad_norm": 7.978097915649414, + "learning_rate": 9.925947480482612e-05, + "loss": 0.33616573810577394, + "step": 5220 + }, + { + "epoch": 0.7423704755145494, + "grad_norm": 11.391094207763672, + "learning_rate": 9.925805535841023e-05, + "loss": 0.33483550548553465, + "step": 5230 + }, + { + "epoch": 0.7437899219304471, + "grad_norm": 5.558361530303955, + "learning_rate": 9.925663591199433e-05, + "loss": 0.38994641304016114, + "step": 5240 + }, + { + "epoch": 0.7452093683463449, + "grad_norm": 2.6022746562957764, + "learning_rate": 9.925521646557842e-05, + "loss": 0.2801194429397583, + "step": 5250 + }, + { + "epoch": 0.7466288147622427, + "grad_norm": 10.395146369934082, + "learning_rate": 9.925379701916253e-05, + "loss": 0.45772466659545896, + "step": 5260 + }, + { + "epoch": 0.7480482611781405, + "grad_norm": 10.162497520446777, + "learning_rate": 9.925237757274663e-05, + "loss": 0.3906741142272949, + "step": 5270 + }, + { + "epoch": 0.7494677075940384, + "grad_norm": 7.618703365325928, + "learning_rate": 9.925095812633074e-05, + "loss": 0.3549813747406006, + "step": 5280 + }, + { + "epoch": 0.7508871540099361, + "grad_norm": 6.407444953918457, + "learning_rate": 9.924953867991483e-05, + "loss": 0.3040858268737793, + "step": 5290 + }, + { + "epoch": 0.752306600425834, + "grad_norm": 7.738057613372803, + "learning_rate": 9.924811923349894e-05, + "loss": 0.39499850273132325, + "step": 5300 + }, + { + "epoch": 0.7537260468417317, + "grad_norm": 7.237374782562256, + "learning_rate": 9.924669978708304e-05, + "loss": 0.3085558652877808, + "step": 5310 + }, + { + "epoch": 0.7551454932576295, + "grad_norm": 6.442776203155518, + "learning_rate": 9.924528034066715e-05, + "loss": 0.40102262496948243, + "step": 5320 + }, + { + "epoch": 0.7565649396735273, + "grad_norm": 10.280111312866211, + "learning_rate": 9.924386089425126e-05, + "loss": 0.3338863611221313, + "step": 5330 + }, + { + "epoch": 0.7579843860894251, + "grad_norm": 8.590238571166992, + "learning_rate": 9.924244144783534e-05, + "loss": 0.48393831253051756, + "step": 5340 + }, + { + "epoch": 0.759403832505323, + "grad_norm": 4.818009376525879, + "learning_rate": 9.924102200141945e-05, + "loss": 0.31519811153411864, + "step": 5350 + }, + { + "epoch": 0.7608232789212207, + "grad_norm": 7.284486293792725, + "learning_rate": 9.923960255500355e-05, + "loss": 0.3537211179733276, + "step": 5360 + }, + { + "epoch": 0.7622427253371186, + "grad_norm": 8.618793487548828, + "learning_rate": 9.923818310858766e-05, + "loss": 0.34086947441101073, + "step": 5370 + }, + { + "epoch": 0.7636621717530163, + "grad_norm": 8.162178039550781, + "learning_rate": 9.923676366217176e-05, + "loss": 0.38811311721801756, + "step": 5380 + }, + { + "epoch": 0.7650816181689141, + "grad_norm": 7.360818386077881, + "learning_rate": 9.923534421575587e-05, + "loss": 0.30603010654449464, + "step": 5390 + }, + { + "epoch": 0.7665010645848119, + "grad_norm": 4.011861801147461, + "learning_rate": 9.923392476933995e-05, + "loss": 0.23683266639709472, + "step": 5400 + }, + { + "epoch": 0.7679205110007097, + "grad_norm": 5.943147659301758, + "learning_rate": 9.923250532292406e-05, + "loss": 0.34063313007354734, + "step": 5410 + }, + { + "epoch": 0.7693399574166075, + "grad_norm": 7.751121997833252, + "learning_rate": 9.923108587650817e-05, + "loss": 0.36524248123168945, + "step": 5420 + }, + { + "epoch": 0.7707594038325053, + "grad_norm": 8.413863182067871, + "learning_rate": 9.922966643009227e-05, + "loss": 0.3002290725708008, + "step": 5430 + }, + { + "epoch": 0.7721788502484032, + "grad_norm": 7.4792280197143555, + "learning_rate": 9.922824698367638e-05, + "loss": 0.2858253240585327, + "step": 5440 + }, + { + "epoch": 0.7735982966643009, + "grad_norm": 4.943634986877441, + "learning_rate": 9.922682753726047e-05, + "loss": 0.3922913074493408, + "step": 5450 + }, + { + "epoch": 0.7750177430801988, + "grad_norm": 9.556757926940918, + "learning_rate": 9.922540809084458e-05, + "loss": 0.32624542713165283, + "step": 5460 + }, + { + "epoch": 0.7764371894960965, + "grad_norm": 6.306029319763184, + "learning_rate": 9.922398864442867e-05, + "loss": 0.32522106170654297, + "step": 5470 + }, + { + "epoch": 0.7778566359119943, + "grad_norm": 9.622481346130371, + "learning_rate": 9.922256919801279e-05, + "loss": 0.32840585708618164, + "step": 5480 + }, + { + "epoch": 0.7792760823278921, + "grad_norm": 6.480415344238281, + "learning_rate": 9.922114975159688e-05, + "loss": 0.31494650840759275, + "step": 5490 + }, + { + "epoch": 0.7806955287437899, + "grad_norm": 9.822346687316895, + "learning_rate": 9.921973030518098e-05, + "loss": 0.3520227909088135, + "step": 5500 + }, + { + "epoch": 0.7806955287437899, + "eval_accuracy": 0.8887263940993196, + "eval_loss": 0.331625759601593, + "eval_runtime": 33.1217, + "eval_samples_per_second": 474.825, + "eval_steps_per_second": 14.854, + "step": 5500 + }, + { + "epoch": 0.7821149751596878, + "grad_norm": 8.544402122497559, + "learning_rate": 9.921831085876508e-05, + "loss": 0.3386709451675415, + "step": 5510 + }, + { + "epoch": 0.7835344215755855, + "grad_norm": 6.877591133117676, + "learning_rate": 9.921689141234919e-05, + "loss": 0.3577073574066162, + "step": 5520 + }, + { + "epoch": 0.7849538679914834, + "grad_norm": 8.182839393615723, + "learning_rate": 9.92154719659333e-05, + "loss": 0.33861188888549804, + "step": 5530 + }, + { + "epoch": 0.7863733144073811, + "grad_norm": 7.762393474578857, + "learning_rate": 9.92140525195174e-05, + "loss": 0.2913277387619019, + "step": 5540 + }, + { + "epoch": 0.7877927608232789, + "grad_norm": 9.238672256469727, + "learning_rate": 9.92126330731015e-05, + "loss": 0.27555758953094484, + "step": 5550 + }, + { + "epoch": 0.7892122072391767, + "grad_norm": 8.316729545593262, + "learning_rate": 9.921121362668559e-05, + "loss": 0.3221546411514282, + "step": 5560 + }, + { + "epoch": 0.7906316536550745, + "grad_norm": 5.685539245605469, + "learning_rate": 9.92097941802697e-05, + "loss": 0.335821533203125, + "step": 5570 + }, + { + "epoch": 0.7920511000709723, + "grad_norm": 9.121819496154785, + "learning_rate": 9.92083747338538e-05, + "loss": 0.41519789695739745, + "step": 5580 + }, + { + "epoch": 0.7934705464868701, + "grad_norm": 10.83812141418457, + "learning_rate": 9.920695528743791e-05, + "loss": 0.30081839561462403, + "step": 5590 + }, + { + "epoch": 0.794889992902768, + "grad_norm": 3.7030341625213623, + "learning_rate": 9.9205535841022e-05, + "loss": 0.3369245767593384, + "step": 5600 + }, + { + "epoch": 0.7963094393186657, + "grad_norm": 3.8987886905670166, + "learning_rate": 9.92041163946061e-05, + "loss": 0.3294223785400391, + "step": 5610 + }, + { + "epoch": 0.7977288857345636, + "grad_norm": 4.1831207275390625, + "learning_rate": 9.920269694819022e-05, + "loss": 0.2734922170639038, + "step": 5620 + }, + { + "epoch": 0.7991483321504613, + "grad_norm": 7.363320827484131, + "learning_rate": 9.920127750177431e-05, + "loss": 0.3629761219024658, + "step": 5630 + }, + { + "epoch": 0.8005677785663591, + "grad_norm": 3.947075366973877, + "learning_rate": 9.919985805535842e-05, + "loss": 0.24655752182006835, + "step": 5640 + }, + { + "epoch": 0.8019872249822569, + "grad_norm": 7.183192253112793, + "learning_rate": 9.919843860894251e-05, + "loss": 0.3074009895324707, + "step": 5650 + }, + { + "epoch": 0.8034066713981547, + "grad_norm": 9.004253387451172, + "learning_rate": 9.919701916252662e-05, + "loss": 0.38861281871795655, + "step": 5660 + }, + { + "epoch": 0.8048261178140526, + "grad_norm": 7.553649425506592, + "learning_rate": 9.919559971611072e-05, + "loss": 0.4247180461883545, + "step": 5670 + }, + { + "epoch": 0.8062455642299503, + "grad_norm": 6.382741928100586, + "learning_rate": 9.919418026969483e-05, + "loss": 0.304930305480957, + "step": 5680 + }, + { + "epoch": 0.8076650106458482, + "grad_norm": 5.102434158325195, + "learning_rate": 9.919276082327893e-05, + "loss": 0.38076980113983155, + "step": 5690 + }, + { + "epoch": 0.8090844570617459, + "grad_norm": 6.131350517272949, + "learning_rate": 9.919134137686302e-05, + "loss": 0.40895967483520507, + "step": 5700 + }, + { + "epoch": 0.8105039034776437, + "grad_norm": 7.717721939086914, + "learning_rate": 9.918992193044713e-05, + "loss": 0.34289727210998533, + "step": 5710 + }, + { + "epoch": 0.8119233498935415, + "grad_norm": 7.452071189880371, + "learning_rate": 9.918850248403123e-05, + "loss": 0.26248266696929934, + "step": 5720 + }, + { + "epoch": 0.8133427963094393, + "grad_norm": 4.934199333190918, + "learning_rate": 9.918708303761534e-05, + "loss": 0.2918365478515625, + "step": 5730 + }, + { + "epoch": 0.8147622427253371, + "grad_norm": 3.497220993041992, + "learning_rate": 9.918566359119944e-05, + "loss": 0.27859480381011964, + "step": 5740 + }, + { + "epoch": 0.8161816891412349, + "grad_norm": 9.320852279663086, + "learning_rate": 9.918424414478355e-05, + "loss": 0.34371328353881836, + "step": 5750 + }, + { + "epoch": 0.8176011355571328, + "grad_norm": 10.081619262695312, + "learning_rate": 9.918282469836763e-05, + "loss": 0.36181211471557617, + "step": 5760 + }, + { + "epoch": 0.8190205819730305, + "grad_norm": 7.466938018798828, + "learning_rate": 9.918140525195174e-05, + "loss": 0.34078028202056887, + "step": 5770 + }, + { + "epoch": 0.8204400283889283, + "grad_norm": 4.303114414215088, + "learning_rate": 9.917998580553584e-05, + "loss": 0.34729723930358886, + "step": 5780 + }, + { + "epoch": 0.8218594748048261, + "grad_norm": 9.38592529296875, + "learning_rate": 9.917856635911995e-05, + "loss": 0.4285425662994385, + "step": 5790 + }, + { + "epoch": 0.8232789212207239, + "grad_norm": 9.465388298034668, + "learning_rate": 9.917714691270405e-05, + "loss": 0.3501663446426392, + "step": 5800 + }, + { + "epoch": 0.8246983676366217, + "grad_norm": 5.500204086303711, + "learning_rate": 9.917572746628815e-05, + "loss": 0.3102808952331543, + "step": 5810 + }, + { + "epoch": 0.8261178140525195, + "grad_norm": 4.572218894958496, + "learning_rate": 9.917430801987226e-05, + "loss": 0.2433872938156128, + "step": 5820 + }, + { + "epoch": 0.8275372604684174, + "grad_norm": 9.858591079711914, + "learning_rate": 9.917288857345636e-05, + "loss": 0.30695419311523436, + "step": 5830 + }, + { + "epoch": 0.8289567068843151, + "grad_norm": 6.843176364898682, + "learning_rate": 9.917146912704047e-05, + "loss": 0.35634801387786863, + "step": 5840 + }, + { + "epoch": 0.830376153300213, + "grad_norm": 10.634949684143066, + "learning_rate": 9.917004968062456e-05, + "loss": 0.3107039451599121, + "step": 5850 + }, + { + "epoch": 0.8317955997161107, + "grad_norm": 8.44272518157959, + "learning_rate": 9.916863023420866e-05, + "loss": 0.3672316551208496, + "step": 5860 + }, + { + "epoch": 0.8332150461320085, + "grad_norm": 5.4848785400390625, + "learning_rate": 9.916721078779276e-05, + "loss": 0.4015390872955322, + "step": 5870 + }, + { + "epoch": 0.8346344925479063, + "grad_norm": 7.271710395812988, + "learning_rate": 9.916579134137687e-05, + "loss": 0.23676373958587646, + "step": 5880 + }, + { + "epoch": 0.8360539389638041, + "grad_norm": 4.376358509063721, + "learning_rate": 9.916437189496097e-05, + "loss": 0.2711988687515259, + "step": 5890 + }, + { + "epoch": 0.837473385379702, + "grad_norm": 6.931346416473389, + "learning_rate": 9.916295244854508e-05, + "loss": 0.2837867021560669, + "step": 5900 + }, + { + "epoch": 0.8388928317955997, + "grad_norm": 7.611521244049072, + "learning_rate": 9.916153300212918e-05, + "loss": 0.315134072303772, + "step": 5910 + }, + { + "epoch": 0.8403122782114976, + "grad_norm": 7.071038722991943, + "learning_rate": 9.916011355571327e-05, + "loss": 0.3368415594100952, + "step": 5920 + }, + { + "epoch": 0.8417317246273953, + "grad_norm": 4.1825056076049805, + "learning_rate": 9.915869410929738e-05, + "loss": 0.3074488162994385, + "step": 5930 + }, + { + "epoch": 0.8431511710432931, + "grad_norm": 6.3160929679870605, + "learning_rate": 9.915727466288148e-05, + "loss": 0.3252119541168213, + "step": 5940 + }, + { + "epoch": 0.8445706174591909, + "grad_norm": 8.007182121276855, + "learning_rate": 9.915585521646559e-05, + "loss": 0.23286638259887696, + "step": 5950 + }, + { + "epoch": 0.8459900638750887, + "grad_norm": 7.93002986907959, + "learning_rate": 9.915443577004968e-05, + "loss": 0.2870266199111938, + "step": 5960 + }, + { + "epoch": 0.8474095102909865, + "grad_norm": 5.426539897918701, + "learning_rate": 9.915301632363379e-05, + "loss": 0.29859611988067625, + "step": 5970 + }, + { + "epoch": 0.8488289567068843, + "grad_norm": 4.294735908508301, + "learning_rate": 9.915159687721788e-05, + "loss": 0.24727118015289307, + "step": 5980 + }, + { + "epoch": 0.8502484031227822, + "grad_norm": 8.501158714294434, + "learning_rate": 9.9150177430802e-05, + "loss": 0.3406102657318115, + "step": 5990 + }, + { + "epoch": 0.8516678495386799, + "grad_norm": 8.125472068786621, + "learning_rate": 9.914875798438609e-05, + "loss": 0.3179450273513794, + "step": 6000 + }, + { + "epoch": 0.8516678495386799, + "eval_accuracy": 0.8626565778597317, + "eval_loss": 0.4082823693752289, + "eval_runtime": 33.3539, + "eval_samples_per_second": 471.52, + "eval_steps_per_second": 14.751, + "step": 6000 + }, + { + "epoch": 0.8530872959545777, + "grad_norm": 4.980500221252441, + "learning_rate": 9.914733853797019e-05, + "loss": 0.3588885307312012, + "step": 6010 + }, + { + "epoch": 0.8545067423704755, + "grad_norm": 5.385146617889404, + "learning_rate": 9.91459190915543e-05, + "loss": 0.28512775897979736, + "step": 6020 + }, + { + "epoch": 0.8559261887863733, + "grad_norm": 8.24423599243164, + "learning_rate": 9.91444996451384e-05, + "loss": 0.32922515869140623, + "step": 6030 + }, + { + "epoch": 0.8573456352022711, + "grad_norm": 6.568521499633789, + "learning_rate": 9.914308019872251e-05, + "loss": 0.24458625316619872, + "step": 6040 + }, + { + "epoch": 0.8587650816181689, + "grad_norm": 6.268226146697998, + "learning_rate": 9.914166075230661e-05, + "loss": 0.30663580894470216, + "step": 6050 + }, + { + "epoch": 0.8601845280340668, + "grad_norm": 5.911208152770996, + "learning_rate": 9.91402413058907e-05, + "loss": 0.38018484115600587, + "step": 6060 + }, + { + "epoch": 0.8616039744499645, + "grad_norm": 5.170897483825684, + "learning_rate": 9.91388218594748e-05, + "loss": 0.22591965198516845, + "step": 6070 + }, + { + "epoch": 0.8630234208658624, + "grad_norm": 5.716799736022949, + "learning_rate": 9.913740241305891e-05, + "loss": 0.2626305103302002, + "step": 6080 + }, + { + "epoch": 0.8644428672817601, + "grad_norm": 6.144148349761963, + "learning_rate": 9.913598296664301e-05, + "loss": 0.23459088802337646, + "step": 6090 + }, + { + "epoch": 0.8658623136976579, + "grad_norm": 8.506244659423828, + "learning_rate": 9.913456352022712e-05, + "loss": 0.36330761909484866, + "step": 6100 + }, + { + "epoch": 0.8672817601135557, + "grad_norm": 9.882643699645996, + "learning_rate": 9.913314407381122e-05, + "loss": 0.32826101779937744, + "step": 6110 + }, + { + "epoch": 0.8687012065294535, + "grad_norm": 8.62743091583252, + "learning_rate": 9.913172462739532e-05, + "loss": 0.30355727672576904, + "step": 6120 + }, + { + "epoch": 0.8701206529453513, + "grad_norm": 11.726634979248047, + "learning_rate": 9.913030518097943e-05, + "loss": 0.280806303024292, + "step": 6130 + }, + { + "epoch": 0.8715400993612491, + "grad_norm": 7.7827839851379395, + "learning_rate": 9.912888573456352e-05, + "loss": 0.3389289855957031, + "step": 6140 + }, + { + "epoch": 0.872959545777147, + "grad_norm": 12.07807731628418, + "learning_rate": 9.912746628814764e-05, + "loss": 0.31570281982421877, + "step": 6150 + }, + { + "epoch": 0.8743789921930447, + "grad_norm": 4.949673652648926, + "learning_rate": 9.912604684173173e-05, + "loss": 0.269368839263916, + "step": 6160 + }, + { + "epoch": 0.8757984386089425, + "grad_norm": 6.946098327636719, + "learning_rate": 9.912462739531583e-05, + "loss": 0.33236119747161863, + "step": 6170 + }, + { + "epoch": 0.8772178850248403, + "grad_norm": 7.137246131896973, + "learning_rate": 9.912320794889993e-05, + "loss": 0.3343817710876465, + "step": 6180 + }, + { + "epoch": 0.8786373314407381, + "grad_norm": 4.929990768432617, + "learning_rate": 9.912178850248404e-05, + "loss": 0.23963472843170167, + "step": 6190 + }, + { + "epoch": 0.8800567778566359, + "grad_norm": 10.46869945526123, + "learning_rate": 9.912036905606814e-05, + "loss": 0.2913534641265869, + "step": 6200 + }, + { + "epoch": 0.8814762242725337, + "grad_norm": 7.179393291473389, + "learning_rate": 9.911894960965225e-05, + "loss": 0.27806806564331055, + "step": 6210 + }, + { + "epoch": 0.8828956706884316, + "grad_norm": 5.430668830871582, + "learning_rate": 9.911753016323634e-05, + "loss": 0.2537125587463379, + "step": 6220 + }, + { + "epoch": 0.8843151171043293, + "grad_norm": 7.001239776611328, + "learning_rate": 9.911611071682044e-05, + "loss": 0.2821568489074707, + "step": 6230 + }, + { + "epoch": 0.8857345635202271, + "grad_norm": 10.218942642211914, + "learning_rate": 9.911469127040455e-05, + "loss": 0.30785112380981444, + "step": 6240 + }, + { + "epoch": 0.8871540099361249, + "grad_norm": 3.9179635047912598, + "learning_rate": 9.911327182398865e-05, + "loss": 0.3376051902770996, + "step": 6250 + }, + { + "epoch": 0.8885734563520227, + "grad_norm": 7.35114049911499, + "learning_rate": 9.911185237757276e-05, + "loss": 0.2029582977294922, + "step": 6260 + }, + { + "epoch": 0.8899929027679205, + "grad_norm": 7.477942943572998, + "learning_rate": 9.911043293115684e-05, + "loss": 0.31639838218688965, + "step": 6270 + }, + { + "epoch": 0.8914123491838183, + "grad_norm": 6.479630470275879, + "learning_rate": 9.910901348474096e-05, + "loss": 0.35874156951904296, + "step": 6280 + }, + { + "epoch": 0.8928317955997161, + "grad_norm": 5.139812469482422, + "learning_rate": 9.910759403832505e-05, + "loss": 0.23642609119415284, + "step": 6290 + }, + { + "epoch": 0.8942512420156139, + "grad_norm": 7.17330265045166, + "learning_rate": 9.910617459190916e-05, + "loss": 0.27939982414245607, + "step": 6300 + }, + { + "epoch": 0.8956706884315118, + "grad_norm": 8.804689407348633, + "learning_rate": 9.910475514549326e-05, + "loss": 0.3722469568252563, + "step": 6310 + }, + { + "epoch": 0.8970901348474095, + "grad_norm": 2.958435297012329, + "learning_rate": 9.910333569907736e-05, + "loss": 0.23576738834381103, + "step": 6320 + }, + { + "epoch": 0.8985095812633073, + "grad_norm": 10.53680419921875, + "learning_rate": 9.910191625266147e-05, + "loss": 0.4027998447418213, + "step": 6330 + }, + { + "epoch": 0.8999290276792051, + "grad_norm": 5.857926368713379, + "learning_rate": 9.910049680624557e-05, + "loss": 0.29457688331604004, + "step": 6340 + }, + { + "epoch": 0.9013484740951029, + "grad_norm": 1.7572773694992065, + "learning_rate": 9.909907735982968e-05, + "loss": 0.2572882890701294, + "step": 6350 + }, + { + "epoch": 0.9027679205110007, + "grad_norm": 4.274378299713135, + "learning_rate": 9.909765791341377e-05, + "loss": 0.23681292533874512, + "step": 6360 + }, + { + "epoch": 0.9041873669268985, + "grad_norm": 7.596087455749512, + "learning_rate": 9.909623846699787e-05, + "loss": 0.23812153339385986, + "step": 6370 + }, + { + "epoch": 0.9056068133427964, + "grad_norm": 5.59556770324707, + "learning_rate": 9.909481902058197e-05, + "loss": 0.29871695041656493, + "step": 6380 + }, + { + "epoch": 0.9070262597586941, + "grad_norm": 4.671100616455078, + "learning_rate": 9.909339957416608e-05, + "loss": 0.23768167495727538, + "step": 6390 + }, + { + "epoch": 0.9084457061745919, + "grad_norm": 6.55142068862915, + "learning_rate": 9.909198012775018e-05, + "loss": 0.2650206804275513, + "step": 6400 + }, + { + "epoch": 0.9098651525904897, + "grad_norm": 7.774087429046631, + "learning_rate": 9.909056068133429e-05, + "loss": 0.2898139238357544, + "step": 6410 + }, + { + "epoch": 0.9112845990063875, + "grad_norm": 6.386779308319092, + "learning_rate": 9.908914123491839e-05, + "loss": 0.26163647174835203, + "step": 6420 + }, + { + "epoch": 0.9127040454222853, + "grad_norm": 7.33029317855835, + "learning_rate": 9.908772178850248e-05, + "loss": 0.2447366952896118, + "step": 6430 + }, + { + "epoch": 0.9141234918381831, + "grad_norm": 10.35724925994873, + "learning_rate": 9.90863023420866e-05, + "loss": 0.2560460329055786, + "step": 6440 + }, + { + "epoch": 0.915542938254081, + "grad_norm": 9.2293062210083, + "learning_rate": 9.908488289567069e-05, + "loss": 0.3864759922027588, + "step": 6450 + }, + { + "epoch": 0.9169623846699787, + "grad_norm": 8.472285270690918, + "learning_rate": 9.90834634492548e-05, + "loss": 0.2888746976852417, + "step": 6460 + }, + { + "epoch": 0.9183818310858765, + "grad_norm": 6.22374153137207, + "learning_rate": 9.90820440028389e-05, + "loss": 0.2505399942398071, + "step": 6470 + }, + { + "epoch": 0.9198012775017743, + "grad_norm": 7.827479839324951, + "learning_rate": 9.9080624556423e-05, + "loss": 0.2327653408050537, + "step": 6480 + }, + { + "epoch": 0.9212207239176721, + "grad_norm": 7.873356819152832, + "learning_rate": 9.90792051100071e-05, + "loss": 0.2565167903900146, + "step": 6490 + }, + { + "epoch": 0.9226401703335699, + "grad_norm": 4.665884494781494, + "learning_rate": 9.90777856635912e-05, + "loss": 0.2404710292816162, + "step": 6500 + }, + { + "epoch": 0.9226401703335699, + "eval_accuracy": 0.9011890379601959, + "eval_loss": 0.29011303186416626, + "eval_runtime": 34.6022, + "eval_samples_per_second": 454.509, + "eval_steps_per_second": 14.219, + "step": 6500 + }, + { + "epoch": 0.9240596167494677, + "grad_norm": 7.10374641418457, + "learning_rate": 9.90763662171753e-05, + "loss": 0.28783435821533204, + "step": 6510 + }, + { + "epoch": 0.9254790631653655, + "grad_norm": 7.5799784660339355, + "learning_rate": 9.907494677075941e-05, + "loss": 0.3219441890716553, + "step": 6520 + }, + { + "epoch": 0.9268985095812633, + "grad_norm": 3.9083335399627686, + "learning_rate": 9.907352732434351e-05, + "loss": 0.2374324083328247, + "step": 6530 + }, + { + "epoch": 0.9283179559971612, + "grad_norm": 9.309243202209473, + "learning_rate": 9.907210787792761e-05, + "loss": 0.2314399242401123, + "step": 6540 + }, + { + "epoch": 0.9297374024130589, + "grad_norm": 5.650235176086426, + "learning_rate": 9.907068843151172e-05, + "loss": 0.2187626600265503, + "step": 6550 + }, + { + "epoch": 0.9311568488289567, + "grad_norm": 5.9835710525512695, + "learning_rate": 9.906926898509582e-05, + "loss": 0.27225399017333984, + "step": 6560 + }, + { + "epoch": 0.9325762952448545, + "grad_norm": 8.403820991516113, + "learning_rate": 9.906784953867993e-05, + "loss": 0.24051570892333984, + "step": 6570 + }, + { + "epoch": 0.9339957416607523, + "grad_norm": 5.456867218017578, + "learning_rate": 9.906643009226401e-05, + "loss": 0.229835844039917, + "step": 6580 + }, + { + "epoch": 0.9354151880766501, + "grad_norm": 11.34472942352295, + "learning_rate": 9.906501064584812e-05, + "loss": 0.28583712577819825, + "step": 6590 + }, + { + "epoch": 0.9368346344925479, + "grad_norm": 7.0680694580078125, + "learning_rate": 9.906359119943222e-05, + "loss": 0.28688597679138184, + "step": 6600 + }, + { + "epoch": 0.9382540809084458, + "grad_norm": 4.637568950653076, + "learning_rate": 9.906217175301633e-05, + "loss": 0.3234848976135254, + "step": 6610 + }, + { + "epoch": 0.9396735273243435, + "grad_norm": 4.935168743133545, + "learning_rate": 9.906075230660043e-05, + "loss": 0.2546673059463501, + "step": 6620 + }, + { + "epoch": 0.9410929737402413, + "grad_norm": 8.563390731811523, + "learning_rate": 9.905933286018453e-05, + "loss": 0.26501734256744386, + "step": 6630 + }, + { + "epoch": 0.9425124201561391, + "grad_norm": 8.05203914642334, + "learning_rate": 9.905791341376864e-05, + "loss": 0.19906221628189086, + "step": 6640 + }, + { + "epoch": 0.9439318665720369, + "grad_norm": 4.535382270812988, + "learning_rate": 9.905649396735273e-05, + "loss": 0.2355113744735718, + "step": 6650 + }, + { + "epoch": 0.9453513129879347, + "grad_norm": 5.967373371124268, + "learning_rate": 9.905507452093685e-05, + "loss": 0.2591426372528076, + "step": 6660 + }, + { + "epoch": 0.9467707594038325, + "grad_norm": 5.093105792999268, + "learning_rate": 9.905365507452094e-05, + "loss": 0.2508120536804199, + "step": 6670 + }, + { + "epoch": 0.9481902058197303, + "grad_norm": 6.775847911834717, + "learning_rate": 9.905223562810504e-05, + "loss": 0.2802272319793701, + "step": 6680 + }, + { + "epoch": 0.9496096522356281, + "grad_norm": 7.280439376831055, + "learning_rate": 9.905081618168914e-05, + "loss": 0.23689627647399902, + "step": 6690 + }, + { + "epoch": 0.9510290986515259, + "grad_norm": 7.68773078918457, + "learning_rate": 9.904939673527325e-05, + "loss": 0.2927251815795898, + "step": 6700 + }, + { + "epoch": 0.9524485450674237, + "grad_norm": 5.4808831214904785, + "learning_rate": 9.904797728885735e-05, + "loss": 0.28672428131103517, + "step": 6710 + }, + { + "epoch": 0.9538679914833215, + "grad_norm": 8.087321281433105, + "learning_rate": 9.904655784244146e-05, + "loss": 0.3129342794418335, + "step": 6720 + }, + { + "epoch": 0.9552874378992193, + "grad_norm": 2.7893686294555664, + "learning_rate": 9.904513839602555e-05, + "loss": 0.22520501613616944, + "step": 6730 + }, + { + "epoch": 0.9567068843151171, + "grad_norm": 10.040759086608887, + "learning_rate": 9.904371894960965e-05, + "loss": 0.2705253601074219, + "step": 6740 + }, + { + "epoch": 0.9581263307310149, + "grad_norm": 3.0198464393615723, + "learning_rate": 9.904229950319376e-05, + "loss": 0.27905032634735105, + "step": 6750 + }, + { + "epoch": 0.9595457771469127, + "grad_norm": 9.044099807739258, + "learning_rate": 9.904088005677786e-05, + "loss": 0.2549771547317505, + "step": 6760 + }, + { + "epoch": 0.9609652235628106, + "grad_norm": 3.4965715408325195, + "learning_rate": 9.903946061036197e-05, + "loss": 0.2617889165878296, + "step": 6770 + }, + { + "epoch": 0.9623846699787083, + "grad_norm": 4.959318161010742, + "learning_rate": 9.903804116394605e-05, + "loss": 0.24190716743469237, + "step": 6780 + }, + { + "epoch": 0.9638041163946061, + "grad_norm": 4.6404314041137695, + "learning_rate": 9.903662171753017e-05, + "loss": 0.29865779876708987, + "step": 6790 + }, + { + "epoch": 0.9652235628105039, + "grad_norm": 6.315147876739502, + "learning_rate": 9.903520227111426e-05, + "loss": 0.2937409162521362, + "step": 6800 + }, + { + "epoch": 0.9666430092264017, + "grad_norm": 6.294488906860352, + "learning_rate": 9.903378282469837e-05, + "loss": 0.28489468097686765, + "step": 6810 + }, + { + "epoch": 0.9680624556422995, + "grad_norm": 6.917492866516113, + "learning_rate": 9.903236337828248e-05, + "loss": 0.18736352920532226, + "step": 6820 + }, + { + "epoch": 0.9694819020581973, + "grad_norm": 6.20442533493042, + "learning_rate": 9.903094393186658e-05, + "loss": 0.24552693367004394, + "step": 6830 + }, + { + "epoch": 0.9709013484740951, + "grad_norm": 9.16247844696045, + "learning_rate": 9.902952448545068e-05, + "loss": 0.22968952655792235, + "step": 6840 + }, + { + "epoch": 0.9723207948899929, + "grad_norm": 8.185150146484375, + "learning_rate": 9.902810503903478e-05, + "loss": 0.25458450317382814, + "step": 6850 + }, + { + "epoch": 0.9737402413058907, + "grad_norm": 8.134267807006836, + "learning_rate": 9.902668559261889e-05, + "loss": 0.25451316833496096, + "step": 6860 + }, + { + "epoch": 0.9751596877217885, + "grad_norm": 12.39373779296875, + "learning_rate": 9.902526614620298e-05, + "loss": 0.2887612819671631, + "step": 6870 + }, + { + "epoch": 0.9765791341376863, + "grad_norm": 7.776149272918701, + "learning_rate": 9.90238466997871e-05, + "loss": 0.3695904970169067, + "step": 6880 + }, + { + "epoch": 0.9779985805535841, + "grad_norm": 6.241235256195068, + "learning_rate": 9.902242725337118e-05, + "loss": 0.26552643775939944, + "step": 6890 + }, + { + "epoch": 0.9794180269694819, + "grad_norm": 11.734026908874512, + "learning_rate": 9.902100780695529e-05, + "loss": 0.32755370140075685, + "step": 6900 + }, + { + "epoch": 0.9808374733853797, + "grad_norm": 6.049038887023926, + "learning_rate": 9.90195883605394e-05, + "loss": 0.22059807777404786, + "step": 6910 + }, + { + "epoch": 0.9822569198012775, + "grad_norm": 4.156560897827148, + "learning_rate": 9.901831085876509e-05, + "loss": 0.3507907629013062, + "step": 6920 + }, + { + "epoch": 0.9836763662171752, + "grad_norm": 4.315751552581787, + "learning_rate": 9.901689141234918e-05, + "loss": 0.25436155796051024, + "step": 6930 + }, + { + "epoch": 0.9850958126330731, + "grad_norm": 6.76514196395874, + "learning_rate": 9.90154719659333e-05, + "loss": 0.24831132888793944, + "step": 6940 + }, + { + "epoch": 0.9865152590489709, + "grad_norm": 6.7387261390686035, + "learning_rate": 9.901405251951739e-05, + "loss": 0.23655142784118652, + "step": 6950 + }, + { + "epoch": 0.9879347054648687, + "grad_norm": 3.8014583587646484, + "learning_rate": 9.901263307310149e-05, + "loss": 0.2415374994277954, + "step": 6960 + }, + { + "epoch": 0.9893541518807665, + "grad_norm": 5.04398775100708, + "learning_rate": 9.90112136266856e-05, + "loss": 0.23744730949401854, + "step": 6970 + }, + { + "epoch": 0.9907735982966643, + "grad_norm": 5.434844017028809, + "learning_rate": 9.90097941802697e-05, + "loss": 0.24512255191802979, + "step": 6980 + }, + { + "epoch": 0.9921930447125621, + "grad_norm": 5.528685092926025, + "learning_rate": 9.900837473385381e-05, + "loss": 0.2296142578125, + "step": 6990 + }, + { + "epoch": 0.99361249112846, + "grad_norm": 5.2856526374816895, + "learning_rate": 9.90069552874379e-05, + "loss": 0.2707331418991089, + "step": 7000 + }, + { + "epoch": 0.99361249112846, + "eval_accuracy": 0.9093914923380174, + "eval_loss": 0.27007216215133667, + "eval_runtime": 33.3907, + "eval_samples_per_second": 470.999, + "eval_steps_per_second": 14.735, + "step": 7000 + }, + { + "epoch": 0.9950319375443577, + "grad_norm": 8.654793739318848, + "learning_rate": 9.9005535841022e-05, + "loss": 0.34286386966705323, + "step": 7010 + }, + { + "epoch": 0.9964513839602555, + "grad_norm": 3.311750888824463, + "learning_rate": 9.90041163946061e-05, + "loss": 0.269917893409729, + "step": 7020 + }, + { + "epoch": 0.9978708303761533, + "grad_norm": 6.643321514129639, + "learning_rate": 9.900269694819021e-05, + "loss": 0.2132892370223999, + "step": 7030 + }, + { + "epoch": 0.9992902767920511, + "grad_norm": 10.397172927856445, + "learning_rate": 9.900127750177431e-05, + "loss": 0.2613171339035034, + "step": 7040 + }, + { + "epoch": 1.000709723207949, + "grad_norm": 6.357808589935303, + "learning_rate": 9.899985805535842e-05, + "loss": 0.2258657455444336, + "step": 7050 + }, + { + "epoch": 1.0021291696238468, + "grad_norm": 6.077082633972168, + "learning_rate": 9.899843860894252e-05, + "loss": 0.20697100162506105, + "step": 7060 + }, + { + "epoch": 1.0035486160397444, + "grad_norm": 12.1661376953125, + "learning_rate": 9.899701916252661e-05, + "loss": 0.1927890658378601, + "step": 7070 + }, + { + "epoch": 1.0049680624556423, + "grad_norm": 4.968541145324707, + "learning_rate": 9.899559971611073e-05, + "loss": 0.23719356060028077, + "step": 7080 + }, + { + "epoch": 1.0063875088715402, + "grad_norm": 8.79593563079834, + "learning_rate": 9.899418026969482e-05, + "loss": 0.18882639408111573, + "step": 7090 + }, + { + "epoch": 1.0078069552874378, + "grad_norm": 5.142887115478516, + "learning_rate": 9.899276082327893e-05, + "loss": 0.2634677171707153, + "step": 7100 + }, + { + "epoch": 1.0092264017033357, + "grad_norm": 8.761039733886719, + "learning_rate": 9.899134137686302e-05, + "loss": 0.321915602684021, + "step": 7110 + }, + { + "epoch": 1.0106458481192335, + "grad_norm": 3.3865628242492676, + "learning_rate": 9.898992193044713e-05, + "loss": 0.23035690784454346, + "step": 7120 + }, + { + "epoch": 1.0120652945351314, + "grad_norm": 5.229470729827881, + "learning_rate": 9.898850248403123e-05, + "loss": 0.23260829448699952, + "step": 7130 + }, + { + "epoch": 1.013484740951029, + "grad_norm": 6.637743949890137, + "learning_rate": 9.898708303761534e-05, + "loss": 0.29780044555664065, + "step": 7140 + }, + { + "epoch": 1.014904187366927, + "grad_norm": 5.488855838775635, + "learning_rate": 9.898566359119943e-05, + "loss": 0.17786208391189576, + "step": 7150 + }, + { + "epoch": 1.0163236337828248, + "grad_norm": 3.6873295307159424, + "learning_rate": 9.898424414478355e-05, + "loss": 0.16665832996368407, + "step": 7160 + }, + { + "epoch": 1.0177430801987224, + "grad_norm": 3.507009267807007, + "learning_rate": 9.898282469836764e-05, + "loss": 0.2571221351623535, + "step": 7170 + }, + { + "epoch": 1.0191625266146203, + "grad_norm": 3.279927968978882, + "learning_rate": 9.898140525195174e-05, + "loss": 0.2422633171081543, + "step": 7180 + }, + { + "epoch": 1.0205819730305181, + "grad_norm": 7.186861991882324, + "learning_rate": 9.897998580553585e-05, + "loss": 0.2877654552459717, + "step": 7190 + }, + { + "epoch": 1.022001419446416, + "grad_norm": 8.821130752563477, + "learning_rate": 9.897856635911995e-05, + "loss": 0.21563093662261962, + "step": 7200 + }, + { + "epoch": 1.0234208658623136, + "grad_norm": 1.849163293838501, + "learning_rate": 9.897714691270406e-05, + "loss": 0.21513009071350098, + "step": 7210 + }, + { + "epoch": 1.0248403122782115, + "grad_norm": 7.898414611816406, + "learning_rate": 9.897572746628814e-05, + "loss": 0.24002442359924317, + "step": 7220 + }, + { + "epoch": 1.0262597586941093, + "grad_norm": 8.41958236694336, + "learning_rate": 9.897430801987225e-05, + "loss": 0.22358598709106445, + "step": 7230 + }, + { + "epoch": 1.027679205110007, + "grad_norm": 5.978959560394287, + "learning_rate": 9.897288857345635e-05, + "loss": 0.24321112632751465, + "step": 7240 + }, + { + "epoch": 1.0290986515259049, + "grad_norm": 7.758601665496826, + "learning_rate": 9.897146912704046e-05, + "loss": 0.2519962310791016, + "step": 7250 + }, + { + "epoch": 1.0305180979418027, + "grad_norm": 6.9067487716674805, + "learning_rate": 9.897004968062456e-05, + "loss": 0.22714946269989014, + "step": 7260 + }, + { + "epoch": 1.0319375443577006, + "grad_norm": 7.974116802215576, + "learning_rate": 9.896863023420866e-05, + "loss": 0.22177364826202392, + "step": 7270 + }, + { + "epoch": 1.0333569907735982, + "grad_norm": 2.706422805786133, + "learning_rate": 9.896721078779277e-05, + "loss": 0.19734153747558594, + "step": 7280 + }, + { + "epoch": 1.034776437189496, + "grad_norm": 10.539275169372559, + "learning_rate": 9.896579134137687e-05, + "loss": 0.2604410648345947, + "step": 7290 + }, + { + "epoch": 1.036195883605394, + "grad_norm": 6.023902893066406, + "learning_rate": 9.896437189496098e-05, + "loss": 0.23188574314117433, + "step": 7300 + }, + { + "epoch": 1.0376153300212918, + "grad_norm": 4.0170512199401855, + "learning_rate": 9.896295244854507e-05, + "loss": 0.20175492763519287, + "step": 7310 + }, + { + "epoch": 1.0390347764371894, + "grad_norm": 4.9612579345703125, + "learning_rate": 9.896153300212917e-05, + "loss": 0.2120590925216675, + "step": 7320 + }, + { + "epoch": 1.0404542228530873, + "grad_norm": 4.898397922515869, + "learning_rate": 9.896011355571327e-05, + "loss": 0.22397477626800538, + "step": 7330 + }, + { + "epoch": 1.0418736692689852, + "grad_norm": 7.394660472869873, + "learning_rate": 9.895869410929738e-05, + "loss": 0.2079904556274414, + "step": 7340 + }, + { + "epoch": 1.0432931156848828, + "grad_norm": 3.7839152812957764, + "learning_rate": 9.895727466288148e-05, + "loss": 0.1861090302467346, + "step": 7350 + }, + { + "epoch": 1.0447125621007807, + "grad_norm": 6.4003496170043945, + "learning_rate": 9.895585521646559e-05, + "loss": 0.21509413719177245, + "step": 7360 + }, + { + "epoch": 1.0461320085166785, + "grad_norm": 5.966845989227295, + "learning_rate": 9.895443577004969e-05, + "loss": 0.22056474685668945, + "step": 7370 + }, + { + "epoch": 1.0475514549325764, + "grad_norm": 3.580226182937622, + "learning_rate": 9.895301632363378e-05, + "loss": 0.2572075128555298, + "step": 7380 + }, + { + "epoch": 1.048970901348474, + "grad_norm": 7.922166347503662, + "learning_rate": 9.89515968772179e-05, + "loss": 0.26929004192352296, + "step": 7390 + }, + { + "epoch": 1.050390347764372, + "grad_norm": 8.884166717529297, + "learning_rate": 9.895017743080199e-05, + "loss": 0.23953988552093505, + "step": 7400 + }, + { + "epoch": 1.0518097941802698, + "grad_norm": 13.472792625427246, + "learning_rate": 9.89487579843861e-05, + "loss": 0.26428995132446287, + "step": 7410 + }, + { + "epoch": 1.0532292405961674, + "grad_norm": 5.455354690551758, + "learning_rate": 9.894733853797019e-05, + "loss": 0.22658278942108154, + "step": 7420 + }, + { + "epoch": 1.0546486870120653, + "grad_norm": 12.143173217773438, + "learning_rate": 9.89459190915543e-05, + "loss": 0.2838724136352539, + "step": 7430 + }, + { + "epoch": 1.0560681334279631, + "grad_norm": 12.741036415100098, + "learning_rate": 9.89444996451384e-05, + "loss": 0.22514543533325196, + "step": 7440 + }, + { + "epoch": 1.057487579843861, + "grad_norm": 3.3944201469421387, + "learning_rate": 9.89430801987225e-05, + "loss": 0.2505282163619995, + "step": 7450 + }, + { + "epoch": 1.0589070262597586, + "grad_norm": 4.490118503570557, + "learning_rate": 9.89416607523066e-05, + "loss": 0.24113750457763672, + "step": 7460 + }, + { + "epoch": 1.0603264726756565, + "grad_norm": 3.8860394954681396, + "learning_rate": 9.89402413058907e-05, + "loss": 0.19650124311447142, + "step": 7470 + }, + { + "epoch": 1.0617459190915544, + "grad_norm": 8.089933395385742, + "learning_rate": 9.893882185947481e-05, + "loss": 0.20081098079681398, + "step": 7480 + }, + { + "epoch": 1.063165365507452, + "grad_norm": 5.854043483734131, + "learning_rate": 9.893740241305891e-05, + "loss": 0.19387896060943605, + "step": 7490 + }, + { + "epoch": 1.0645848119233499, + "grad_norm": 3.3195252418518066, + "learning_rate": 9.893598296664302e-05, + "loss": 0.1918407201766968, + "step": 7500 + }, + { + "epoch": 1.0645848119233499, + "eval_accuracy": 0.9091371526673873, + "eval_loss": 0.25946471095085144, + "eval_runtime": 32.8002, + "eval_samples_per_second": 479.478, + "eval_steps_per_second": 15.0, + "step": 7500 + }, + { + "epoch": 1.0660042583392477, + "grad_norm": 7.044492244720459, + "learning_rate": 9.893456352022712e-05, + "loss": 0.18088626861572266, + "step": 7510 + }, + { + "epoch": 1.0674237047551456, + "grad_norm": 2.1477725505828857, + "learning_rate": 9.893314407381123e-05, + "loss": 0.25041606426239016, + "step": 7520 + }, + { + "epoch": 1.0688431511710432, + "grad_norm": 5.232922077178955, + "learning_rate": 9.893172462739531e-05, + "loss": 0.13164312839508058, + "step": 7530 + }, + { + "epoch": 1.070262597586941, + "grad_norm": 7.097192764282227, + "learning_rate": 9.893030518097942e-05, + "loss": 0.2210529088973999, + "step": 7540 + }, + { + "epoch": 1.071682044002839, + "grad_norm": 6.555529594421387, + "learning_rate": 9.892888573456352e-05, + "loss": 0.22583472728729248, + "step": 7550 + }, + { + "epoch": 1.0731014904187366, + "grad_norm": 4.672628879547119, + "learning_rate": 9.892746628814763e-05, + "loss": 0.2420278787612915, + "step": 7560 + }, + { + "epoch": 1.0745209368346345, + "grad_norm": 5.684006690979004, + "learning_rate": 9.892604684173174e-05, + "loss": 0.16603726148605347, + "step": 7570 + }, + { + "epoch": 1.0759403832505323, + "grad_norm": 8.538924217224121, + "learning_rate": 9.892462739531582e-05, + "loss": 0.22756731510162354, + "step": 7580 + }, + { + "epoch": 1.0773598296664302, + "grad_norm": 10.23405647277832, + "learning_rate": 9.892320794889994e-05, + "loss": 0.17195621728897095, + "step": 7590 + }, + { + "epoch": 1.0787792760823278, + "grad_norm": 3.4394562244415283, + "learning_rate": 9.892178850248403e-05, + "loss": 0.1631350874900818, + "step": 7600 + }, + { + "epoch": 1.0801987224982257, + "grad_norm": 9.240316390991211, + "learning_rate": 9.892036905606814e-05, + "loss": 0.2647270917892456, + "step": 7610 + }, + { + "epoch": 1.0816181689141235, + "grad_norm": 11.555622100830078, + "learning_rate": 9.891894960965224e-05, + "loss": 0.26429762840271, + "step": 7620 + }, + { + "epoch": 1.0830376153300212, + "grad_norm": 2.4831769466400146, + "learning_rate": 9.891753016323634e-05, + "loss": 0.29258711338043214, + "step": 7630 + }, + { + "epoch": 1.084457061745919, + "grad_norm": 4.935022830963135, + "learning_rate": 9.891611071682044e-05, + "loss": 0.21570188999176027, + "step": 7640 + }, + { + "epoch": 1.085876508161817, + "grad_norm": 11.602439880371094, + "learning_rate": 9.891469127040455e-05, + "loss": 0.32711737155914306, + "step": 7650 + }, + { + "epoch": 1.0872959545777148, + "grad_norm": 6.064338207244873, + "learning_rate": 9.891327182398866e-05, + "loss": 0.226470947265625, + "step": 7660 + }, + { + "epoch": 1.0887154009936124, + "grad_norm": 5.629254341125488, + "learning_rate": 9.891185237757276e-05, + "loss": 0.1874476909637451, + "step": 7670 + }, + { + "epoch": 1.0901348474095103, + "grad_norm": 6.994508743286133, + "learning_rate": 9.891043293115685e-05, + "loss": 0.2323138952255249, + "step": 7680 + }, + { + "epoch": 1.0915542938254081, + "grad_norm": 7.654874324798584, + "learning_rate": 9.890901348474095e-05, + "loss": 0.267806077003479, + "step": 7690 + }, + { + "epoch": 1.0929737402413058, + "grad_norm": 2.5339603424072266, + "learning_rate": 9.890759403832506e-05, + "loss": 0.17415390014648438, + "step": 7700 + }, + { + "epoch": 1.0943931866572036, + "grad_norm": 9.036078453063965, + "learning_rate": 9.890617459190916e-05, + "loss": 0.26232335567474363, + "step": 7710 + }, + { + "epoch": 1.0958126330731015, + "grad_norm": 8.1493558883667, + "learning_rate": 9.890475514549327e-05, + "loss": 0.26018438339233396, + "step": 7720 + }, + { + "epoch": 1.0972320794889994, + "grad_norm": 4.394131660461426, + "learning_rate": 9.890333569907735e-05, + "loss": 0.20033717155456543, + "step": 7730 + }, + { + "epoch": 1.098651525904897, + "grad_norm": 7.311230659484863, + "learning_rate": 9.890191625266146e-05, + "loss": 0.2336057662963867, + "step": 7740 + }, + { + "epoch": 1.1000709723207949, + "grad_norm": 3.716153621673584, + "learning_rate": 9.890049680624556e-05, + "loss": 0.21649951934814454, + "step": 7750 + }, + { + "epoch": 1.1014904187366927, + "grad_norm": 5.747766017913818, + "learning_rate": 9.889907735982967e-05, + "loss": 0.21761865615844728, + "step": 7760 + }, + { + "epoch": 1.1029098651525904, + "grad_norm": 2.6889519691467285, + "learning_rate": 9.889765791341378e-05, + "loss": 0.2489168405532837, + "step": 7770 + }, + { + "epoch": 1.1043293115684882, + "grad_norm": 6.918911933898926, + "learning_rate": 9.889623846699787e-05, + "loss": 0.22506451606750488, + "step": 7780 + }, + { + "epoch": 1.105748757984386, + "grad_norm": 6.129018783569336, + "learning_rate": 9.889481902058198e-05, + "loss": 0.22557535171508789, + "step": 7790 + }, + { + "epoch": 1.107168204400284, + "grad_norm": 6.179121017456055, + "learning_rate": 9.889339957416608e-05, + "loss": 0.20877602100372314, + "step": 7800 + }, + { + "epoch": 1.1085876508161816, + "grad_norm": 4.490073204040527, + "learning_rate": 9.889198012775019e-05, + "loss": 0.24456796646118165, + "step": 7810 + }, + { + "epoch": 1.1100070972320795, + "grad_norm": 11.580991744995117, + "learning_rate": 9.889056068133428e-05, + "loss": 0.2545257806777954, + "step": 7820 + }, + { + "epoch": 1.1114265436479773, + "grad_norm": 5.933578968048096, + "learning_rate": 9.88891412349184e-05, + "loss": 0.20906269550323486, + "step": 7830 + }, + { + "epoch": 1.1128459900638752, + "grad_norm": 8.964847564697266, + "learning_rate": 9.888772178850248e-05, + "loss": 0.21426281929016114, + "step": 7840 + }, + { + "epoch": 1.1142654364797728, + "grad_norm": 3.047978401184082, + "learning_rate": 9.888630234208659e-05, + "loss": 0.20127902030944825, + "step": 7850 + }, + { + "epoch": 1.1156848828956707, + "grad_norm": 11.52719783782959, + "learning_rate": 9.88848828956707e-05, + "loss": 0.23301458358764648, + "step": 7860 + }, + { + "epoch": 1.1171043293115686, + "grad_norm": 4.898934364318848, + "learning_rate": 9.88834634492548e-05, + "loss": 0.26660704612731934, + "step": 7870 + }, + { + "epoch": 1.1185237757274662, + "grad_norm": 6.535075664520264, + "learning_rate": 9.888204400283891e-05, + "loss": 0.2355792284011841, + "step": 7880 + }, + { + "epoch": 1.119943222143364, + "grad_norm": 6.307318687438965, + "learning_rate": 9.888062455642299e-05, + "loss": 0.20682175159454347, + "step": 7890 + }, + { + "epoch": 1.121362668559262, + "grad_norm": 3.9123454093933105, + "learning_rate": 9.88792051100071e-05, + "loss": 0.3205126762390137, + "step": 7900 + }, + { + "epoch": 1.1227821149751598, + "grad_norm": 9.152158737182617, + "learning_rate": 9.88777856635912e-05, + "loss": 0.2413860082626343, + "step": 7910 + }, + { + "epoch": 1.1242015613910574, + "grad_norm": 9.178197860717773, + "learning_rate": 9.887636621717531e-05, + "loss": 0.32107110023498536, + "step": 7920 + }, + { + "epoch": 1.1256210078069553, + "grad_norm": 8.382686614990234, + "learning_rate": 9.887494677075941e-05, + "loss": 0.26145339012145996, + "step": 7930 + }, + { + "epoch": 1.1270404542228531, + "grad_norm": 6.847768306732178, + "learning_rate": 9.88735273243435e-05, + "loss": 0.21859989166259766, + "step": 7940 + }, + { + "epoch": 1.1284599006387508, + "grad_norm": 3.770111560821533, + "learning_rate": 9.887210787792762e-05, + "loss": 0.13420095443725585, + "step": 7950 + }, + { + "epoch": 1.1298793470546487, + "grad_norm": 7.4002509117126465, + "learning_rate": 9.887068843151171e-05, + "loss": 0.18695064783096313, + "step": 7960 + }, + { + "epoch": 1.1312987934705465, + "grad_norm": 4.0712761878967285, + "learning_rate": 9.886926898509583e-05, + "loss": 0.20656538009643555, + "step": 7970 + }, + { + "epoch": 1.1327182398864444, + "grad_norm": 4.4091291427612305, + "learning_rate": 9.886784953867992e-05, + "loss": 0.28663394451141355, + "step": 7980 + }, + { + "epoch": 1.134137686302342, + "grad_norm": 10.553000450134277, + "learning_rate": 9.886643009226402e-05, + "loss": 0.319093132019043, + "step": 7990 + }, + { + "epoch": 1.1355571327182399, + "grad_norm": 6.1367597579956055, + "learning_rate": 9.886501064584812e-05, + "loss": 0.19342881441116333, + "step": 8000 + }, + { + "epoch": 1.1355571327182399, + "eval_accuracy": 0.9207731925987156, + "eval_loss": 0.24032267928123474, + "eval_runtime": 32.4949, + "eval_samples_per_second": 483.984, + "eval_steps_per_second": 15.141, + "step": 8000 + }, + { + "epoch": 1.1369765791341377, + "grad_norm": 2.405918598175049, + "learning_rate": 9.886359119943223e-05, + "loss": 0.22856481075286866, + "step": 8010 + }, + { + "epoch": 1.1383960255500356, + "grad_norm": 3.4976019859313965, + "learning_rate": 9.886217175301633e-05, + "loss": 0.18118438720703126, + "step": 8020 + }, + { + "epoch": 1.1398154719659332, + "grad_norm": 6.432300567626953, + "learning_rate": 9.886075230660044e-05, + "loss": 0.21989898681640624, + "step": 8030 + }, + { + "epoch": 1.141234918381831, + "grad_norm": 8.299015045166016, + "learning_rate": 9.885933286018453e-05, + "loss": 0.18632423877716064, + "step": 8040 + }, + { + "epoch": 1.142654364797729, + "grad_norm": 4.741350173950195, + "learning_rate": 9.885791341376863e-05, + "loss": 0.3003889799118042, + "step": 8050 + }, + { + "epoch": 1.1440738112136266, + "grad_norm": 2.561021327972412, + "learning_rate": 9.885649396735274e-05, + "loss": 0.20989477634429932, + "step": 8060 + }, + { + "epoch": 1.1454932576295245, + "grad_norm": 4.419784069061279, + "learning_rate": 9.885507452093684e-05, + "loss": 0.20898723602294922, + "step": 8070 + }, + { + "epoch": 1.1469127040454223, + "grad_norm": 4.329728603363037, + "learning_rate": 9.885365507452095e-05, + "loss": 0.191938316822052, + "step": 8080 + }, + { + "epoch": 1.1483321504613202, + "grad_norm": 5.096283912658691, + "learning_rate": 9.885223562810503e-05, + "loss": 0.21612834930419922, + "step": 8090 + }, + { + "epoch": 1.1497515968772178, + "grad_norm": 7.623912811279297, + "learning_rate": 9.885081618168915e-05, + "loss": 0.2056267261505127, + "step": 8100 + }, + { + "epoch": 1.1511710432931157, + "grad_norm": 5.211782455444336, + "learning_rate": 9.884939673527324e-05, + "loss": 0.2458388090133667, + "step": 8110 + }, + { + "epoch": 1.1525904897090136, + "grad_norm": 4.73144006729126, + "learning_rate": 9.884797728885735e-05, + "loss": 0.2795632123947144, + "step": 8120 + }, + { + "epoch": 1.1540099361249112, + "grad_norm": 4.658935546875, + "learning_rate": 9.884655784244145e-05, + "loss": 0.19132717847824096, + "step": 8130 + }, + { + "epoch": 1.155429382540809, + "grad_norm": 2.4226841926574707, + "learning_rate": 9.884513839602555e-05, + "loss": 0.2345660448074341, + "step": 8140 + }, + { + "epoch": 1.156848828956707, + "grad_norm": 4.741151809692383, + "learning_rate": 9.884371894960966e-05, + "loss": 0.16295211315155028, + "step": 8150 + }, + { + "epoch": 1.1582682753726048, + "grad_norm": 5.364559173583984, + "learning_rate": 9.884229950319376e-05, + "loss": 0.32001848220825196, + "step": 8160 + }, + { + "epoch": 1.1596877217885024, + "grad_norm": 5.700736045837402, + "learning_rate": 9.884088005677787e-05, + "loss": 0.2149799346923828, + "step": 8170 + }, + { + "epoch": 1.1611071682044003, + "grad_norm": 8.003674507141113, + "learning_rate": 9.883946061036197e-05, + "loss": 0.1882821202278137, + "step": 8180 + }, + { + "epoch": 1.1625266146202982, + "grad_norm": 4.5582122802734375, + "learning_rate": 9.883804116394608e-05, + "loss": 0.21344914436340331, + "step": 8190 + }, + { + "epoch": 1.1639460610361958, + "grad_norm": 7.819937229156494, + "learning_rate": 9.883662171753016e-05, + "loss": 0.20212192535400392, + "step": 8200 + }, + { + "epoch": 1.1653655074520937, + "grad_norm": 4.706314563751221, + "learning_rate": 9.883520227111427e-05, + "loss": 0.23133435249328613, + "step": 8210 + }, + { + "epoch": 1.1667849538679915, + "grad_norm": 6.7971343994140625, + "learning_rate": 9.883378282469837e-05, + "loss": 0.2259516477584839, + "step": 8220 + }, + { + "epoch": 1.1682044002838894, + "grad_norm": 6.324117183685303, + "learning_rate": 9.883236337828248e-05, + "loss": 0.2526458024978638, + "step": 8230 + }, + { + "epoch": 1.169623846699787, + "grad_norm": 11.824000358581543, + "learning_rate": 9.883094393186658e-05, + "loss": 0.28786749839782716, + "step": 8240 + }, + { + "epoch": 1.171043293115685, + "grad_norm": 6.5561089515686035, + "learning_rate": 9.882952448545067e-05, + "loss": 0.2411046028137207, + "step": 8250 + }, + { + "epoch": 1.1724627395315828, + "grad_norm": 9.257662773132324, + "learning_rate": 9.882810503903479e-05, + "loss": 0.2078631639480591, + "step": 8260 + }, + { + "epoch": 1.1738821859474804, + "grad_norm": 6.388674736022949, + "learning_rate": 9.882668559261888e-05, + "loss": 0.2299574851989746, + "step": 8270 + }, + { + "epoch": 1.1753016323633783, + "grad_norm": 5.7360992431640625, + "learning_rate": 9.8825266146203e-05, + "loss": 0.18881726264953613, + "step": 8280 + }, + { + "epoch": 1.1767210787792761, + "grad_norm": 6.240981578826904, + "learning_rate": 9.882384669978709e-05, + "loss": 0.1505158066749573, + "step": 8290 + }, + { + "epoch": 1.178140525195174, + "grad_norm": 5.832661151885986, + "learning_rate": 9.882242725337119e-05, + "loss": 0.22867400646209718, + "step": 8300 + }, + { + "epoch": 1.1795599716110716, + "grad_norm": 10.773929595947266, + "learning_rate": 9.882100780695529e-05, + "loss": 0.1888264536857605, + "step": 8310 + }, + { + "epoch": 1.1809794180269695, + "grad_norm": 3.489490509033203, + "learning_rate": 9.88195883605394e-05, + "loss": 0.1748473525047302, + "step": 8320 + }, + { + "epoch": 1.1823988644428673, + "grad_norm": 5.332619667053223, + "learning_rate": 9.88181689141235e-05, + "loss": 0.20995078086853028, + "step": 8330 + }, + { + "epoch": 1.183818310858765, + "grad_norm": 4.1643147468566895, + "learning_rate": 9.88167494677076e-05, + "loss": 0.17949424982070922, + "step": 8340 + }, + { + "epoch": 1.1852377572746629, + "grad_norm": 5.263898849487305, + "learning_rate": 9.88153300212917e-05, + "loss": 0.17099075317382811, + "step": 8350 + }, + { + "epoch": 1.1866572036905607, + "grad_norm": 10.222403526306152, + "learning_rate": 9.88139105748758e-05, + "loss": 0.163385272026062, + "step": 8360 + }, + { + "epoch": 1.1880766501064586, + "grad_norm": 4.657668113708496, + "learning_rate": 9.881249112845991e-05, + "loss": 0.2960475444793701, + "step": 8370 + }, + { + "epoch": 1.1894960965223562, + "grad_norm": 4.420619964599609, + "learning_rate": 9.881107168204401e-05, + "loss": 0.1871565818786621, + "step": 8380 + }, + { + "epoch": 1.190915542938254, + "grad_norm": 6.741722583770752, + "learning_rate": 9.880965223562812e-05, + "loss": 0.18152236938476562, + "step": 8390 + }, + { + "epoch": 1.192334989354152, + "grad_norm": 7.203516483306885, + "learning_rate": 9.88082327892122e-05, + "loss": 0.21214077472686768, + "step": 8400 + }, + { + "epoch": 1.1937544357700496, + "grad_norm": 4.927282810211182, + "learning_rate": 9.880681334279631e-05, + "loss": 0.2104212999343872, + "step": 8410 + }, + { + "epoch": 1.1951738821859474, + "grad_norm": 5.8592023849487305, + "learning_rate": 9.880539389638041e-05, + "loss": 0.2139230728149414, + "step": 8420 + }, + { + "epoch": 1.1965933286018453, + "grad_norm": 7.09868860244751, + "learning_rate": 9.880397444996452e-05, + "loss": 0.1821369171142578, + "step": 8430 + }, + { + "epoch": 1.1980127750177432, + "grad_norm": 3.22680401802063, + "learning_rate": 9.880255500354862e-05, + "loss": 0.20524086952209472, + "step": 8440 + }, + { + "epoch": 1.1994322214336408, + "grad_norm": 6.953636169433594, + "learning_rate": 9.880113555713272e-05, + "loss": 0.12908190488815308, + "step": 8450 + }, + { + "epoch": 1.2008516678495387, + "grad_norm": 3.305361032485962, + "learning_rate": 9.879971611071683e-05, + "loss": 0.21676282882690429, + "step": 8460 + }, + { + "epoch": 1.2022711142654365, + "grad_norm": 5.03612756729126, + "learning_rate": 9.879829666430093e-05, + "loss": 0.21339573860168456, + "step": 8470 + }, + { + "epoch": 1.2036905606813342, + "grad_norm": 8.03529167175293, + "learning_rate": 9.879687721788504e-05, + "loss": 0.22714192867279054, + "step": 8480 + }, + { + "epoch": 1.205110007097232, + "grad_norm": 11.267200469970703, + "learning_rate": 9.879545777146913e-05, + "loss": 0.2318274736404419, + "step": 8490 + }, + { + "epoch": 1.20652945351313, + "grad_norm": 4.298351764678955, + "learning_rate": 9.879403832505323e-05, + "loss": 0.13804138898849488, + "step": 8500 + }, + { + "epoch": 1.20652945351313, + "eval_accuracy": 0.9154320595154829, + "eval_loss": 0.2389156073331833, + "eval_runtime": 32.8287, + "eval_samples_per_second": 479.062, + "eval_steps_per_second": 14.987, + "step": 8500 + }, + { + "epoch": 1.2079488999290278, + "grad_norm": 7.828441619873047, + "learning_rate": 9.879261887863733e-05, + "loss": 0.22812976837158203, + "step": 8510 + }, + { + "epoch": 1.2093683463449254, + "grad_norm": 6.791322708129883, + "learning_rate": 9.879119943222144e-05, + "loss": 0.2314612865447998, + "step": 8520 + }, + { + "epoch": 1.2107877927608233, + "grad_norm": 2.5891473293304443, + "learning_rate": 9.878977998580554e-05, + "loss": 0.2156294107437134, + "step": 8530 + }, + { + "epoch": 1.2122072391767211, + "grad_norm": 8.005664825439453, + "learning_rate": 9.878836053938965e-05, + "loss": 0.2180927038192749, + "step": 8540 + }, + { + "epoch": 1.2136266855926188, + "grad_norm": 4.849853515625, + "learning_rate": 9.878694109297374e-05, + "loss": 0.2122575521469116, + "step": 8550 + }, + { + "epoch": 1.2150461320085166, + "grad_norm": 2.7616207599639893, + "learning_rate": 9.878552164655784e-05, + "loss": 0.17834146022796632, + "step": 8560 + }, + { + "epoch": 1.2164655784244145, + "grad_norm": 5.352903366088867, + "learning_rate": 9.878410220014195e-05, + "loss": 0.13497724533081054, + "step": 8570 + }, + { + "epoch": 1.2178850248403124, + "grad_norm": 8.255563735961914, + "learning_rate": 9.878268275372605e-05, + "loss": 0.19454526901245117, + "step": 8580 + }, + { + "epoch": 1.21930447125621, + "grad_norm": 3.5060651302337646, + "learning_rate": 9.878126330731016e-05, + "loss": 0.23703739643096924, + "step": 8590 + }, + { + "epoch": 1.2207239176721079, + "grad_norm": 5.917641639709473, + "learning_rate": 9.877984386089426e-05, + "loss": 0.1788935661315918, + "step": 8600 + }, + { + "epoch": 1.2221433640880057, + "grad_norm": 7.5726542472839355, + "learning_rate": 9.877842441447836e-05, + "loss": 0.1879301905632019, + "step": 8610 + }, + { + "epoch": 1.2235628105039034, + "grad_norm": 6.313500881195068, + "learning_rate": 9.877700496806245e-05, + "loss": 0.19519026279449464, + "step": 8620 + }, + { + "epoch": 1.2249822569198012, + "grad_norm": 6.073189735412598, + "learning_rate": 9.877558552164656e-05, + "loss": 0.16100149154663085, + "step": 8630 + }, + { + "epoch": 1.226401703335699, + "grad_norm": 9.31675910949707, + "learning_rate": 9.877416607523066e-05, + "loss": 0.24087250232696533, + "step": 8640 + }, + { + "epoch": 1.227821149751597, + "grad_norm": 6.469115734100342, + "learning_rate": 9.877274662881477e-05, + "loss": 0.15760414600372313, + "step": 8650 + }, + { + "epoch": 1.2292405961674946, + "grad_norm": 5.7666192054748535, + "learning_rate": 9.877132718239887e-05, + "loss": 0.2261284589767456, + "step": 8660 + }, + { + "epoch": 1.2306600425833925, + "grad_norm": 7.881688117980957, + "learning_rate": 9.876990773598297e-05, + "loss": 0.22792091369628906, + "step": 8670 + }, + { + "epoch": 1.2320794889992903, + "grad_norm": 4.771458625793457, + "learning_rate": 9.876848828956708e-05, + "loss": 0.21116392612457274, + "step": 8680 + }, + { + "epoch": 1.233498935415188, + "grad_norm": 9.804439544677734, + "learning_rate": 9.876706884315118e-05, + "loss": 0.25815906524658205, + "step": 8690 + }, + { + "epoch": 1.2349183818310858, + "grad_norm": 3.326082229614258, + "learning_rate": 9.876564939673529e-05, + "loss": 0.21468789577484132, + "step": 8700 + }, + { + "epoch": 1.2363378282469837, + "grad_norm": 3.82004714012146, + "learning_rate": 9.876422995031937e-05, + "loss": 0.17646214962005616, + "step": 8710 + }, + { + "epoch": 1.2377572746628815, + "grad_norm": 7.979610443115234, + "learning_rate": 9.876281050390348e-05, + "loss": 0.23217053413391114, + "step": 8720 + }, + { + "epoch": 1.2391767210787792, + "grad_norm": 6.828559398651123, + "learning_rate": 9.876139105748758e-05, + "loss": 0.226235294342041, + "step": 8730 + }, + { + "epoch": 1.240596167494677, + "grad_norm": 7.083154678344727, + "learning_rate": 9.875997161107169e-05, + "loss": 0.2136064052581787, + "step": 8740 + }, + { + "epoch": 1.242015613910575, + "grad_norm": 8.167536735534668, + "learning_rate": 9.875855216465579e-05, + "loss": 0.20408027172088622, + "step": 8750 + }, + { + "epoch": 1.2434350603264726, + "grad_norm": 7.635597229003906, + "learning_rate": 9.875713271823988e-05, + "loss": 0.2205681324005127, + "step": 8760 + }, + { + "epoch": 1.2448545067423704, + "grad_norm": 6.944504737854004, + "learning_rate": 9.8755713271824e-05, + "loss": 0.14819756746292115, + "step": 8770 + }, + { + "epoch": 1.2462739531582683, + "grad_norm": 7.144880771636963, + "learning_rate": 9.875429382540809e-05, + "loss": 0.25865755081176756, + "step": 8780 + }, + { + "epoch": 1.2476933995741661, + "grad_norm": 4.50839900970459, + "learning_rate": 9.87528743789922e-05, + "loss": 0.19764204025268556, + "step": 8790 + }, + { + "epoch": 1.2491128459900638, + "grad_norm": 3.0644021034240723, + "learning_rate": 9.87514549325763e-05, + "loss": 0.23454864025115968, + "step": 8800 + }, + { + "epoch": 1.2505322924059616, + "grad_norm": 6.562272548675537, + "learning_rate": 9.87500354861604e-05, + "loss": 0.2683814525604248, + "step": 8810 + }, + { + "epoch": 1.2519517388218595, + "grad_norm": 4.825582027435303, + "learning_rate": 9.87486160397445e-05, + "loss": 0.2111285924911499, + "step": 8820 + }, + { + "epoch": 1.2533711852377571, + "grad_norm": 5.02101469039917, + "learning_rate": 9.87471965933286e-05, + "loss": 0.20650248527526854, + "step": 8830 + }, + { + "epoch": 1.254790631653655, + "grad_norm": 6.4850754737854, + "learning_rate": 9.87457771469127e-05, + "loss": 0.18662099838256835, + "step": 8840 + }, + { + "epoch": 1.2562100780695529, + "grad_norm": 6.745723724365234, + "learning_rate": 9.874435770049682e-05, + "loss": 0.12750645875930786, + "step": 8850 + }, + { + "epoch": 1.2576295244854507, + "grad_norm": 10.856019973754883, + "learning_rate": 9.874293825408091e-05, + "loss": 0.22051913738250734, + "step": 8860 + }, + { + "epoch": 1.2590489709013486, + "grad_norm": 7.022629737854004, + "learning_rate": 9.874151880766501e-05, + "loss": 0.2626792907714844, + "step": 8870 + }, + { + "epoch": 1.2604684173172462, + "grad_norm": 8.997479438781738, + "learning_rate": 9.874009936124912e-05, + "loss": 0.22494235038757324, + "step": 8880 + }, + { + "epoch": 1.261887863733144, + "grad_norm": 8.640801429748535, + "learning_rate": 9.873867991483322e-05, + "loss": 0.21826319694519042, + "step": 8890 + }, + { + "epoch": 1.2633073101490417, + "grad_norm": 4.579946517944336, + "learning_rate": 9.873726046841733e-05, + "loss": 0.18379125595092774, + "step": 8900 + }, + { + "epoch": 1.2647267565649396, + "grad_norm": 6.971579074859619, + "learning_rate": 9.873584102200143e-05, + "loss": 0.23222970962524414, + "step": 8910 + }, + { + "epoch": 1.2661462029808375, + "grad_norm": 6.197728633880615, + "learning_rate": 9.873442157558552e-05, + "loss": 0.23273870944976807, + "step": 8920 + }, + { + "epoch": 1.2675656493967353, + "grad_norm": 9.468696594238281, + "learning_rate": 9.873300212916962e-05, + "loss": 0.18107137680053711, + "step": 8930 + }, + { + "epoch": 1.2689850958126332, + "grad_norm": 3.7539901733398438, + "learning_rate": 9.873158268275373e-05, + "loss": 0.1382051467895508, + "step": 8940 + }, + { + "epoch": 1.2704045422285308, + "grad_norm": 7.013411521911621, + "learning_rate": 9.873016323633783e-05, + "loss": 0.13840343952178955, + "step": 8950 + }, + { + "epoch": 1.2718239886444287, + "grad_norm": 4.136613845825195, + "learning_rate": 9.872874378992194e-05, + "loss": 0.27057197093963625, + "step": 8960 + }, + { + "epoch": 1.2732434350603263, + "grad_norm": 7.147876262664795, + "learning_rate": 9.872732434350604e-05, + "loss": 0.19125341176986693, + "step": 8970 + }, + { + "epoch": 1.2746628814762242, + "grad_norm": 1.9221298694610596, + "learning_rate": 9.872590489709014e-05, + "loss": 0.22451837062835694, + "step": 8980 + }, + { + "epoch": 1.276082327892122, + "grad_norm": 10.765070915222168, + "learning_rate": 9.872448545067425e-05, + "loss": 0.2057518482208252, + "step": 8990 + }, + { + "epoch": 1.27750177430802, + "grad_norm": 3.960794448852539, + "learning_rate": 9.872306600425834e-05, + "loss": 0.21558022499084473, + "step": 9000 + }, + { + "epoch": 1.27750177430802, + "eval_accuracy": 0.907420359890634, + "eval_loss": 0.2675907015800476, + "eval_runtime": 32.1907, + "eval_samples_per_second": 488.557, + "eval_steps_per_second": 15.284, + "step": 9000 + }, + { + "epoch": 1.2789212207239178, + "grad_norm": 6.640925884246826, + "learning_rate": 9.872164655784245e-05, + "loss": 0.21932268142700195, + "step": 9010 + }, + { + "epoch": 1.2803406671398154, + "grad_norm": 3.883657455444336, + "learning_rate": 9.872022711142654e-05, + "loss": 0.20566184520721437, + "step": 9020 + }, + { + "epoch": 1.2817601135557133, + "grad_norm": 8.243616104125977, + "learning_rate": 9.871880766501065e-05, + "loss": 0.1661081552505493, + "step": 9030 + }, + { + "epoch": 1.2831795599716112, + "grad_norm": 9.827435493469238, + "learning_rate": 9.871738821859475e-05, + "loss": 0.17904939651489257, + "step": 9040 + }, + { + "epoch": 1.2845990063875088, + "grad_norm": 7.80245304107666, + "learning_rate": 9.871596877217886e-05, + "loss": 0.15805249214172362, + "step": 9050 + }, + { + "epoch": 1.2860184528034067, + "grad_norm": 4.689866542816162, + "learning_rate": 9.871454932576297e-05, + "loss": 0.23644819259643554, + "step": 9060 + }, + { + "epoch": 1.2874378992193045, + "grad_norm": 6.257835865020752, + "learning_rate": 9.871312987934705e-05, + "loss": 0.2536448955535889, + "step": 9070 + }, + { + "epoch": 1.2888573456352024, + "grad_norm": 1.8020100593566895, + "learning_rate": 9.871185237757275e-05, + "loss": 0.1373010277748108, + "step": 9080 + }, + { + "epoch": 1.2902767920511, + "grad_norm": 4.135176658630371, + "learning_rate": 9.871043293115685e-05, + "loss": 0.1967120051383972, + "step": 9090 + }, + { + "epoch": 1.2916962384669979, + "grad_norm": 5.261960506439209, + "learning_rate": 9.870901348474096e-05, + "loss": 0.21039602756500245, + "step": 9100 + }, + { + "epoch": 1.2931156848828957, + "grad_norm": 6.985999584197998, + "learning_rate": 9.870759403832506e-05, + "loss": 0.22036538124084473, + "step": 9110 + }, + { + "epoch": 1.2945351312987934, + "grad_norm": 3.4260783195495605, + "learning_rate": 9.870617459190917e-05, + "loss": 0.2039936065673828, + "step": 9120 + }, + { + "epoch": 1.2959545777146912, + "grad_norm": 3.7384250164031982, + "learning_rate": 9.870475514549326e-05, + "loss": 0.20263819694519042, + "step": 9130 + }, + { + "epoch": 1.297374024130589, + "grad_norm": 3.172229528427124, + "learning_rate": 9.870333569907736e-05, + "loss": 0.13130682706832886, + "step": 9140 + }, + { + "epoch": 1.298793470546487, + "grad_norm": 12.370247840881348, + "learning_rate": 9.870191625266146e-05, + "loss": 0.20618796348571777, + "step": 9150 + }, + { + "epoch": 1.3002129169623846, + "grad_norm": 7.193541049957275, + "learning_rate": 9.870049680624557e-05, + "loss": 0.2788748264312744, + "step": 9160 + }, + { + "epoch": 1.3016323633782825, + "grad_norm": 4.76792573928833, + "learning_rate": 9.869907735982967e-05, + "loss": 0.18996012210845947, + "step": 9170 + }, + { + "epoch": 1.3030518097941803, + "grad_norm": 3.7090489864349365, + "learning_rate": 9.869765791341378e-05, + "loss": 0.18860991001129152, + "step": 9180 + }, + { + "epoch": 1.304471256210078, + "grad_norm": 6.190913677215576, + "learning_rate": 9.869623846699788e-05, + "loss": 0.224440860748291, + "step": 9190 + }, + { + "epoch": 1.3058907026259758, + "grad_norm": 3.286689281463623, + "learning_rate": 9.869481902058197e-05, + "loss": 0.20683689117431642, + "step": 9200 + }, + { + "epoch": 1.3073101490418737, + "grad_norm": 4.6291937828063965, + "learning_rate": 9.869339957416608e-05, + "loss": 0.19128093719482422, + "step": 9210 + }, + { + "epoch": 1.3087295954577716, + "grad_norm": 8.739839553833008, + "learning_rate": 9.869198012775018e-05, + "loss": 0.21355061531066893, + "step": 9220 + }, + { + "epoch": 1.3101490418736692, + "grad_norm": 4.578412055969238, + "learning_rate": 9.869056068133429e-05, + "loss": 0.1978748083114624, + "step": 9230 + }, + { + "epoch": 1.311568488289567, + "grad_norm": 5.891171932220459, + "learning_rate": 9.868914123491839e-05, + "loss": 0.21060125827789306, + "step": 9240 + }, + { + "epoch": 1.312987934705465, + "grad_norm": 8.383025169372559, + "learning_rate": 9.868772178850249e-05, + "loss": 0.29614646434783937, + "step": 9250 + }, + { + "epoch": 1.3144073811213626, + "grad_norm": 7.3245930671691895, + "learning_rate": 9.868630234208658e-05, + "loss": 0.22820439338684081, + "step": 9260 + }, + { + "epoch": 1.3158268275372604, + "grad_norm": 3.143709182739258, + "learning_rate": 9.86848828956707e-05, + "loss": 0.1735852003097534, + "step": 9270 + }, + { + "epoch": 1.3172462739531583, + "grad_norm": 8.565205574035645, + "learning_rate": 9.868346344925479e-05, + "loss": 0.175143563747406, + "step": 9280 + }, + { + "epoch": 1.3186657203690562, + "grad_norm": 5.662914752960205, + "learning_rate": 9.86820440028389e-05, + "loss": 0.19213972091674805, + "step": 9290 + }, + { + "epoch": 1.3200851667849538, + "grad_norm": 7.872828960418701, + "learning_rate": 9.8680624556423e-05, + "loss": 0.14704231023788453, + "step": 9300 + }, + { + "epoch": 1.3215046132008517, + "grad_norm": 11.20383071899414, + "learning_rate": 9.86792051100071e-05, + "loss": 0.24307498931884766, + "step": 9310 + }, + { + "epoch": 1.3229240596167495, + "grad_norm": 2.9435956478118896, + "learning_rate": 9.867778566359121e-05, + "loss": 0.23251771926879883, + "step": 9320 + }, + { + "epoch": 1.3243435060326472, + "grad_norm": 3.8682780265808105, + "learning_rate": 9.867636621717531e-05, + "loss": 0.21560065746307372, + "step": 9330 + }, + { + "epoch": 1.325762952448545, + "grad_norm": 7.9737420082092285, + "learning_rate": 9.867494677075942e-05, + "loss": 0.1927724599838257, + "step": 9340 + }, + { + "epoch": 1.327182398864443, + "grad_norm": 6.955791473388672, + "learning_rate": 9.86735273243435e-05, + "loss": 0.22344651222229003, + "step": 9350 + }, + { + "epoch": 1.3286018452803408, + "grad_norm": 9.098529815673828, + "learning_rate": 9.867210787792761e-05, + "loss": 0.2260176420211792, + "step": 9360 + }, + { + "epoch": 1.3300212916962384, + "grad_norm": 5.625829219818115, + "learning_rate": 9.867068843151171e-05, + "loss": 0.1760912299156189, + "step": 9370 + }, + { + "epoch": 1.3314407381121363, + "grad_norm": 2.4090805053710938, + "learning_rate": 9.866926898509582e-05, + "loss": 0.16904083490371705, + "step": 9380 + }, + { + "epoch": 1.3328601845280341, + "grad_norm": 4.635160446166992, + "learning_rate": 9.866784953867992e-05, + "loss": 0.21562621593475342, + "step": 9390 + }, + { + "epoch": 1.3342796309439318, + "grad_norm": 8.606550216674805, + "learning_rate": 9.866643009226402e-05, + "loss": 0.21092190742492675, + "step": 9400 + }, + { + "epoch": 1.3356990773598296, + "grad_norm": 5.678009033203125, + "learning_rate": 9.866501064584813e-05, + "loss": 0.19930131435394288, + "step": 9410 + }, + { + "epoch": 1.3371185237757275, + "grad_norm": 6.880139350891113, + "learning_rate": 9.866359119943222e-05, + "loss": 0.3152653217315674, + "step": 9420 + }, + { + "epoch": 1.3385379701916253, + "grad_norm": 5.563040733337402, + "learning_rate": 9.866217175301633e-05, + "loss": 0.18800781965255736, + "step": 9430 + }, + { + "epoch": 1.339957416607523, + "grad_norm": 2.5089986324310303, + "learning_rate": 9.866075230660043e-05, + "loss": 0.11295425891876221, + "step": 9440 + }, + { + "epoch": 1.3413768630234209, + "grad_norm": 4.770693302154541, + "learning_rate": 9.865933286018453e-05, + "loss": 0.18411701917648315, + "step": 9450 + }, + { + "epoch": 1.3427963094393187, + "grad_norm": 4.498220920562744, + "learning_rate": 9.865791341376863e-05, + "loss": 0.2168651342391968, + "step": 9460 + }, + { + "epoch": 1.3442157558552164, + "grad_norm": 3.5189125537872314, + "learning_rate": 9.865649396735274e-05, + "loss": 0.23824927806854249, + "step": 9470 + }, + { + "epoch": 1.3456352022711142, + "grad_norm": 5.034974098205566, + "learning_rate": 9.865507452093684e-05, + "loss": 0.14622821807861328, + "step": 9480 + }, + { + "epoch": 1.347054648687012, + "grad_norm": 2.3215811252593994, + "learning_rate": 9.865365507452095e-05, + "loss": 0.11778559684753417, + "step": 9490 + }, + { + "epoch": 1.34847409510291, + "grad_norm": 4.806303977966309, + "learning_rate": 9.865223562810504e-05, + "loss": 0.12332210540771485, + "step": 9500 + }, + { + "epoch": 1.34847409510291, + "eval_accuracy": 0.9099637565969352, + "eval_loss": 0.2493496835231781, + "eval_runtime": 31.6926, + "eval_samples_per_second": 496.236, + "eval_steps_per_second": 15.524, + "step": 9500 + }, + { + "epoch": 1.3498935415188076, + "grad_norm": 6.961501598358154, + "learning_rate": 9.865081618168914e-05, + "loss": 0.2591987371444702, + "step": 9510 + }, + { + "epoch": 1.3513129879347054, + "grad_norm": 4.2426323890686035, + "learning_rate": 9.864939673527325e-05, + "loss": 0.17831168174743653, + "step": 9520 + }, + { + "epoch": 1.3527324343506033, + "grad_norm": 6.4358625411987305, + "learning_rate": 9.864797728885735e-05, + "loss": 0.2314450740814209, + "step": 9530 + }, + { + "epoch": 1.354151880766501, + "grad_norm": 5.79241943359375, + "learning_rate": 9.864655784244146e-05, + "loss": 0.18896229267120362, + "step": 9540 + }, + { + "epoch": 1.3555713271823988, + "grad_norm": 7.353359699249268, + "learning_rate": 9.864513839602554e-05, + "loss": 0.19705621004104615, + "step": 9550 + }, + { + "epoch": 1.3569907735982967, + "grad_norm": 6.934425354003906, + "learning_rate": 9.864371894960966e-05, + "loss": 0.17384577989578248, + "step": 9560 + }, + { + "epoch": 1.3584102200141945, + "grad_norm": 5.2685394287109375, + "learning_rate": 9.864229950319375e-05, + "loss": 0.2469557285308838, + "step": 9570 + }, + { + "epoch": 1.3598296664300924, + "grad_norm": 6.054180145263672, + "learning_rate": 9.864088005677786e-05, + "loss": 0.2497105598449707, + "step": 9580 + }, + { + "epoch": 1.36124911284599, + "grad_norm": 3.806577444076538, + "learning_rate": 9.863946061036196e-05, + "loss": 0.16005023717880248, + "step": 9590 + }, + { + "epoch": 1.362668559261888, + "grad_norm": 9.077430725097656, + "learning_rate": 9.863804116394607e-05, + "loss": 0.24311597347259523, + "step": 9600 + }, + { + "epoch": 1.3640880056777855, + "grad_norm": 5.967398166656494, + "learning_rate": 9.863662171753017e-05, + "loss": 0.2098919153213501, + "step": 9610 + }, + { + "epoch": 1.3655074520936834, + "grad_norm": 5.3782172203063965, + "learning_rate": 9.863520227111427e-05, + "loss": 0.22856371402740477, + "step": 9620 + }, + { + "epoch": 1.3669268985095813, + "grad_norm": 7.211184501647949, + "learning_rate": 9.863378282469838e-05, + "loss": 0.19752051830291747, + "step": 9630 + }, + { + "epoch": 1.3683463449254791, + "grad_norm": 2.611245632171631, + "learning_rate": 9.863236337828247e-05, + "loss": 0.20763750076293946, + "step": 9640 + }, + { + "epoch": 1.369765791341377, + "grad_norm": 7.055820465087891, + "learning_rate": 9.863094393186659e-05, + "loss": 0.18712767362594604, + "step": 9650 + }, + { + "epoch": 1.3711852377572746, + "grad_norm": 7.2558112144470215, + "learning_rate": 9.862952448545067e-05, + "loss": 0.24251337051391603, + "step": 9660 + }, + { + "epoch": 1.3726046841731725, + "grad_norm": 6.948854446411133, + "learning_rate": 9.862810503903478e-05, + "loss": 0.1610349178314209, + "step": 9670 + }, + { + "epoch": 1.3740241305890701, + "grad_norm": 6.58130407333374, + "learning_rate": 9.862668559261888e-05, + "loss": 0.1934449315071106, + "step": 9680 + }, + { + "epoch": 1.375443577004968, + "grad_norm": 3.3496904373168945, + "learning_rate": 9.862526614620299e-05, + "loss": 0.17610930204391478, + "step": 9690 + }, + { + "epoch": 1.3768630234208659, + "grad_norm": 9.198835372924805, + "learning_rate": 9.862384669978709e-05, + "loss": 0.17025632858276368, + "step": 9700 + }, + { + "epoch": 1.3782824698367637, + "grad_norm": 1.7735481262207031, + "learning_rate": 9.862242725337118e-05, + "loss": 0.20825440883636476, + "step": 9710 + }, + { + "epoch": 1.3797019162526616, + "grad_norm": 6.809709548950195, + "learning_rate": 9.86210078069553e-05, + "loss": 0.18874866962432862, + "step": 9720 + }, + { + "epoch": 1.3811213626685592, + "grad_norm": 8.268877029418945, + "learning_rate": 9.861958836053939e-05, + "loss": 0.26922762393951416, + "step": 9730 + }, + { + "epoch": 1.382540809084457, + "grad_norm": 2.897256851196289, + "learning_rate": 9.86181689141235e-05, + "loss": 0.24385275840759277, + "step": 9740 + }, + { + "epoch": 1.3839602555003547, + "grad_norm": 3.334864616394043, + "learning_rate": 9.86167494677076e-05, + "loss": 0.16869350671768188, + "step": 9750 + }, + { + "epoch": 1.3853797019162526, + "grad_norm": 7.382256984710693, + "learning_rate": 9.86153300212917e-05, + "loss": 0.18727898597717285, + "step": 9760 + }, + { + "epoch": 1.3867991483321505, + "grad_norm": 3.0756566524505615, + "learning_rate": 9.86139105748758e-05, + "loss": 0.1948513627052307, + "step": 9770 + }, + { + "epoch": 1.3882185947480483, + "grad_norm": 7.820052146911621, + "learning_rate": 9.86124911284599e-05, + "loss": 0.1906062364578247, + "step": 9780 + }, + { + "epoch": 1.3896380411639462, + "grad_norm": 5.2213263511657715, + "learning_rate": 9.8611071682044e-05, + "loss": 0.19792075157165528, + "step": 9790 + }, + { + "epoch": 1.3910574875798438, + "grad_norm": 9.714534759521484, + "learning_rate": 9.860965223562811e-05, + "loss": 0.17712973356246947, + "step": 9800 + }, + { + "epoch": 1.3924769339957417, + "grad_norm": 4.078144073486328, + "learning_rate": 9.860823278921221e-05, + "loss": 0.18135050535202027, + "step": 9810 + }, + { + "epoch": 1.3938963804116393, + "grad_norm": 5.219580173492432, + "learning_rate": 9.860681334279631e-05, + "loss": 0.227278733253479, + "step": 9820 + }, + { + "epoch": 1.3953158268275372, + "grad_norm": 6.879891395568848, + "learning_rate": 9.860539389638042e-05, + "loss": 0.215889835357666, + "step": 9830 + }, + { + "epoch": 1.396735273243435, + "grad_norm": 9.455697059631348, + "learning_rate": 9.860397444996452e-05, + "loss": 0.16740819215774536, + "step": 9840 + }, + { + "epoch": 1.398154719659333, + "grad_norm": 4.630984306335449, + "learning_rate": 9.860255500354863e-05, + "loss": 0.22700212001800538, + "step": 9850 + }, + { + "epoch": 1.3995741660752308, + "grad_norm": 6.121819972991943, + "learning_rate": 9.860113555713271e-05, + "loss": 0.220161509513855, + "step": 9860 + }, + { + "epoch": 1.4009936124911284, + "grad_norm": 2.6966371536254883, + "learning_rate": 9.859971611071682e-05, + "loss": 0.18548699617385864, + "step": 9870 + }, + { + "epoch": 1.4024130589070263, + "grad_norm": 4.1472554206848145, + "learning_rate": 9.859829666430092e-05, + "loss": 0.18523939847946166, + "step": 9880 + }, + { + "epoch": 1.4038325053229241, + "grad_norm": 7.051137924194336, + "learning_rate": 9.859687721788503e-05, + "loss": 0.1325202226638794, + "step": 9890 + }, + { + "epoch": 1.4052519517388218, + "grad_norm": 5.540129661560059, + "learning_rate": 9.859545777146913e-05, + "loss": 0.16468173265457153, + "step": 9900 + }, + { + "epoch": 1.4066713981547196, + "grad_norm": 6.817564487457275, + "learning_rate": 9.859403832505323e-05, + "loss": 0.12863141298294067, + "step": 9910 + }, + { + "epoch": 1.4080908445706175, + "grad_norm": 2.415663719177246, + "learning_rate": 9.859261887863734e-05, + "loss": 0.1454537630081177, + "step": 9920 + }, + { + "epoch": 1.4095102909865154, + "grad_norm": 5.63126277923584, + "learning_rate": 9.859119943222143e-05, + "loss": 0.20712642669677733, + "step": 9930 + }, + { + "epoch": 1.410929737402413, + "grad_norm": 3.990525484085083, + "learning_rate": 9.858977998580555e-05, + "loss": 0.14999470710754395, + "step": 9940 + }, + { + "epoch": 1.4123491838183109, + "grad_norm": 4.665277004241943, + "learning_rate": 9.858836053938964e-05, + "loss": 0.1735332727432251, + "step": 9950 + }, + { + "epoch": 1.4137686302342087, + "grad_norm": 6.532275676727295, + "learning_rate": 9.858694109297375e-05, + "loss": 0.18187229633331298, + "step": 9960 + }, + { + "epoch": 1.4151880766501064, + "grad_norm": 10.086085319519043, + "learning_rate": 9.858552164655784e-05, + "loss": 0.25496907234191896, + "step": 9970 + }, + { + "epoch": 1.4166075230660042, + "grad_norm": 8.85912036895752, + "learning_rate": 9.858410220014195e-05, + "loss": 0.21260628700256348, + "step": 9980 + }, + { + "epoch": 1.418026969481902, + "grad_norm": 3.1774983406066895, + "learning_rate": 9.858268275372605e-05, + "loss": 0.16666808128356933, + "step": 9990 + }, + { + "epoch": 1.4194464158978, + "grad_norm": 8.12264633178711, + "learning_rate": 9.858126330731016e-05, + "loss": 0.13021547794342042, + "step": 10000 + }, + { + "epoch": 1.4194464158978, + "eval_accuracy": 0.9303745151650029, + "eval_loss": 0.2065460979938507, + "eval_runtime": 32.8099, + "eval_samples_per_second": 479.338, + "eval_steps_per_second": 14.995, + "step": 10000 + }, + { + "epoch": 1.4208658623136976, + "grad_norm": 3.760587453842163, + "learning_rate": 9.857984386089427e-05, + "loss": 0.21676597595214844, + "step": 10010 + }, + { + "epoch": 1.4222853087295955, + "grad_norm": 6.741761207580566, + "learning_rate": 9.857842441447835e-05, + "loss": 0.22888615131378173, + "step": 10020 + }, + { + "epoch": 1.4237047551454933, + "grad_norm": 4.405668258666992, + "learning_rate": 9.857700496806246e-05, + "loss": 0.13688948154449462, + "step": 10030 + }, + { + "epoch": 1.425124201561391, + "grad_norm": 5.534117698669434, + "learning_rate": 9.857558552164656e-05, + "loss": 0.14423273801803588, + "step": 10040 + }, + { + "epoch": 1.4265436479772888, + "grad_norm": 5.10047721862793, + "learning_rate": 9.857416607523067e-05, + "loss": 0.2310737133026123, + "step": 10050 + }, + { + "epoch": 1.4279630943931867, + "grad_norm": 3.052246570587158, + "learning_rate": 9.857274662881477e-05, + "loss": 0.20977180004119872, + "step": 10060 + }, + { + "epoch": 1.4293825408090846, + "grad_norm": 9.701653480529785, + "learning_rate": 9.857132718239887e-05, + "loss": 0.22714948654174805, + "step": 10070 + }, + { + "epoch": 1.4308019872249822, + "grad_norm": 2.72581148147583, + "learning_rate": 9.856990773598296e-05, + "loss": 0.2333024263381958, + "step": 10080 + }, + { + "epoch": 1.43222143364088, + "grad_norm": 8.234984397888184, + "learning_rate": 9.856848828956707e-05, + "loss": 0.21033647060394287, + "step": 10090 + }, + { + "epoch": 1.433640880056778, + "grad_norm": 4.618515491485596, + "learning_rate": 9.856706884315118e-05, + "loss": 0.2534619331359863, + "step": 10100 + }, + { + "epoch": 1.4350603264726756, + "grad_norm": 3.2053143978118896, + "learning_rate": 9.856564939673528e-05, + "loss": 0.18584598302841188, + "step": 10110 + }, + { + "epoch": 1.4364797728885734, + "grad_norm": 5.643956661224365, + "learning_rate": 9.856422995031938e-05, + "loss": 0.16008204221725464, + "step": 10120 + }, + { + "epoch": 1.4378992193044713, + "grad_norm": 7.6051201820373535, + "learning_rate": 9.856281050390348e-05, + "loss": 0.19140913486480712, + "step": 10130 + }, + { + "epoch": 1.4393186657203692, + "grad_norm": 8.58385181427002, + "learning_rate": 9.856139105748759e-05, + "loss": 0.22861852645874023, + "step": 10140 + }, + { + "epoch": 1.4407381121362668, + "grad_norm": 3.0554444789886475, + "learning_rate": 9.855997161107168e-05, + "loss": 0.14198927879333495, + "step": 10150 + }, + { + "epoch": 1.4421575585521647, + "grad_norm": 3.255782127380371, + "learning_rate": 9.85585521646558e-05, + "loss": 0.17290072441101073, + "step": 10160 + }, + { + "epoch": 1.4435770049680625, + "grad_norm": 4.403168678283691, + "learning_rate": 9.855713271823988e-05, + "loss": 0.19940041303634642, + "step": 10170 + }, + { + "epoch": 1.4449964513839602, + "grad_norm": 8.145320892333984, + "learning_rate": 9.855571327182399e-05, + "loss": 0.21902050971984863, + "step": 10180 + }, + { + "epoch": 1.446415897799858, + "grad_norm": 5.803956508636475, + "learning_rate": 9.85542938254081e-05, + "loss": 0.21828086376190187, + "step": 10190 + }, + { + "epoch": 1.4478353442157559, + "grad_norm": 8.805460929870605, + "learning_rate": 9.85528743789922e-05, + "loss": 0.23348815441131593, + "step": 10200 + }, + { + "epoch": 1.4492547906316537, + "grad_norm": 7.180856704711914, + "learning_rate": 9.855145493257631e-05, + "loss": 0.18313560485839844, + "step": 10210 + }, + { + "epoch": 1.4506742370475514, + "grad_norm": 7.773831844329834, + "learning_rate": 9.85500354861604e-05, + "loss": 0.18291949033737182, + "step": 10220 + }, + { + "epoch": 1.4520936834634492, + "grad_norm": 1.713024616241455, + "learning_rate": 9.85486160397445e-05, + "loss": 0.11751105785369872, + "step": 10230 + }, + { + "epoch": 1.453513129879347, + "grad_norm": 2.2637596130371094, + "learning_rate": 9.85471965933286e-05, + "loss": 0.14805399179458617, + "step": 10240 + }, + { + "epoch": 1.4549325762952448, + "grad_norm": 8.369937896728516, + "learning_rate": 9.854577714691271e-05, + "loss": 0.2501375198364258, + "step": 10250 + }, + { + "epoch": 1.4563520227111426, + "grad_norm": 9.403657913208008, + "learning_rate": 9.854435770049681e-05, + "loss": 0.1835735559463501, + "step": 10260 + }, + { + "epoch": 1.4577714691270405, + "grad_norm": 7.980884075164795, + "learning_rate": 9.854293825408091e-05, + "loss": 0.2255629301071167, + "step": 10270 + }, + { + "epoch": 1.4591909155429383, + "grad_norm": 13.038922309875488, + "learning_rate": 9.854151880766502e-05, + "loss": 0.1810195565223694, + "step": 10280 + }, + { + "epoch": 1.460610361958836, + "grad_norm": 6.806441783905029, + "learning_rate": 9.854009936124912e-05, + "loss": 0.20559656620025635, + "step": 10290 + }, + { + "epoch": 1.4620298083747338, + "grad_norm": 1.5737494230270386, + "learning_rate": 9.853867991483323e-05, + "loss": 0.17797669172286987, + "step": 10300 + }, + { + "epoch": 1.4634492547906317, + "grad_norm": 10.547101020812988, + "learning_rate": 9.853726046841732e-05, + "loss": 0.14445135593414307, + "step": 10310 + }, + { + "epoch": 1.4648687012065293, + "grad_norm": 7.028156757354736, + "learning_rate": 9.853584102200144e-05, + "loss": 0.19645894765853883, + "step": 10320 + }, + { + "epoch": 1.4662881476224272, + "grad_norm": 8.557269096374512, + "learning_rate": 9.853442157558552e-05, + "loss": 0.14470189809799194, + "step": 10330 + }, + { + "epoch": 1.467707594038325, + "grad_norm": 3.8612992763519287, + "learning_rate": 9.853300212916963e-05, + "loss": 0.18914811611175536, + "step": 10340 + }, + { + "epoch": 1.469127040454223, + "grad_norm": 1.5628553628921509, + "learning_rate": 9.853158268275373e-05, + "loss": 0.15799893140792848, + "step": 10350 + }, + { + "epoch": 1.4705464868701206, + "grad_norm": 1.3893674612045288, + "learning_rate": 9.853016323633784e-05, + "loss": 0.20945143699645996, + "step": 10360 + }, + { + "epoch": 1.4719659332860184, + "grad_norm": 5.654598712921143, + "learning_rate": 9.852874378992194e-05, + "loss": 0.18789818286895751, + "step": 10370 + }, + { + "epoch": 1.4733853797019163, + "grad_norm": 2.126235008239746, + "learning_rate": 9.852732434350603e-05, + "loss": 0.18574261665344238, + "step": 10380 + }, + { + "epoch": 1.474804826117814, + "grad_norm": 6.465456008911133, + "learning_rate": 9.852590489709014e-05, + "loss": 0.2622290849685669, + "step": 10390 + }, + { + "epoch": 1.4762242725337118, + "grad_norm": 0.5080237984657288, + "learning_rate": 9.852448545067424e-05, + "loss": 0.1537003517150879, + "step": 10400 + }, + { + "epoch": 1.4776437189496097, + "grad_norm": 1.72958505153656, + "learning_rate": 9.852306600425835e-05, + "loss": 0.15624310970306396, + "step": 10410 + }, + { + "epoch": 1.4790631653655075, + "grad_norm": 4.848511695861816, + "learning_rate": 9.852164655784245e-05, + "loss": 0.12883809804916382, + "step": 10420 + }, + { + "epoch": 1.4804826117814054, + "grad_norm": 5.730294227600098, + "learning_rate": 9.852022711142655e-05, + "loss": 0.14428837299346925, + "step": 10430 + }, + { + "epoch": 1.481902058197303, + "grad_norm": 4.0559539794921875, + "learning_rate": 9.851880766501064e-05, + "loss": 0.1629919409751892, + "step": 10440 + }, + { + "epoch": 1.483321504613201, + "grad_norm": 4.338459014892578, + "learning_rate": 9.851738821859476e-05, + "loss": 0.17030248641967774, + "step": 10450 + }, + { + "epoch": 1.4847409510290985, + "grad_norm": 10.856430053710938, + "learning_rate": 9.851596877217885e-05, + "loss": 0.23294711112976074, + "step": 10460 + }, + { + "epoch": 1.4861603974449964, + "grad_norm": 5.3764729499816895, + "learning_rate": 9.851454932576296e-05, + "loss": 0.1908231258392334, + "step": 10470 + }, + { + "epoch": 1.4875798438608943, + "grad_norm": 7.5525736808776855, + "learning_rate": 9.851312987934706e-05, + "loss": 0.1458095669746399, + "step": 10480 + }, + { + "epoch": 1.4889992902767921, + "grad_norm": 4.017747402191162, + "learning_rate": 9.851171043293116e-05, + "loss": 0.09822410345077515, + "step": 10490 + }, + { + "epoch": 1.49041873669269, + "grad_norm": 3.671755075454712, + "learning_rate": 9.851029098651527e-05, + "loss": 0.2174128770828247, + "step": 10500 + }, + { + "epoch": 1.49041873669269, + "eval_accuracy": 0.9363514974248108, + "eval_loss": 0.18055449426174164, + "eval_runtime": 32.7495, + "eval_samples_per_second": 480.221, + "eval_steps_per_second": 15.023, + "step": 10500 + }, + { + "epoch": 1.4918381831085876, + "grad_norm": 5.814731597900391, + "learning_rate": 9.850887154009937e-05, + "loss": 0.20221278667449952, + "step": 10510 + }, + { + "epoch": 1.4932576295244855, + "grad_norm": 4.894477367401123, + "learning_rate": 9.850745209368348e-05, + "loss": 0.1364034056663513, + "step": 10520 + }, + { + "epoch": 1.4946770759403831, + "grad_norm": 9.05544662475586, + "learning_rate": 9.850603264726756e-05, + "loss": 0.2525052785873413, + "step": 10530 + }, + { + "epoch": 1.496096522356281, + "grad_norm": 4.482929706573486, + "learning_rate": 9.850461320085167e-05, + "loss": 0.16218397617340088, + "step": 10540 + }, + { + "epoch": 1.4975159687721789, + "grad_norm": 6.634395599365234, + "learning_rate": 9.850319375443577e-05, + "loss": 0.14512306451797485, + "step": 10550 + }, + { + "epoch": 1.4989354151880767, + "grad_norm": 8.131645202636719, + "learning_rate": 9.850177430801988e-05, + "loss": 0.1850733518600464, + "step": 10560 + }, + { + "epoch": 1.5003548616039746, + "grad_norm": 7.16902494430542, + "learning_rate": 9.850035486160398e-05, + "loss": 0.232697057723999, + "step": 10570 + }, + { + "epoch": 1.5017743080198722, + "grad_norm": 9.409531593322754, + "learning_rate": 9.849893541518808e-05, + "loss": 0.13974694013595582, + "step": 10580 + }, + { + "epoch": 1.50319375443577, + "grad_norm": 6.473144054412842, + "learning_rate": 9.849751596877219e-05, + "loss": 0.1807733178138733, + "step": 10590 + }, + { + "epoch": 1.5046132008516677, + "grad_norm": 2.1681149005889893, + "learning_rate": 9.849609652235628e-05, + "loss": 0.12265112400054931, + "step": 10600 + }, + { + "epoch": 1.5060326472675656, + "grad_norm": 5.138197898864746, + "learning_rate": 9.84946770759404e-05, + "loss": 0.14840331077575683, + "step": 10610 + }, + { + "epoch": 1.5074520936834634, + "grad_norm": 7.284664630889893, + "learning_rate": 9.849325762952449e-05, + "loss": 0.14850282669067383, + "step": 10620 + }, + { + "epoch": 1.5088715400993613, + "grad_norm": 3.7971346378326416, + "learning_rate": 9.84918381831086e-05, + "loss": 0.1547774314880371, + "step": 10630 + }, + { + "epoch": 1.5102909865152592, + "grad_norm": 6.039275169372559, + "learning_rate": 9.849041873669269e-05, + "loss": 0.197337806224823, + "step": 10640 + }, + { + "epoch": 1.5117104329311568, + "grad_norm": 3.9703164100646973, + "learning_rate": 9.84889992902768e-05, + "loss": 0.2073758363723755, + "step": 10650 + }, + { + "epoch": 1.5131298793470547, + "grad_norm": 9.968624114990234, + "learning_rate": 9.84875798438609e-05, + "loss": 0.1673255443572998, + "step": 10660 + }, + { + "epoch": 1.5145493257629523, + "grad_norm": 5.294106483459473, + "learning_rate": 9.8486160397445e-05, + "loss": 0.1461545467376709, + "step": 10670 + }, + { + "epoch": 1.5159687721788502, + "grad_norm": 10.589927673339844, + "learning_rate": 9.84847409510291e-05, + "loss": 0.1678829312324524, + "step": 10680 + }, + { + "epoch": 1.517388218594748, + "grad_norm": 8.75311279296875, + "learning_rate": 9.84833215046132e-05, + "loss": 0.1493905782699585, + "step": 10690 + }, + { + "epoch": 1.518807665010646, + "grad_norm": 5.052854061126709, + "learning_rate": 9.848190205819731e-05, + "loss": 0.16829880475997924, + "step": 10700 + }, + { + "epoch": 1.5202271114265438, + "grad_norm": 10.165739059448242, + "learning_rate": 9.848048261178141e-05, + "loss": 0.1630192756652832, + "step": 10710 + }, + { + "epoch": 1.5216465578424414, + "grad_norm": 4.576249599456787, + "learning_rate": 9.847906316536552e-05, + "loss": 0.18904685974121094, + "step": 10720 + }, + { + "epoch": 1.5230660042583393, + "grad_norm": 6.297980308532715, + "learning_rate": 9.847764371894962e-05, + "loss": 0.20620598793029785, + "step": 10730 + }, + { + "epoch": 1.524485450674237, + "grad_norm": 6.77498197555542, + "learning_rate": 9.847622427253371e-05, + "loss": 0.16875416040420532, + "step": 10740 + }, + { + "epoch": 1.5259048970901348, + "grad_norm": 3.679386854171753, + "learning_rate": 9.847480482611781e-05, + "loss": 0.17838630676269532, + "step": 10750 + }, + { + "epoch": 1.5273243435060326, + "grad_norm": 9.312896728515625, + "learning_rate": 9.847338537970192e-05, + "loss": 0.21157798767089844, + "step": 10760 + }, + { + "epoch": 1.5287437899219305, + "grad_norm": 7.985523223876953, + "learning_rate": 9.847196593328602e-05, + "loss": 0.18047035932540895, + "step": 10770 + }, + { + "epoch": 1.5301632363378284, + "grad_norm": 6.29368257522583, + "learning_rate": 9.847054648687013e-05, + "loss": 0.1568093180656433, + "step": 10780 + }, + { + "epoch": 1.531582682753726, + "grad_norm": 5.2899322509765625, + "learning_rate": 9.846912704045423e-05, + "loss": 0.14504846334457397, + "step": 10790 + }, + { + "epoch": 1.5330021291696239, + "grad_norm": 1.8608068227767944, + "learning_rate": 9.846770759403833e-05, + "loss": 0.10261296033859253, + "step": 10800 + }, + { + "epoch": 1.5344215755855215, + "grad_norm": 7.755560398101807, + "learning_rate": 9.846628814762244e-05, + "loss": 0.20737462043762206, + "step": 10810 + }, + { + "epoch": 1.5358410220014194, + "grad_norm": 5.849984645843506, + "learning_rate": 9.846486870120653e-05, + "loss": 0.13056904077529907, + "step": 10820 + }, + { + "epoch": 1.5372604684173172, + "grad_norm": 12.66482162475586, + "learning_rate": 9.846344925479065e-05, + "loss": 0.18910495042800904, + "step": 10830 + }, + { + "epoch": 1.538679914833215, + "grad_norm": 5.568217754364014, + "learning_rate": 9.846202980837473e-05, + "loss": 0.21616907119750978, + "step": 10840 + }, + { + "epoch": 1.540099361249113, + "grad_norm": 7.100687503814697, + "learning_rate": 9.846061036195884e-05, + "loss": 0.2003716230392456, + "step": 10850 + }, + { + "epoch": 1.5415188076650106, + "grad_norm": 5.5214009284973145, + "learning_rate": 9.845919091554294e-05, + "loss": 0.17750124931335448, + "step": 10860 + }, + { + "epoch": 1.5429382540809085, + "grad_norm": 7.188937664031982, + "learning_rate": 9.845777146912705e-05, + "loss": 0.18738465309143065, + "step": 10870 + }, + { + "epoch": 1.544357700496806, + "grad_norm": 6.263291358947754, + "learning_rate": 9.845635202271115e-05, + "loss": 0.14714010953903198, + "step": 10880 + }, + { + "epoch": 1.545777146912704, + "grad_norm": 1.6037124395370483, + "learning_rate": 9.845493257629524e-05, + "loss": 0.16528385877609253, + "step": 10890 + }, + { + "epoch": 1.5471965933286018, + "grad_norm": 6.341423034667969, + "learning_rate": 9.845351312987935e-05, + "loss": 0.16852269172668458, + "step": 10900 + }, + { + "epoch": 1.5486160397444997, + "grad_norm": 1.0601999759674072, + "learning_rate": 9.845209368346345e-05, + "loss": 0.165651535987854, + "step": 10910 + }, + { + "epoch": 1.5500354861603975, + "grad_norm": 6.944467544555664, + "learning_rate": 9.845067423704756e-05, + "loss": 0.21995656490325927, + "step": 10920 + }, + { + "epoch": 1.5514549325762954, + "grad_norm": 6.1232380867004395, + "learning_rate": 9.844925479063166e-05, + "loss": 0.23545873165130615, + "step": 10930 + }, + { + "epoch": 1.552874378992193, + "grad_norm": 5.78615665435791, + "learning_rate": 9.844783534421576e-05, + "loss": 0.20628550052642822, + "step": 10940 + }, + { + "epoch": 1.5542938254080907, + "grad_norm": 2.3399593830108643, + "learning_rate": 9.844641589779985e-05, + "loss": 0.1314982771873474, + "step": 10950 + }, + { + "epoch": 1.5557132718239886, + "grad_norm": 8.838848114013672, + "learning_rate": 9.844499645138397e-05, + "loss": 0.17209669351577758, + "step": 10960 + }, + { + "epoch": 1.5571327182398864, + "grad_norm": 6.756653308868408, + "learning_rate": 9.844357700496806e-05, + "loss": 0.2233790397644043, + "step": 10970 + }, + { + "epoch": 1.5585521646557843, + "grad_norm": 3.664095163345337, + "learning_rate": 9.844215755855217e-05, + "loss": 0.14182189702987671, + "step": 10980 + }, + { + "epoch": 1.5599716110716821, + "grad_norm": 6.118113040924072, + "learning_rate": 9.844073811213627e-05, + "loss": 0.1605884075164795, + "step": 10990 + }, + { + "epoch": 1.56139105748758, + "grad_norm": 3.3329458236694336, + "learning_rate": 9.843931866572037e-05, + "loss": 0.15648469924926758, + "step": 11000 + }, + { + "epoch": 1.56139105748758, + "eval_accuracy": 0.9343803649774274, + "eval_loss": 0.18083110451698303, + "eval_runtime": 31.9521, + "eval_samples_per_second": 492.205, + "eval_steps_per_second": 15.398, + "step": 11000 + }, + { + "epoch": 1.5628105039034776, + "grad_norm": 2.8265178203582764, + "learning_rate": 9.843789921930448e-05, + "loss": 0.1055110216140747, + "step": 11010 + }, + { + "epoch": 1.5642299503193753, + "grad_norm": 7.40562105178833, + "learning_rate": 9.843647977288858e-05, + "loss": 0.1931678533554077, + "step": 11020 + }, + { + "epoch": 1.5656493967352731, + "grad_norm": 5.846470355987549, + "learning_rate": 9.843506032647269e-05, + "loss": 0.16744234561920165, + "step": 11030 + }, + { + "epoch": 1.567068843151171, + "grad_norm": 10.13637924194336, + "learning_rate": 9.843364088005678e-05, + "loss": 0.16841363906860352, + "step": 11040 + }, + { + "epoch": 1.5684882895670689, + "grad_norm": 8.881434440612793, + "learning_rate": 9.843222143364088e-05, + "loss": 0.11868530511856079, + "step": 11050 + }, + { + "epoch": 1.5699077359829667, + "grad_norm": 3.2120912075042725, + "learning_rate": 9.843080198722498e-05, + "loss": 0.25566916465759276, + "step": 11060 + }, + { + "epoch": 1.5713271823988646, + "grad_norm": 8.856307983398438, + "learning_rate": 9.842938254080909e-05, + "loss": 0.16841399669647217, + "step": 11070 + }, + { + "epoch": 1.5727466288147622, + "grad_norm": 5.458991050720215, + "learning_rate": 9.842796309439319e-05, + "loss": 0.1553714632987976, + "step": 11080 + }, + { + "epoch": 1.5741660752306599, + "grad_norm": 7.29731559753418, + "learning_rate": 9.84265436479773e-05, + "loss": 0.12889499664306642, + "step": 11090 + }, + { + "epoch": 1.5755855216465577, + "grad_norm": 4.352165699005127, + "learning_rate": 9.84251242015614e-05, + "loss": 0.17049648761749267, + "step": 11100 + }, + { + "epoch": 1.5770049680624556, + "grad_norm": 3.659630060195923, + "learning_rate": 9.84237047551455e-05, + "loss": 0.11960989236831665, + "step": 11110 + }, + { + "epoch": 1.5784244144783535, + "grad_norm": 9.198236465454102, + "learning_rate": 9.84222853087296e-05, + "loss": 0.13858609199523925, + "step": 11120 + }, + { + "epoch": 1.5798438608942513, + "grad_norm": 4.7100510597229, + "learning_rate": 9.84208658623137e-05, + "loss": 0.15008503198623657, + "step": 11130 + }, + { + "epoch": 1.5812633073101492, + "grad_norm": 7.331428050994873, + "learning_rate": 9.841944641589781e-05, + "loss": 0.1811345934867859, + "step": 11140 + }, + { + "epoch": 1.5826827537260468, + "grad_norm": 7.792325019836426, + "learning_rate": 9.84180269694819e-05, + "loss": 0.22963361740112304, + "step": 11150 + }, + { + "epoch": 1.5841022001419447, + "grad_norm": 1.6901665925979614, + "learning_rate": 9.841660752306601e-05, + "loss": 0.12061529159545899, + "step": 11160 + }, + { + "epoch": 1.5855216465578423, + "grad_norm": 6.294560432434082, + "learning_rate": 9.84151880766501e-05, + "loss": 0.1813538670539856, + "step": 11170 + }, + { + "epoch": 1.5869410929737402, + "grad_norm": 5.661618232727051, + "learning_rate": 9.841376863023422e-05, + "loss": 0.13598719835281373, + "step": 11180 + }, + { + "epoch": 1.588360539389638, + "grad_norm": 4.586926460266113, + "learning_rate": 9.841234918381831e-05, + "loss": 0.151306414604187, + "step": 11190 + }, + { + "epoch": 1.589779985805536, + "grad_norm": 3.2611052989959717, + "learning_rate": 9.841092973740241e-05, + "loss": 0.202089524269104, + "step": 11200 + }, + { + "epoch": 1.5911994322214338, + "grad_norm": 5.5583109855651855, + "learning_rate": 9.840951029098652e-05, + "loss": 0.13323140144348145, + "step": 11210 + }, + { + "epoch": 1.5926188786373314, + "grad_norm": 2.7712435722351074, + "learning_rate": 9.840809084457062e-05, + "loss": 0.2039250135421753, + "step": 11220 + }, + { + "epoch": 1.5940383250532293, + "grad_norm": 5.573919773101807, + "learning_rate": 9.840667139815473e-05, + "loss": 0.22665846347808838, + "step": 11230 + }, + { + "epoch": 1.595457771469127, + "grad_norm": 4.785495758056641, + "learning_rate": 9.840525195173883e-05, + "loss": 0.13016164302825928, + "step": 11240 + }, + { + "epoch": 1.5968772178850248, + "grad_norm": 5.181567668914795, + "learning_rate": 9.840383250532292e-05, + "loss": 0.1920285105705261, + "step": 11250 + }, + { + "epoch": 1.5982966643009227, + "grad_norm": 6.854187488555908, + "learning_rate": 9.840255500354862e-05, + "loss": 0.17289340496063232, + "step": 11260 + }, + { + "epoch": 1.5997161107168205, + "grad_norm": 5.818141937255859, + "learning_rate": 9.840113555713272e-05, + "loss": 0.1366284132003784, + "step": 11270 + }, + { + "epoch": 1.6011355571327184, + "grad_norm": 5.610560417175293, + "learning_rate": 9.839971611071682e-05, + "loss": 0.15053837299346923, + "step": 11280 + }, + { + "epoch": 1.602555003548616, + "grad_norm": 3.7539663314819336, + "learning_rate": 9.839829666430093e-05, + "loss": 0.14345501661300658, + "step": 11290 + }, + { + "epoch": 1.6039744499645139, + "grad_norm": 7.876579284667969, + "learning_rate": 9.839687721788503e-05, + "loss": 0.13623604774475098, + "step": 11300 + }, + { + "epoch": 1.6053938963804115, + "grad_norm": 7.193563461303711, + "learning_rate": 9.839545777146914e-05, + "loss": 0.21021018028259278, + "step": 11310 + }, + { + "epoch": 1.6068133427963094, + "grad_norm": 3.236804485321045, + "learning_rate": 9.839403832505323e-05, + "loss": 0.1547287106513977, + "step": 11320 + }, + { + "epoch": 1.6082327892122072, + "grad_norm": 5.831701278686523, + "learning_rate": 9.839261887863733e-05, + "loss": 0.2037062644958496, + "step": 11330 + }, + { + "epoch": 1.609652235628105, + "grad_norm": 11.167473793029785, + "learning_rate": 9.839119943222144e-05, + "loss": 0.23104898929595946, + "step": 11340 + }, + { + "epoch": 1.611071682044003, + "grad_norm": 8.400900840759277, + "learning_rate": 9.838977998580554e-05, + "loss": 0.18747899532318116, + "step": 11350 + }, + { + "epoch": 1.6124911284599006, + "grad_norm": 5.5414042472839355, + "learning_rate": 9.838836053938965e-05, + "loss": 0.20507404804229737, + "step": 11360 + }, + { + "epoch": 1.6139105748757985, + "grad_norm": 5.533061504364014, + "learning_rate": 9.838694109297375e-05, + "loss": 0.17890411615371704, + "step": 11370 + }, + { + "epoch": 1.6153300212916961, + "grad_norm": 2.9510483741760254, + "learning_rate": 9.838552164655785e-05, + "loss": 0.16628677845001222, + "step": 11380 + }, + { + "epoch": 1.616749467707594, + "grad_norm": 5.596954822540283, + "learning_rate": 9.838410220014194e-05, + "loss": 0.14340368509292603, + "step": 11390 + }, + { + "epoch": 1.6181689141234918, + "grad_norm": 1.025497555732727, + "learning_rate": 9.838268275372605e-05, + "loss": 0.1132912278175354, + "step": 11400 + }, + { + "epoch": 1.6195883605393897, + "grad_norm": 8.293600082397461, + "learning_rate": 9.838126330731015e-05, + "loss": 0.15983034372329713, + "step": 11410 + }, + { + "epoch": 1.6210078069552876, + "grad_norm": 6.942419052124023, + "learning_rate": 9.837984386089426e-05, + "loss": 0.18471511602401733, + "step": 11420 + }, + { + "epoch": 1.6224272533711852, + "grad_norm": 7.051154613494873, + "learning_rate": 9.837842441447836e-05, + "loss": 0.17162368297576905, + "step": 11430 + }, + { + "epoch": 1.623846699787083, + "grad_norm": 4.608026504516602, + "learning_rate": 9.837700496806246e-05, + "loss": 0.17447967529296876, + "step": 11440 + }, + { + "epoch": 1.6252661462029807, + "grad_norm": 2.5280375480651855, + "learning_rate": 9.837558552164657e-05, + "loss": 0.13198750019073485, + "step": 11450 + }, + { + "epoch": 1.6266855926188786, + "grad_norm": 5.921835422515869, + "learning_rate": 9.837416607523067e-05, + "loss": 0.19506406784057617, + "step": 11460 + }, + { + "epoch": 1.6281050390347764, + "grad_norm": 1.4568758010864258, + "learning_rate": 9.837274662881478e-05, + "loss": 0.12564977407455444, + "step": 11470 + }, + { + "epoch": 1.6295244854506743, + "grad_norm": 4.619745254516602, + "learning_rate": 9.837132718239886e-05, + "loss": 0.1366949200630188, + "step": 11480 + }, + { + "epoch": 1.6309439318665722, + "grad_norm": 13.973068237304688, + "learning_rate": 9.836990773598297e-05, + "loss": 0.2520665168762207, + "step": 11490 + }, + { + "epoch": 1.6323633782824698, + "grad_norm": 5.616090297698975, + "learning_rate": 9.836848828956707e-05, + "loss": 0.24036917686462403, + "step": 11500 + }, + { + "epoch": 1.6323633782824698, + "eval_accuracy": 0.938894894131112, + "eval_loss": 0.17282415926456451, + "eval_runtime": 32.6586, + "eval_samples_per_second": 481.558, + "eval_steps_per_second": 15.065, + "step": 11500 + }, + { + "epoch": 1.6337828246983677, + "grad_norm": 2.5921289920806885, + "learning_rate": 9.836706884315118e-05, + "loss": 0.1288065195083618, + "step": 11510 + }, + { + "epoch": 1.6352022711142653, + "grad_norm": 3.20184326171875, + "learning_rate": 9.836564939673528e-05, + "loss": 0.14583102464675904, + "step": 11520 + }, + { + "epoch": 1.6366217175301632, + "grad_norm": 5.127830505371094, + "learning_rate": 9.836422995031937e-05, + "loss": 0.18197163343429565, + "step": 11530 + }, + { + "epoch": 1.638041163946061, + "grad_norm": 7.125634670257568, + "learning_rate": 9.836281050390349e-05, + "loss": 0.1912643551826477, + "step": 11540 + }, + { + "epoch": 1.639460610361959, + "grad_norm": 2.9785008430480957, + "learning_rate": 9.836139105748758e-05, + "loss": 0.13757799863815307, + "step": 11550 + }, + { + "epoch": 1.6408800567778568, + "grad_norm": 1.8115347623825073, + "learning_rate": 9.83599716110717e-05, + "loss": 0.1510754942893982, + "step": 11560 + }, + { + "epoch": 1.6422995031937544, + "grad_norm": 3.6485488414764404, + "learning_rate": 9.835855216465579e-05, + "loss": 0.17528530359268188, + "step": 11570 + }, + { + "epoch": 1.6437189496096523, + "grad_norm": 5.931766510009766, + "learning_rate": 9.835713271823989e-05, + "loss": 0.20811958312988282, + "step": 11580 + }, + { + "epoch": 1.64513839602555, + "grad_norm": 7.735183238983154, + "learning_rate": 9.835571327182399e-05, + "loss": 0.1395600199699402, + "step": 11590 + }, + { + "epoch": 1.6465578424414478, + "grad_norm": 5.529693603515625, + "learning_rate": 9.83542938254081e-05, + "loss": 0.14511030912399292, + "step": 11600 + }, + { + "epoch": 1.6479772888573456, + "grad_norm": 4.704524993896484, + "learning_rate": 9.83528743789922e-05, + "loss": 0.1279573082923889, + "step": 11610 + }, + { + "epoch": 1.6493967352732435, + "grad_norm": 11.802435874938965, + "learning_rate": 9.83514549325763e-05, + "loss": 0.14364974498748778, + "step": 11620 + }, + { + "epoch": 1.6508161816891413, + "grad_norm": 7.839514255523682, + "learning_rate": 9.83500354861604e-05, + "loss": 0.17981865406036376, + "step": 11630 + }, + { + "epoch": 1.652235628105039, + "grad_norm": 6.616874694824219, + "learning_rate": 9.83486160397445e-05, + "loss": 0.2129373550415039, + "step": 11640 + }, + { + "epoch": 1.6536550745209369, + "grad_norm": 2.111496925354004, + "learning_rate": 9.834719659332861e-05, + "loss": 0.21924855709075927, + "step": 11650 + }, + { + "epoch": 1.6550745209368345, + "grad_norm": 10.006966590881348, + "learning_rate": 9.834577714691271e-05, + "loss": 0.17941123247146606, + "step": 11660 + }, + { + "epoch": 1.6564939673527324, + "grad_norm": 5.636976718902588, + "learning_rate": 9.834435770049682e-05, + "loss": 0.166895854473114, + "step": 11670 + }, + { + "epoch": 1.6579134137686302, + "grad_norm": 1.7106539011001587, + "learning_rate": 9.834293825408092e-05, + "loss": 0.16953905820846557, + "step": 11680 + }, + { + "epoch": 1.659332860184528, + "grad_norm": 5.924720764160156, + "learning_rate": 9.834151880766501e-05, + "loss": 0.12511081695556642, + "step": 11690 + }, + { + "epoch": 1.660752306600426, + "grad_norm": 8.140963554382324, + "learning_rate": 9.834009936124911e-05, + "loss": 0.15308539867401122, + "step": 11700 + }, + { + "epoch": 1.6621717530163236, + "grad_norm": 2.5716195106506348, + "learning_rate": 9.833867991483322e-05, + "loss": 0.1372369647026062, + "step": 11710 + }, + { + "epoch": 1.6635911994322214, + "grad_norm": 7.952601909637451, + "learning_rate": 9.833726046841732e-05, + "loss": 0.14670779705047607, + "step": 11720 + }, + { + "epoch": 1.665010645848119, + "grad_norm": 1.4507794380187988, + "learning_rate": 9.833584102200143e-05, + "loss": 0.1868760108947754, + "step": 11730 + }, + { + "epoch": 1.666430092264017, + "grad_norm": 7.695814609527588, + "learning_rate": 9.833442157558553e-05, + "loss": 0.24691624641418458, + "step": 11740 + }, + { + "epoch": 1.6678495386799148, + "grad_norm": 10.15262508392334, + "learning_rate": 9.833300212916962e-05, + "loss": 0.2450582504272461, + "step": 11750 + }, + { + "epoch": 1.6692689850958127, + "grad_norm": 5.300413131713867, + "learning_rate": 9.833158268275374e-05, + "loss": 0.17981985807418824, + "step": 11760 + }, + { + "epoch": 1.6706884315117105, + "grad_norm": 10.736809730529785, + "learning_rate": 9.833016323633783e-05, + "loss": 0.12192434072494507, + "step": 11770 + }, + { + "epoch": 1.6721078779276084, + "grad_norm": 2.6130592823028564, + "learning_rate": 9.832874378992194e-05, + "loss": 0.1472996473312378, + "step": 11780 + }, + { + "epoch": 1.673527324343506, + "grad_norm": 6.176468849182129, + "learning_rate": 9.832732434350603e-05, + "loss": 0.12378195524215699, + "step": 11790 + }, + { + "epoch": 1.6749467707594037, + "grad_norm": 12.4953031539917, + "learning_rate": 9.832590489709014e-05, + "loss": 0.18659558296203613, + "step": 11800 + }, + { + "epoch": 1.6763662171753015, + "grad_norm": 6.664957046508789, + "learning_rate": 9.832448545067424e-05, + "loss": 0.17845855951309203, + "step": 11810 + }, + { + "epoch": 1.6777856635911994, + "grad_norm": 4.767297267913818, + "learning_rate": 9.832306600425835e-05, + "loss": 0.20129690170288086, + "step": 11820 + }, + { + "epoch": 1.6792051100070973, + "grad_norm": 8.662429809570312, + "learning_rate": 9.832164655784244e-05, + "loss": 0.19204812049865722, + "step": 11830 + }, + { + "epoch": 1.6806245564229951, + "grad_norm": 4.443410873413086, + "learning_rate": 9.832022711142654e-05, + "loss": 0.17241191864013672, + "step": 11840 + }, + { + "epoch": 1.682044002838893, + "grad_norm": 6.706130027770996, + "learning_rate": 9.831880766501065e-05, + "loss": 0.14194031953811645, + "step": 11850 + }, + { + "epoch": 1.6834634492547906, + "grad_norm": 4.810044288635254, + "learning_rate": 9.831738821859475e-05, + "loss": 0.1292971134185791, + "step": 11860 + }, + { + "epoch": 1.6848828956706883, + "grad_norm": 4.945130348205566, + "learning_rate": 9.831596877217886e-05, + "loss": 0.13104760646820068, + "step": 11870 + }, + { + "epoch": 1.6863023420865861, + "grad_norm": 7.412860870361328, + "learning_rate": 9.831454932576296e-05, + "loss": 0.18914194107055665, + "step": 11880 + }, + { + "epoch": 1.687721788502484, + "grad_norm": 1.9591195583343506, + "learning_rate": 9.831312987934706e-05, + "loss": 0.1756757378578186, + "step": 11890 + }, + { + "epoch": 1.6891412349183819, + "grad_norm": 2.857415199279785, + "learning_rate": 9.831171043293115e-05, + "loss": 0.10278797149658203, + "step": 11900 + }, + { + "epoch": 1.6905606813342797, + "grad_norm": 2.342369556427002, + "learning_rate": 9.831029098651526e-05, + "loss": 0.12141529321670533, + "step": 11910 + }, + { + "epoch": 1.6919801277501776, + "grad_norm": 5.84676456451416, + "learning_rate": 9.830887154009936e-05, + "loss": 0.20085587501525878, + "step": 11920 + }, + { + "epoch": 1.6933995741660752, + "grad_norm": 3.6309845447540283, + "learning_rate": 9.830745209368347e-05, + "loss": 0.15413752794265748, + "step": 11930 + }, + { + "epoch": 1.6948190205819729, + "grad_norm": 2.3892900943756104, + "learning_rate": 9.830603264726757e-05, + "loss": 0.15552257299423217, + "step": 11940 + }, + { + "epoch": 1.6962384669978707, + "grad_norm": 0.9857825636863708, + "learning_rate": 9.830461320085167e-05, + "loss": 0.15181114673614501, + "step": 11950 + }, + { + "epoch": 1.6976579134137686, + "grad_norm": 6.49855375289917, + "learning_rate": 9.830319375443578e-05, + "loss": 0.17083282470703126, + "step": 11960 + }, + { + "epoch": 1.6990773598296665, + "grad_norm": 1.0913960933685303, + "learning_rate": 9.830177430801988e-05, + "loss": 0.2133202314376831, + "step": 11970 + }, + { + "epoch": 1.7004968062455643, + "grad_norm": 4.437821388244629, + "learning_rate": 9.830035486160399e-05, + "loss": 0.0879701018333435, + "step": 11980 + }, + { + "epoch": 1.7019162526614622, + "grad_norm": 4.715758800506592, + "learning_rate": 9.829893541518807e-05, + "loss": 0.15447641611099244, + "step": 11990 + }, + { + "epoch": 1.7033356990773598, + "grad_norm": 8.367589950561523, + "learning_rate": 9.829751596877218e-05, + "loss": 0.17715357542037963, + "step": 12000 + }, + { + "epoch": 1.7033356990773598, + "eval_accuracy": 0.9378775354485916, + "eval_loss": 0.17906926572322845, + "eval_runtime": 33.4925, + "eval_samples_per_second": 469.568, + "eval_steps_per_second": 14.69, + "step": 12000 + }, + { + "epoch": 1.7047551454932577, + "grad_norm": 8.013254165649414, + "learning_rate": 9.829609652235628e-05, + "loss": 0.1866832494735718, + "step": 12010 + }, + { + "epoch": 1.7061745919091553, + "grad_norm": 7.372905731201172, + "learning_rate": 9.829467707594039e-05, + "loss": 0.124139404296875, + "step": 12020 + }, + { + "epoch": 1.7075940383250532, + "grad_norm": 6.6865739822387695, + "learning_rate": 9.829325762952449e-05, + "loss": 0.12705342769622802, + "step": 12030 + }, + { + "epoch": 1.709013484740951, + "grad_norm": 4.504441738128662, + "learning_rate": 9.82918381831086e-05, + "loss": 0.1867109179496765, + "step": 12040 + }, + { + "epoch": 1.710432931156849, + "grad_norm": 1.8893638849258423, + "learning_rate": 9.82904187366927e-05, + "loss": 0.14493658542633056, + "step": 12050 + }, + { + "epoch": 1.7118523775727468, + "grad_norm": 1.72226083278656, + "learning_rate": 9.828899929027679e-05, + "loss": 0.1554844617843628, + "step": 12060 + }, + { + "epoch": 1.7132718239886444, + "grad_norm": 5.362784385681152, + "learning_rate": 9.82875798438609e-05, + "loss": 0.18286285400390626, + "step": 12070 + }, + { + "epoch": 1.7146912704045423, + "grad_norm": 9.535138130187988, + "learning_rate": 9.8286160397445e-05, + "loss": 0.1454553484916687, + "step": 12080 + }, + { + "epoch": 1.71611071682044, + "grad_norm": 5.757817268371582, + "learning_rate": 9.828474095102911e-05, + "loss": 0.14671599864959717, + "step": 12090 + }, + { + "epoch": 1.7175301632363378, + "grad_norm": 5.000237464904785, + "learning_rate": 9.82833215046132e-05, + "loss": 0.21178703308105468, + "step": 12100 + }, + { + "epoch": 1.7189496096522356, + "grad_norm": 5.827192306518555, + "learning_rate": 9.82819020581973e-05, + "loss": 0.21477718353271485, + "step": 12110 + }, + { + "epoch": 1.7203690560681335, + "grad_norm": 3.8673248291015625, + "learning_rate": 9.82804826117814e-05, + "loss": 0.2367461919784546, + "step": 12120 + }, + { + "epoch": 1.7217885024840314, + "grad_norm": 4.519773006439209, + "learning_rate": 9.827906316536551e-05, + "loss": 0.12398046255111694, + "step": 12130 + }, + { + "epoch": 1.723207948899929, + "grad_norm": 7.634313583374023, + "learning_rate": 9.827764371894961e-05, + "loss": 0.12134796380996704, + "step": 12140 + }, + { + "epoch": 1.7246273953158269, + "grad_norm": 7.9592766761779785, + "learning_rate": 9.827622427253371e-05, + "loss": 0.18058866262435913, + "step": 12150 + }, + { + "epoch": 1.7260468417317245, + "grad_norm": 6.438409805297852, + "learning_rate": 9.827480482611782e-05, + "loss": 0.17642263174057007, + "step": 12160 + }, + { + "epoch": 1.7274662881476224, + "grad_norm": 5.818785667419434, + "learning_rate": 9.827338537970192e-05, + "loss": 0.13319342136383056, + "step": 12170 + }, + { + "epoch": 1.7288857345635202, + "grad_norm": 5.536925315856934, + "learning_rate": 9.827196593328603e-05, + "loss": 0.13135639429092408, + "step": 12180 + }, + { + "epoch": 1.730305180979418, + "grad_norm": 5.665536403656006, + "learning_rate": 9.827054648687013e-05, + "loss": 0.12864874601364135, + "step": 12190 + }, + { + "epoch": 1.731724627395316, + "grad_norm": 5.198805809020996, + "learning_rate": 9.826912704045422e-05, + "loss": 0.09919618964195251, + "step": 12200 + }, + { + "epoch": 1.7331440738112136, + "grad_norm": 3.8186886310577393, + "learning_rate": 9.826770759403832e-05, + "loss": 0.15075846910476684, + "step": 12210 + }, + { + "epoch": 1.7345635202271115, + "grad_norm": 4.91066837310791, + "learning_rate": 9.826628814762243e-05, + "loss": 0.1283166766166687, + "step": 12220 + }, + { + "epoch": 1.735982966643009, + "grad_norm": 4.604067802429199, + "learning_rate": 9.826486870120653e-05, + "loss": 0.1516009211540222, + "step": 12230 + }, + { + "epoch": 1.737402413058907, + "grad_norm": 0.4906020164489746, + "learning_rate": 9.826344925479064e-05, + "loss": 0.1481213688850403, + "step": 12240 + }, + { + "epoch": 1.7388218594748048, + "grad_norm": 2.69415283203125, + "learning_rate": 9.826202980837474e-05, + "loss": 0.14420045614242555, + "step": 12250 + }, + { + "epoch": 1.7402413058907027, + "grad_norm": 10.119294166564941, + "learning_rate": 9.826061036195884e-05, + "loss": 0.1346837282180786, + "step": 12260 + }, + { + "epoch": 1.7416607523066006, + "grad_norm": 5.118008613586426, + "learning_rate": 9.825919091554295e-05, + "loss": 0.10409802198410034, + "step": 12270 + }, + { + "epoch": 1.7430801987224982, + "grad_norm": 9.627950668334961, + "learning_rate": 9.825777146912704e-05, + "loss": 0.12958219051361083, + "step": 12280 + }, + { + "epoch": 1.744499645138396, + "grad_norm": 7.486164093017578, + "learning_rate": 9.825635202271115e-05, + "loss": 0.15439097881317138, + "step": 12290 + }, + { + "epoch": 1.7459190915542937, + "grad_norm": 4.496451377868652, + "learning_rate": 9.825493257629524e-05, + "loss": 0.14370408058166503, + "step": 12300 + }, + { + "epoch": 1.7473385379701916, + "grad_norm": 1.7741354703903198, + "learning_rate": 9.825351312987935e-05, + "loss": 0.14793674945831298, + "step": 12310 + }, + { + "epoch": 1.7487579843860894, + "grad_norm": 6.230805397033691, + "learning_rate": 9.825209368346345e-05, + "loss": 0.12588064670562743, + "step": 12320 + }, + { + "epoch": 1.7501774308019873, + "grad_norm": 7.041757106781006, + "learning_rate": 9.825067423704756e-05, + "loss": 0.2671244144439697, + "step": 12330 + }, + { + "epoch": 1.7515968772178852, + "grad_norm": 8.067173957824707, + "learning_rate": 9.824925479063167e-05, + "loss": 0.18581972122192383, + "step": 12340 + }, + { + "epoch": 1.7530163236337828, + "grad_norm": 6.106922626495361, + "learning_rate": 9.824783534421575e-05, + "loss": 0.16915748119354249, + "step": 12350 + }, + { + "epoch": 1.7544357700496807, + "grad_norm": 6.7981743812561035, + "learning_rate": 9.824641589779986e-05, + "loss": 0.12603729963302612, + "step": 12360 + }, + { + "epoch": 1.7558552164655783, + "grad_norm": 5.5388360023498535, + "learning_rate": 9.824499645138396e-05, + "loss": 0.1549227714538574, + "step": 12370 + }, + { + "epoch": 1.7572746628814762, + "grad_norm": 6.960907459259033, + "learning_rate": 9.824357700496807e-05, + "loss": 0.18172571659088135, + "step": 12380 + }, + { + "epoch": 1.758694109297374, + "grad_norm": 4.753782272338867, + "learning_rate": 9.824215755855217e-05, + "loss": 0.14021997451782225, + "step": 12390 + }, + { + "epoch": 1.7601135557132719, + "grad_norm": 3.4172661304473877, + "learning_rate": 9.824073811213628e-05, + "loss": 0.13940014839172363, + "step": 12400 + }, + { + "epoch": 1.7615330021291697, + "grad_norm": 2.0530076026916504, + "learning_rate": 9.823931866572036e-05, + "loss": 0.16023153066635132, + "step": 12410 + }, + { + "epoch": 1.7629524485450674, + "grad_norm": 9.870774269104004, + "learning_rate": 9.823789921930447e-05, + "loss": 0.1769045352935791, + "step": 12420 + }, + { + "epoch": 1.7643718949609652, + "grad_norm": 2.381181001663208, + "learning_rate": 9.823647977288859e-05, + "loss": 0.10290155410766602, + "step": 12430 + }, + { + "epoch": 1.7657913413768629, + "grad_norm": 0.6588567495346069, + "learning_rate": 9.823506032647268e-05, + "loss": 0.07668265104293823, + "step": 12440 + }, + { + "epoch": 1.7672107877927608, + "grad_norm": 8.259925842285156, + "learning_rate": 9.82336408800568e-05, + "loss": 0.10816916227340698, + "step": 12450 + }, + { + "epoch": 1.7686302342086586, + "grad_norm": 10.110259056091309, + "learning_rate": 9.823222143364088e-05, + "loss": 0.1543756604194641, + "step": 12460 + }, + { + "epoch": 1.7700496806245565, + "grad_norm": 1.5917772054672241, + "learning_rate": 9.823080198722499e-05, + "loss": 0.1755792737007141, + "step": 12470 + }, + { + "epoch": 1.7714691270404543, + "grad_norm": 4.567733287811279, + "learning_rate": 9.822938254080909e-05, + "loss": 0.09556171298027039, + "step": 12480 + }, + { + "epoch": 1.772888573456352, + "grad_norm": 4.524011611938477, + "learning_rate": 9.82279630943932e-05, + "loss": 0.11977797746658325, + "step": 12490 + }, + { + "epoch": 1.7743080198722498, + "grad_norm": 3.390681266784668, + "learning_rate": 9.82265436479773e-05, + "loss": 0.20999493598937988, + "step": 12500 + }, + { + "epoch": 1.7743080198722498, + "eval_accuracy": 0.9404845170725504, + "eval_loss": 0.17900405824184418, + "eval_runtime": 33.0963, + "eval_samples_per_second": 475.19, + "eval_steps_per_second": 14.866, + "step": 12500 + }, + { + "epoch": 1.7757274662881475, + "grad_norm": 6.486291885375977, + "learning_rate": 9.822512420156139e-05, + "loss": 0.15412837266921997, + "step": 12510 + }, + { + "epoch": 1.7771469127040453, + "grad_norm": 8.4727201461792, + "learning_rate": 9.82237047551455e-05, + "loss": 0.1553104877471924, + "step": 12520 + }, + { + "epoch": 1.7785663591199432, + "grad_norm": 7.080015182495117, + "learning_rate": 9.82222853087296e-05, + "loss": 0.17961130142211915, + "step": 12530 + }, + { + "epoch": 1.779985805535841, + "grad_norm": 3.5858380794525146, + "learning_rate": 9.822086586231371e-05, + "loss": 0.16834441423416138, + "step": 12540 + }, + { + "epoch": 1.781405251951739, + "grad_norm": 1.947180986404419, + "learning_rate": 9.821944641589781e-05, + "loss": 0.16140348911285402, + "step": 12550 + }, + { + "epoch": 1.7828246983676366, + "grad_norm": 4.678013801574707, + "learning_rate": 9.82180269694819e-05, + "loss": 0.17220114469528197, + "step": 12560 + }, + { + "epoch": 1.7842441447835344, + "grad_norm": 1.8858182430267334, + "learning_rate": 9.8216607523066e-05, + "loss": 0.11123390197753906, + "step": 12570 + }, + { + "epoch": 1.785663591199432, + "grad_norm": 8.490455627441406, + "learning_rate": 9.821518807665011e-05, + "loss": 0.21482553482055664, + "step": 12580 + }, + { + "epoch": 1.78708303761533, + "grad_norm": 6.9470415115356445, + "learning_rate": 9.821376863023421e-05, + "loss": 0.22754549980163574, + "step": 12590 + }, + { + "epoch": 1.7885024840312278, + "grad_norm": 7.122620105743408, + "learning_rate": 9.821234918381832e-05, + "loss": 0.2618594169616699, + "step": 12600 + }, + { + "epoch": 1.7899219304471257, + "grad_norm": 4.771125316619873, + "learning_rate": 9.821092973740242e-05, + "loss": 0.1289076805114746, + "step": 12610 + }, + { + "epoch": 1.7913413768630235, + "grad_norm": 1.8268935680389404, + "learning_rate": 9.820951029098652e-05, + "loss": 0.18204834461212158, + "step": 12620 + }, + { + "epoch": 1.7927608232789212, + "grad_norm": 5.549787521362305, + "learning_rate": 9.820809084457063e-05, + "loss": 0.14632033109664916, + "step": 12630 + }, + { + "epoch": 1.794180269694819, + "grad_norm": 4.965446949005127, + "learning_rate": 9.820667139815473e-05, + "loss": 0.14237403869628906, + "step": 12640 + }, + { + "epoch": 1.7955997161107167, + "grad_norm": 3.6704654693603516, + "learning_rate": 9.820525195173884e-05, + "loss": 0.14324573278427125, + "step": 12650 + }, + { + "epoch": 1.7970191625266145, + "grad_norm": 2.443148612976074, + "learning_rate": 9.820383250532292e-05, + "loss": 0.1546507477760315, + "step": 12660 + }, + { + "epoch": 1.7984386089425124, + "grad_norm": 8.586228370666504, + "learning_rate": 9.820241305890703e-05, + "loss": 0.17691378593444823, + "step": 12670 + }, + { + "epoch": 1.7998580553584103, + "grad_norm": 3.938798666000366, + "learning_rate": 9.820099361249113e-05, + "loss": 0.11685086488723755, + "step": 12680 + }, + { + "epoch": 1.8012775017743081, + "grad_norm": 10.324106216430664, + "learning_rate": 9.819957416607524e-05, + "loss": 0.1108386754989624, + "step": 12690 + }, + { + "epoch": 1.802696948190206, + "grad_norm": 5.7965087890625, + "learning_rate": 9.819815471965934e-05, + "loss": 0.173872172832489, + "step": 12700 + }, + { + "epoch": 1.8041163946061036, + "grad_norm": 6.263943195343018, + "learning_rate": 9.819673527324343e-05, + "loss": 0.12461161613464355, + "step": 12710 + }, + { + "epoch": 1.8055358410220013, + "grad_norm": 3.52416729927063, + "learning_rate": 9.819531582682754e-05, + "loss": 0.1361951231956482, + "step": 12720 + }, + { + "epoch": 1.8069552874378991, + "grad_norm": 3.2541964054107666, + "learning_rate": 9.819389638041164e-05, + "loss": 0.13711843490600586, + "step": 12730 + }, + { + "epoch": 1.808374733853797, + "grad_norm": 2.708355188369751, + "learning_rate": 9.819247693399575e-05, + "loss": 0.16509486436843873, + "step": 12740 + }, + { + "epoch": 1.8097941802696949, + "grad_norm": 8.279736518859863, + "learning_rate": 9.819105748757985e-05, + "loss": 0.15762121677398683, + "step": 12750 + }, + { + "epoch": 1.8112136266855927, + "grad_norm": 4.580092906951904, + "learning_rate": 9.818963804116396e-05, + "loss": 0.1657193422317505, + "step": 12760 + }, + { + "epoch": 1.8126330731014906, + "grad_norm": 6.182056903839111, + "learning_rate": 9.818821859474805e-05, + "loss": 0.09075002670288086, + "step": 12770 + }, + { + "epoch": 1.8140525195173882, + "grad_norm": 2.8882968425750732, + "learning_rate": 9.818679914833216e-05, + "loss": 0.11564161777496337, + "step": 12780 + }, + { + "epoch": 1.8154719659332859, + "grad_norm": 1.9291869401931763, + "learning_rate": 9.818537970191625e-05, + "loss": 0.1788640022277832, + "step": 12790 + }, + { + "epoch": 1.8168914123491837, + "grad_norm": 1.8585617542266846, + "learning_rate": 9.818396025550036e-05, + "loss": 0.14054034948348998, + "step": 12800 + }, + { + "epoch": 1.8183108587650816, + "grad_norm": 3.6257970333099365, + "learning_rate": 9.818254080908446e-05, + "loss": 0.1330336332321167, + "step": 12810 + }, + { + "epoch": 1.8197303051809794, + "grad_norm": 6.263546943664551, + "learning_rate": 9.818112136266856e-05, + "loss": 0.1774816632270813, + "step": 12820 + }, + { + "epoch": 1.8211497515968773, + "grad_norm": 10.41680908203125, + "learning_rate": 9.817970191625267e-05, + "loss": 0.1763577938079834, + "step": 12830 + }, + { + "epoch": 1.8225691980127752, + "grad_norm": 9.07449722290039, + "learning_rate": 9.817828246983677e-05, + "loss": 0.1599531054496765, + "step": 12840 + }, + { + "epoch": 1.8239886444286728, + "grad_norm": 7.387566089630127, + "learning_rate": 9.817686302342088e-05, + "loss": 0.14263440370559693, + "step": 12850 + }, + { + "epoch": 1.8254080908445705, + "grad_norm": 5.237459659576416, + "learning_rate": 9.817544357700498e-05, + "loss": 0.22102766036987304, + "step": 12860 + }, + { + "epoch": 1.8268275372604683, + "grad_norm": 2.364966630935669, + "learning_rate": 9.817402413058907e-05, + "loss": 0.10828995704650879, + "step": 12870 + }, + { + "epoch": 1.8282469836763662, + "grad_norm": 4.197632789611816, + "learning_rate": 9.817260468417317e-05, + "loss": 0.11210172176361084, + "step": 12880 + }, + { + "epoch": 1.829666430092264, + "grad_norm": 9.747461318969727, + "learning_rate": 9.817118523775728e-05, + "loss": 0.20235188007354737, + "step": 12890 + }, + { + "epoch": 1.831085876508162, + "grad_norm": 1.4320733547210693, + "learning_rate": 9.816976579134138e-05, + "loss": 0.11145485639572143, + "step": 12900 + }, + { + "epoch": 1.8325053229240598, + "grad_norm": 4.429521560668945, + "learning_rate": 9.816834634492549e-05, + "loss": 0.10955873727798462, + "step": 12910 + }, + { + "epoch": 1.8339247693399574, + "grad_norm": 6.954484462738037, + "learning_rate": 9.816692689850959e-05, + "loss": 0.15254650115966797, + "step": 12920 + }, + { + "epoch": 1.8353442157558553, + "grad_norm": 5.583377361297607, + "learning_rate": 9.816550745209368e-05, + "loss": 0.17690763473510743, + "step": 12930 + }, + { + "epoch": 1.836763662171753, + "grad_norm": 5.169642925262451, + "learning_rate": 9.81640880056778e-05, + "loss": 0.1680360794067383, + "step": 12940 + }, + { + "epoch": 1.8381831085876508, + "grad_norm": 10.711297988891602, + "learning_rate": 9.816266855926189e-05, + "loss": 0.20626237392425537, + "step": 12950 + }, + { + "epoch": 1.8396025550035486, + "grad_norm": 6.396773338317871, + "learning_rate": 9.8161249112846e-05, + "loss": 0.12390644550323486, + "step": 12960 + }, + { + "epoch": 1.8410220014194465, + "grad_norm": 6.008213996887207, + "learning_rate": 9.815982966643009e-05, + "loss": 0.16526665687561035, + "step": 12970 + }, + { + "epoch": 1.8424414478353444, + "grad_norm": 2.8224973678588867, + "learning_rate": 9.81584102200142e-05, + "loss": 0.15004030466079712, + "step": 12980 + }, + { + "epoch": 1.843860894251242, + "grad_norm": 3.8376224040985107, + "learning_rate": 9.81569907735983e-05, + "loss": 0.12394638061523437, + "step": 12990 + }, + { + "epoch": 1.8452803406671399, + "grad_norm": 4.487581253051758, + "learning_rate": 9.81555713271824e-05, + "loss": 0.12469573020935058, + "step": 13000 + }, + { + "epoch": 1.8452803406671399, + "eval_accuracy": 0.9322184777770712, + "eval_loss": 0.20526456832885742, + "eval_runtime": 32.1483, + "eval_samples_per_second": 489.202, + "eval_steps_per_second": 15.304, + "step": 13000 + }, + { + "epoch": 1.8466997870830375, + "grad_norm": 7.591648101806641, + "learning_rate": 9.81541518807665e-05, + "loss": 0.1824552297592163, + "step": 13010 + }, + { + "epoch": 1.8481192334989354, + "grad_norm": 2.9393680095672607, + "learning_rate": 9.81527324343506e-05, + "loss": 0.18779258728027343, + "step": 13020 + }, + { + "epoch": 1.8495386799148332, + "grad_norm": 4.982316493988037, + "learning_rate": 9.815131298793471e-05, + "loss": 0.1856153726577759, + "step": 13030 + }, + { + "epoch": 1.850958126330731, + "grad_norm": 4.3030242919921875, + "learning_rate": 9.814989354151881e-05, + "loss": 0.13149327039718628, + "step": 13040 + }, + { + "epoch": 1.852377572746629, + "grad_norm": 3.1720340251922607, + "learning_rate": 9.814847409510292e-05, + "loss": 0.17401224374771118, + "step": 13050 + }, + { + "epoch": 1.8537970191625266, + "grad_norm": 5.330498218536377, + "learning_rate": 9.814705464868702e-05, + "loss": 0.18381781578063966, + "step": 13060 + }, + { + "epoch": 1.8552164655784245, + "grad_norm": 3.171062469482422, + "learning_rate": 9.814563520227113e-05, + "loss": 0.09782277941703796, + "step": 13070 + }, + { + "epoch": 1.856635911994322, + "grad_norm": 3.653743267059326, + "learning_rate": 9.814421575585521e-05, + "loss": 0.14549950361251832, + "step": 13080 + }, + { + "epoch": 1.85805535841022, + "grad_norm": 2.782893180847168, + "learning_rate": 9.814279630943932e-05, + "loss": 0.1609262704849243, + "step": 13090 + }, + { + "epoch": 1.8594748048261178, + "grad_norm": 7.247891426086426, + "learning_rate": 9.814137686302342e-05, + "loss": 0.14557520151138306, + "step": 13100 + }, + { + "epoch": 1.8608942512420157, + "grad_norm": 4.025136947631836, + "learning_rate": 9.813995741660753e-05, + "loss": 0.06900943517684936, + "step": 13110 + }, + { + "epoch": 1.8623136976579135, + "grad_norm": 2.248847007751465, + "learning_rate": 9.813853797019163e-05, + "loss": 0.12486515045166016, + "step": 13120 + }, + { + "epoch": 1.8637331440738112, + "grad_norm": 9.784401893615723, + "learning_rate": 9.813711852377573e-05, + "loss": 0.12270998954772949, + "step": 13130 + }, + { + "epoch": 1.865152590489709, + "grad_norm": 4.735940456390381, + "learning_rate": 9.813569907735984e-05, + "loss": 0.2059864282608032, + "step": 13140 + }, + { + "epoch": 1.8665720369056067, + "grad_norm": 5.477226257324219, + "learning_rate": 9.813427963094394e-05, + "loss": 0.10135586261749267, + "step": 13150 + }, + { + "epoch": 1.8679914833215046, + "grad_norm": 5.485146522521973, + "learning_rate": 9.813286018452805e-05, + "loss": 0.18213980197906493, + "step": 13160 + }, + { + "epoch": 1.8694109297374024, + "grad_norm": 4.844747543334961, + "learning_rate": 9.813144073811214e-05, + "loss": 0.10833338499069214, + "step": 13170 + }, + { + "epoch": 1.8708303761533003, + "grad_norm": 12.112831115722656, + "learning_rate": 9.813002129169624e-05, + "loss": 0.1866260290145874, + "step": 13180 + }, + { + "epoch": 1.8722498225691981, + "grad_norm": 1.797105073928833, + "learning_rate": 9.812860184528034e-05, + "loss": 0.1560835361480713, + "step": 13190 + }, + { + "epoch": 1.8736692689850958, + "grad_norm": 8.335697174072266, + "learning_rate": 9.812718239886445e-05, + "loss": 0.11914796829223633, + "step": 13200 + }, + { + "epoch": 1.8750887154009936, + "grad_norm": 4.479477405548096, + "learning_rate": 9.812576295244855e-05, + "loss": 0.18317773342132568, + "step": 13210 + }, + { + "epoch": 1.8765081618168913, + "grad_norm": 1.5853248834609985, + "learning_rate": 9.812434350603266e-05, + "loss": 0.09048664569854736, + "step": 13220 + }, + { + "epoch": 1.8779276082327891, + "grad_norm": 4.840945243835449, + "learning_rate": 9.812292405961675e-05, + "loss": 0.13578274250030517, + "step": 13230 + }, + { + "epoch": 1.879347054648687, + "grad_norm": 11.123950958251953, + "learning_rate": 9.812150461320085e-05, + "loss": 0.17634526491165162, + "step": 13240 + }, + { + "epoch": 1.8807665010645849, + "grad_norm": 4.322571754455566, + "learning_rate": 9.812008516678496e-05, + "loss": 0.10883429050445556, + "step": 13250 + }, + { + "epoch": 1.8821859474804827, + "grad_norm": 4.164629936218262, + "learning_rate": 9.811866572036906e-05, + "loss": 0.15946507453918457, + "step": 13260 + }, + { + "epoch": 1.8836053938963804, + "grad_norm": 4.701801300048828, + "learning_rate": 9.811724627395317e-05, + "loss": 0.15585731267929076, + "step": 13270 + }, + { + "epoch": 1.8850248403122782, + "grad_norm": 6.6244916915893555, + "learning_rate": 9.811582682753726e-05, + "loss": 0.1586725354194641, + "step": 13280 + }, + { + "epoch": 1.8864442867281759, + "grad_norm": 5.30622673034668, + "learning_rate": 9.811440738112137e-05, + "loss": 0.16929301023483276, + "step": 13290 + }, + { + "epoch": 1.8878637331440737, + "grad_norm": 7.866292476654053, + "learning_rate": 9.811298793470546e-05, + "loss": 0.1626114845275879, + "step": 13300 + }, + { + "epoch": 1.8892831795599716, + "grad_norm": 3.1928579807281494, + "learning_rate": 9.811156848828957e-05, + "loss": 0.11974685192108155, + "step": 13310 + }, + { + "epoch": 1.8907026259758695, + "grad_norm": 3.165278196334839, + "learning_rate": 9.811014904187367e-05, + "loss": 0.17966209650039672, + "step": 13320 + }, + { + "epoch": 1.8921220723917673, + "grad_norm": 7.965559959411621, + "learning_rate": 9.810872959545777e-05, + "loss": 0.1445131778717041, + "step": 13330 + }, + { + "epoch": 1.893541518807665, + "grad_norm": 7.0571722984313965, + "learning_rate": 9.810745209368347e-05, + "loss": 0.1508271336555481, + "step": 13340 + }, + { + "epoch": 1.8949609652235628, + "grad_norm": 6.5066351890563965, + "learning_rate": 9.810603264726757e-05, + "loss": 0.2136392116546631, + "step": 13350 + }, + { + "epoch": 1.8963804116394605, + "grad_norm": 5.8861517906188965, + "learning_rate": 9.810461320085168e-05, + "loss": 0.11962813138961792, + "step": 13360 + }, + { + "epoch": 1.8977998580553583, + "grad_norm": 12.299768447875977, + "learning_rate": 9.810319375443577e-05, + "loss": 0.14256292581558228, + "step": 13370 + }, + { + "epoch": 1.8992193044712562, + "grad_norm": 10.79692554473877, + "learning_rate": 9.810177430801988e-05, + "loss": 0.16675705909729005, + "step": 13380 + }, + { + "epoch": 1.900638750887154, + "grad_norm": 4.968460559844971, + "learning_rate": 9.810035486160398e-05, + "loss": 0.18271161317825318, + "step": 13390 + }, + { + "epoch": 1.902058197303052, + "grad_norm": 6.083104133605957, + "learning_rate": 9.809893541518809e-05, + "loss": 0.19613151550292968, + "step": 13400 + }, + { + "epoch": 1.9034776437189496, + "grad_norm": 7.929781913757324, + "learning_rate": 9.809751596877218e-05, + "loss": 0.12828643321990968, + "step": 13410 + }, + { + "epoch": 1.9048970901348474, + "grad_norm": 10.386966705322266, + "learning_rate": 9.809609652235629e-05, + "loss": 0.1059008240699768, + "step": 13420 + }, + { + "epoch": 1.906316536550745, + "grad_norm": 9.958741188049316, + "learning_rate": 9.809467707594038e-05, + "loss": 0.17238779067993165, + "step": 13430 + }, + { + "epoch": 1.907735982966643, + "grad_norm": 7.629611492156982, + "learning_rate": 9.80932576295245e-05, + "loss": 0.11009730100631714, + "step": 13440 + }, + { + "epoch": 1.9091554293825408, + "grad_norm": 4.110402584075928, + "learning_rate": 9.809183818310859e-05, + "loss": 0.15767955780029297, + "step": 13450 + }, + { + "epoch": 1.9105748757984387, + "grad_norm": 5.907031059265137, + "learning_rate": 9.809041873669269e-05, + "loss": 0.11883927583694458, + "step": 13460 + }, + { + "epoch": 1.9119943222143365, + "grad_norm": 6.367669105529785, + "learning_rate": 9.80889992902768e-05, + "loss": 0.15383024215698243, + "step": 13470 + }, + { + "epoch": 1.9134137686302342, + "grad_norm": 11.253113746643066, + "learning_rate": 9.80875798438609e-05, + "loss": 0.18761264085769652, + "step": 13480 + }, + { + "epoch": 1.914833215046132, + "grad_norm": 8.148927688598633, + "learning_rate": 9.808616039744501e-05, + "loss": 0.1913072109222412, + "step": 13490 + }, + { + "epoch": 1.9162526614620297, + "grad_norm": 5.086034774780273, + "learning_rate": 9.808474095102911e-05, + "loss": 0.1331562876701355, + "step": 13500 + }, + { + "epoch": 1.9162526614620297, + "eval_accuracy": 0.9270680994468112, + "eval_loss": 0.20431001484394073, + "eval_runtime": 33.1047, + "eval_samples_per_second": 475.068, + "eval_steps_per_second": 14.862, + "step": 13500 + }, + { + "epoch": 1.9176721078779275, + "grad_norm": 8.143988609313965, + "learning_rate": 9.80833215046132e-05, + "loss": 0.16751954555511475, + "step": 13510 + }, + { + "epoch": 1.9190915542938254, + "grad_norm": 8.666000366210938, + "learning_rate": 9.80819020581973e-05, + "loss": 0.10578331947326661, + "step": 13520 + }, + { + "epoch": 1.9205110007097232, + "grad_norm": 2.205212116241455, + "learning_rate": 9.808048261178141e-05, + "loss": 0.16295469999313356, + "step": 13530 + }, + { + "epoch": 1.921930447125621, + "grad_norm": 3.5031938552856445, + "learning_rate": 9.807906316536551e-05, + "loss": 0.19274975061416627, + "step": 13540 + }, + { + "epoch": 1.923349893541519, + "grad_norm": 6.0588884353637695, + "learning_rate": 9.807764371894962e-05, + "loss": 0.1572549819946289, + "step": 13550 + }, + { + "epoch": 1.9247693399574166, + "grad_norm": 5.022733688354492, + "learning_rate": 9.807622427253372e-05, + "loss": 0.1502652645111084, + "step": 13560 + }, + { + "epoch": 1.9261887863733143, + "grad_norm": 6.909353733062744, + "learning_rate": 9.807480482611782e-05, + "loss": 0.19446460008621216, + "step": 13570 + }, + { + "epoch": 1.9276082327892121, + "grad_norm": 4.539268970489502, + "learning_rate": 9.807338537970193e-05, + "loss": 0.1496061086654663, + "step": 13580 + }, + { + "epoch": 1.92902767920511, + "grad_norm": 5.273926258087158, + "learning_rate": 9.807196593328602e-05, + "loss": 0.1780215859413147, + "step": 13590 + }, + { + "epoch": 1.9304471256210078, + "grad_norm": 4.610520362854004, + "learning_rate": 9.807054648687014e-05, + "loss": 0.12462868690490722, + "step": 13600 + }, + { + "epoch": 1.9318665720369057, + "grad_norm": 7.675487041473389, + "learning_rate": 9.806912704045422e-05, + "loss": 0.17334070205688476, + "step": 13610 + }, + { + "epoch": 1.9332860184528036, + "grad_norm": 7.004896640777588, + "learning_rate": 9.806770759403833e-05, + "loss": 0.15332577228546143, + "step": 13620 + }, + { + "epoch": 1.9347054648687012, + "grad_norm": 2.8662800788879395, + "learning_rate": 9.806628814762243e-05, + "loss": 0.12613468170166015, + "step": 13630 + }, + { + "epoch": 1.9361249112845988, + "grad_norm": 3.3417696952819824, + "learning_rate": 9.806486870120654e-05, + "loss": 0.11488528251647949, + "step": 13640 + }, + { + "epoch": 1.9375443577004967, + "grad_norm": 8.002215385437012, + "learning_rate": 9.806344925479064e-05, + "loss": 0.12292193174362183, + "step": 13650 + }, + { + "epoch": 1.9389638041163946, + "grad_norm": 3.650278091430664, + "learning_rate": 9.806202980837473e-05, + "loss": 0.15752785205841063, + "step": 13660 + }, + { + "epoch": 1.9403832505322924, + "grad_norm": 3.4982657432556152, + "learning_rate": 9.806061036195884e-05, + "loss": 0.13047711849212645, + "step": 13670 + }, + { + "epoch": 1.9418026969481903, + "grad_norm": 7.711712837219238, + "learning_rate": 9.805919091554294e-05, + "loss": 0.144749915599823, + "step": 13680 + }, + { + "epoch": 1.9432221433640882, + "grad_norm": 5.939789772033691, + "learning_rate": 9.805777146912705e-05, + "loss": 0.13807902336120606, + "step": 13690 + }, + { + "epoch": 1.9446415897799858, + "grad_norm": 3.993557929992676, + "learning_rate": 9.805635202271115e-05, + "loss": 0.1018330454826355, + "step": 13700 + }, + { + "epoch": 1.9460610361958834, + "grad_norm": 6.909927845001221, + "learning_rate": 9.805493257629525e-05, + "loss": 0.1758143424987793, + "step": 13710 + }, + { + "epoch": 1.9474804826117813, + "grad_norm": 4.5612993240356445, + "learning_rate": 9.805351312987934e-05, + "loss": 0.13746780157089233, + "step": 13720 + }, + { + "epoch": 1.9488999290276792, + "grad_norm": 0.8813110589981079, + "learning_rate": 9.805209368346346e-05, + "loss": 0.13282377719879152, + "step": 13730 + }, + { + "epoch": 1.950319375443577, + "grad_norm": 4.4625630378723145, + "learning_rate": 9.805067423704755e-05, + "loss": 0.19286319017410278, + "step": 13740 + }, + { + "epoch": 1.951738821859475, + "grad_norm": 6.587796688079834, + "learning_rate": 9.804925479063166e-05, + "loss": 0.1381397008895874, + "step": 13750 + }, + { + "epoch": 1.9531582682753728, + "grad_norm": 7.006091594696045, + "learning_rate": 9.804783534421576e-05, + "loss": 0.10776946544647217, + "step": 13760 + }, + { + "epoch": 1.9545777146912704, + "grad_norm": 6.6057257652282715, + "learning_rate": 9.804641589779986e-05, + "loss": 0.1551327109336853, + "step": 13770 + }, + { + "epoch": 1.9559971611071683, + "grad_norm": 2.855726480484009, + "learning_rate": 9.804499645138397e-05, + "loss": 0.14515860080718995, + "step": 13780 + }, + { + "epoch": 1.957416607523066, + "grad_norm": 4.859558582305908, + "learning_rate": 9.804357700496807e-05, + "loss": 0.13317285776138305, + "step": 13790 + }, + { + "epoch": 1.9588360539389638, + "grad_norm": 4.010891437530518, + "learning_rate": 9.804215755855218e-05, + "loss": 0.21571955680847169, + "step": 13800 + }, + { + "epoch": 1.9602555003548616, + "grad_norm": 1.5958309173583984, + "learning_rate": 9.804073811213627e-05, + "loss": 0.11179524660110474, + "step": 13810 + }, + { + "epoch": 1.9616749467707595, + "grad_norm": 4.728942394256592, + "learning_rate": 9.803931866572037e-05, + "loss": 0.12224637269973755, + "step": 13820 + }, + { + "epoch": 1.9630943931866573, + "grad_norm": 5.639578342437744, + "learning_rate": 9.803789921930447e-05, + "loss": 0.10692014694213867, + "step": 13830 + }, + { + "epoch": 1.964513839602555, + "grad_norm": 3.7262027263641357, + "learning_rate": 9.803647977288858e-05, + "loss": 0.1023218035697937, + "step": 13840 + }, + { + "epoch": 1.9659332860184529, + "grad_norm": 6.50256872177124, + "learning_rate": 9.803506032647268e-05, + "loss": 0.12723206281661986, + "step": 13850 + }, + { + "epoch": 1.9673527324343505, + "grad_norm": 2.4793450832366943, + "learning_rate": 9.803364088005679e-05, + "loss": 0.18150064945220948, + "step": 13860 + }, + { + "epoch": 1.9687721788502484, + "grad_norm": 8.015069961547852, + "learning_rate": 9.803222143364089e-05, + "loss": 0.13160840272903443, + "step": 13870 + }, + { + "epoch": 1.9701916252661462, + "grad_norm": 2.3164284229278564, + "learning_rate": 9.803080198722498e-05, + "loss": 0.13569587469100952, + "step": 13880 + }, + { + "epoch": 1.971611071682044, + "grad_norm": 5.398233413696289, + "learning_rate": 9.80293825408091e-05, + "loss": 0.10830456018447876, + "step": 13890 + }, + { + "epoch": 1.973030518097942, + "grad_norm": 4.58472204208374, + "learning_rate": 9.802796309439319e-05, + "loss": 0.12152203321456909, + "step": 13900 + }, + { + "epoch": 1.9744499645138396, + "grad_norm": 3.399158239364624, + "learning_rate": 9.80265436479773e-05, + "loss": 0.09602898955345154, + "step": 13910 + }, + { + "epoch": 1.9758694109297374, + "grad_norm": 5.37898063659668, + "learning_rate": 9.802512420156139e-05, + "loss": 0.16220704317092896, + "step": 13920 + }, + { + "epoch": 1.977288857345635, + "grad_norm": 8.282011985778809, + "learning_rate": 9.80237047551455e-05, + "loss": 0.1817216157913208, + "step": 13930 + }, + { + "epoch": 1.978708303761533, + "grad_norm": 8.454946517944336, + "learning_rate": 9.80222853087296e-05, + "loss": 0.10207384824752808, + "step": 13940 + }, + { + "epoch": 1.9801277501774308, + "grad_norm": 5.604420185089111, + "learning_rate": 9.80208658623137e-05, + "loss": 0.13896651268005372, + "step": 13950 + }, + { + "epoch": 1.9815471965933287, + "grad_norm": 5.782528400421143, + "learning_rate": 9.80194464158978e-05, + "loss": 0.16523996591567994, + "step": 13960 + }, + { + "epoch": 1.9829666430092265, + "grad_norm": 7.257541656494141, + "learning_rate": 9.80180269694819e-05, + "loss": 0.1670131802558899, + "step": 13970 + }, + { + "epoch": 1.9843860894251242, + "grad_norm": 1.4823135137557983, + "learning_rate": 9.801660752306601e-05, + "loss": 0.09150451421737671, + "step": 13980 + }, + { + "epoch": 1.985805535841022, + "grad_norm": 11.689827919006348, + "learning_rate": 9.801518807665011e-05, + "loss": 0.12286759614944458, + "step": 13990 + }, + { + "epoch": 1.9872249822569197, + "grad_norm": 2.379868268966675, + "learning_rate": 9.801376863023422e-05, + "loss": 0.08730307221412659, + "step": 14000 + }, + { + "epoch": 1.9872249822569197, + "eval_accuracy": 0.9489413111210021, + "eval_loss": 0.14637306332588196, + "eval_runtime": 33.0818, + "eval_samples_per_second": 475.397, + "eval_steps_per_second": 14.872, + "step": 14000 + }, + { + "epoch": 1.9886444286728175, + "grad_norm": 3.562831163406372, + "learning_rate": 9.801234918381832e-05, + "loss": 0.10573784112930298, + "step": 14010 + }, + { + "epoch": 1.9900638750887154, + "grad_norm": 1.7032339572906494, + "learning_rate": 9.801092973740241e-05, + "loss": 0.1144748330116272, + "step": 14020 + }, + { + "epoch": 1.9914833215046133, + "grad_norm": 9.984017372131348, + "learning_rate": 9.800951029098651e-05, + "loss": 0.2368067979812622, + "step": 14030 + }, + { + "epoch": 1.9929027679205111, + "grad_norm": 4.510107517242432, + "learning_rate": 9.800809084457062e-05, + "loss": 0.11444370746612549, + "step": 14040 + }, + { + "epoch": 1.9943222143364088, + "grad_norm": 2.9397714138031006, + "learning_rate": 9.800667139815472e-05, + "loss": 0.08882022500038148, + "step": 14050 + }, + { + "epoch": 1.9957416607523066, + "grad_norm": 5.492639064788818, + "learning_rate": 9.800525195173883e-05, + "loss": 0.13332669734954833, + "step": 14060 + }, + { + "epoch": 1.9971611071682043, + "grad_norm": 6.94230318069458, + "learning_rate": 9.800383250532293e-05, + "loss": 0.1107181191444397, + "step": 14070 + }, + { + "epoch": 1.9985805535841021, + "grad_norm": 1.4583178758621216, + "learning_rate": 9.800241305890703e-05, + "loss": 0.1853145956993103, + "step": 14080 + }, + { + "epoch": 2.0, + "grad_norm": 3.6740102767944336, + "learning_rate": 9.800099361249114e-05, + "loss": 0.1035921812057495, + "step": 14090 + }, + { + "epoch": 2.001419446415898, + "grad_norm": 7.763698101043701, + "learning_rate": 9.799957416607523e-05, + "loss": 0.11998735666275025, + "step": 14100 + }, + { + "epoch": 2.0028388928317957, + "grad_norm": 9.761672019958496, + "learning_rate": 9.799815471965935e-05, + "loss": 0.130437171459198, + "step": 14110 + }, + { + "epoch": 2.0042583392476936, + "grad_norm": 6.725173473358154, + "learning_rate": 9.799673527324344e-05, + "loss": 0.1438794732093811, + "step": 14120 + }, + { + "epoch": 2.005677785663591, + "grad_norm": 2.627002477645874, + "learning_rate": 9.799531582682754e-05, + "loss": 0.13035544157028198, + "step": 14130 + }, + { + "epoch": 2.007097232079489, + "grad_norm": 1.8587443828582764, + "learning_rate": 9.799389638041164e-05, + "loss": 0.10760440826416015, + "step": 14140 + }, + { + "epoch": 2.0085166784953867, + "grad_norm": 5.432860851287842, + "learning_rate": 9.799247693399575e-05, + "loss": 0.08797118067741394, + "step": 14150 + }, + { + "epoch": 2.0099361249112846, + "grad_norm": 8.000253677368164, + "learning_rate": 9.799105748757985e-05, + "loss": 0.13834741115570068, + "step": 14160 + }, + { + "epoch": 2.0113555713271825, + "grad_norm": 4.846225738525391, + "learning_rate": 9.798963804116396e-05, + "loss": 0.1457647442817688, + "step": 14170 + }, + { + "epoch": 2.0127750177430803, + "grad_norm": 11.00196361541748, + "learning_rate": 9.798821859474805e-05, + "loss": 0.1213072657585144, + "step": 14180 + }, + { + "epoch": 2.014194464158978, + "grad_norm": 10.398648262023926, + "learning_rate": 9.798679914833215e-05, + "loss": 0.13774160146713257, + "step": 14190 + }, + { + "epoch": 2.0156139105748756, + "grad_norm": 2.693225145339966, + "learning_rate": 9.798537970191626e-05, + "loss": 0.1436489462852478, + "step": 14200 + }, + { + "epoch": 2.0170333569907735, + "grad_norm": 2.0098676681518555, + "learning_rate": 9.798396025550036e-05, + "loss": 0.11806844472885132, + "step": 14210 + }, + { + "epoch": 2.0184528034066713, + "grad_norm": 3.5687620639801025, + "learning_rate": 9.798254080908447e-05, + "loss": 0.11548566818237305, + "step": 14220 + }, + { + "epoch": 2.019872249822569, + "grad_norm": 4.691004276275635, + "learning_rate": 9.798112136266855e-05, + "loss": 0.11631312370300292, + "step": 14230 + }, + { + "epoch": 2.021291696238467, + "grad_norm": 5.144685745239258, + "learning_rate": 9.797970191625267e-05, + "loss": 0.08912101984024048, + "step": 14240 + }, + { + "epoch": 2.022711142654365, + "grad_norm": 10.743430137634277, + "learning_rate": 9.797828246983676e-05, + "loss": 0.10923216342926026, + "step": 14250 + }, + { + "epoch": 2.0241305890702628, + "grad_norm": 1.788232445716858, + "learning_rate": 9.797686302342087e-05, + "loss": 0.10165914297103881, + "step": 14260 + }, + { + "epoch": 2.02555003548616, + "grad_norm": 1.6243984699249268, + "learning_rate": 9.797544357700497e-05, + "loss": 0.07863327860832214, + "step": 14270 + }, + { + "epoch": 2.026969481902058, + "grad_norm": 4.447552680969238, + "learning_rate": 9.797402413058907e-05, + "loss": 0.09306793808937072, + "step": 14280 + }, + { + "epoch": 2.028388928317956, + "grad_norm": 6.648647308349609, + "learning_rate": 9.797260468417318e-05, + "loss": 0.13603001832962036, + "step": 14290 + }, + { + "epoch": 2.029808374733854, + "grad_norm": 6.4532952308654785, + "learning_rate": 9.797118523775728e-05, + "loss": 0.13374946117401124, + "step": 14300 + }, + { + "epoch": 2.0312278211497516, + "grad_norm": 3.549644708633423, + "learning_rate": 9.796976579134139e-05, + "loss": 0.11156256198883056, + "step": 14310 + }, + { + "epoch": 2.0326472675656495, + "grad_norm": 5.188971042633057, + "learning_rate": 9.796834634492548e-05, + "loss": 0.1163739800453186, + "step": 14320 + }, + { + "epoch": 2.0340667139815474, + "grad_norm": 2.5170130729675293, + "learning_rate": 9.796692689850958e-05, + "loss": 0.17147536277770997, + "step": 14330 + }, + { + "epoch": 2.035486160397445, + "grad_norm": 1.3498976230621338, + "learning_rate": 9.796550745209368e-05, + "loss": 0.10244355201721192, + "step": 14340 + }, + { + "epoch": 2.0369056068133427, + "grad_norm": 1.6554956436157227, + "learning_rate": 9.796408800567779e-05, + "loss": 0.10223543643951416, + "step": 14350 + }, + { + "epoch": 2.0383250532292405, + "grad_norm": 7.838418006896973, + "learning_rate": 9.796266855926189e-05, + "loss": 0.11812844276428222, + "step": 14360 + }, + { + "epoch": 2.0397444996451384, + "grad_norm": 1.8078879117965698, + "learning_rate": 9.7961249112846e-05, + "loss": 0.1252034544944763, + "step": 14370 + }, + { + "epoch": 2.0411639460610362, + "grad_norm": 3.4205777645111084, + "learning_rate": 9.79598296664301e-05, + "loss": 0.10178905725479126, + "step": 14380 + }, + { + "epoch": 2.042583392476934, + "grad_norm": 6.722558498382568, + "learning_rate": 9.79584102200142e-05, + "loss": 0.13192167282104492, + "step": 14390 + }, + { + "epoch": 2.044002838892832, + "grad_norm": 3.837047576904297, + "learning_rate": 9.79569907735983e-05, + "loss": 0.12296985387802124, + "step": 14400 + }, + { + "epoch": 2.0454222853087294, + "grad_norm": 2.1457889080047607, + "learning_rate": 9.79555713271824e-05, + "loss": 0.16315003633499145, + "step": 14410 + }, + { + "epoch": 2.0468417317246272, + "grad_norm": 6.29680871963501, + "learning_rate": 9.795415188076651e-05, + "loss": 0.12061352729797363, + "step": 14420 + }, + { + "epoch": 2.048261178140525, + "grad_norm": 6.541940689086914, + "learning_rate": 9.79527324343506e-05, + "loss": 0.2011786699295044, + "step": 14430 + }, + { + "epoch": 2.049680624556423, + "grad_norm": 4.376636505126953, + "learning_rate": 9.795131298793471e-05, + "loss": 0.10220627784729004, + "step": 14440 + }, + { + "epoch": 2.051100070972321, + "grad_norm": 3.3631985187530518, + "learning_rate": 9.79498935415188e-05, + "loss": 0.12176470756530762, + "step": 14450 + }, + { + "epoch": 2.0525195173882187, + "grad_norm": 3.7540066242218018, + "learning_rate": 9.794847409510292e-05, + "loss": 0.13856956958770753, + "step": 14460 + }, + { + "epoch": 2.0539389638041166, + "grad_norm": 4.199720859527588, + "learning_rate": 9.794705464868701e-05, + "loss": 0.11578547954559326, + "step": 14470 + }, + { + "epoch": 2.055358410220014, + "grad_norm": 2.478891134262085, + "learning_rate": 9.794563520227112e-05, + "loss": 0.11448420286178589, + "step": 14480 + }, + { + "epoch": 2.056777856635912, + "grad_norm": 10.809943199157715, + "learning_rate": 9.794421575585522e-05, + "loss": 0.11974853277206421, + "step": 14490 + }, + { + "epoch": 2.0581973030518097, + "grad_norm": 3.9403326511383057, + "learning_rate": 9.794279630943932e-05, + "loss": 0.09595261812210083, + "step": 14500 + }, + { + "epoch": 2.0581973030518097, + "eval_accuracy": 0.9520569720862212, + "eval_loss": 0.1421024352312088, + "eval_runtime": 32.3117, + "eval_samples_per_second": 486.728, + "eval_steps_per_second": 15.227, + "step": 14500 + }, + { + "epoch": 2.0596167494677076, + "grad_norm": 9.631017684936523, + "learning_rate": 9.794137686302343e-05, + "loss": 0.15254437923431396, + "step": 14510 + }, + { + "epoch": 2.0610361958836054, + "grad_norm": 4.611459255218506, + "learning_rate": 9.793995741660753e-05, + "loss": 0.09197093248367309, + "step": 14520 + }, + { + "epoch": 2.0624556422995033, + "grad_norm": 5.0104756355285645, + "learning_rate": 9.793853797019164e-05, + "loss": 0.17470468282699586, + "step": 14530 + }, + { + "epoch": 2.063875088715401, + "grad_norm": 6.290011882781982, + "learning_rate": 9.793711852377572e-05, + "loss": 0.13710517883300782, + "step": 14540 + }, + { + "epoch": 2.065294535131299, + "grad_norm": 5.759206771850586, + "learning_rate": 9.793569907735983e-05, + "loss": 0.08785209059715271, + "step": 14550 + }, + { + "epoch": 2.0667139815471964, + "grad_norm": 3.606126308441162, + "learning_rate": 9.793427963094393e-05, + "loss": 0.1606206178665161, + "step": 14560 + }, + { + "epoch": 2.0681334279630943, + "grad_norm": 1.4751636981964111, + "learning_rate": 9.793286018452804e-05, + "loss": 0.09843673706054687, + "step": 14570 + }, + { + "epoch": 2.069552874378992, + "grad_norm": 6.7842864990234375, + "learning_rate": 9.793144073811215e-05, + "loss": 0.12192797660827637, + "step": 14580 + }, + { + "epoch": 2.07097232079489, + "grad_norm": 0.8541110754013062, + "learning_rate": 9.793002129169624e-05, + "loss": 0.16259843111038208, + "step": 14590 + }, + { + "epoch": 2.072391767210788, + "grad_norm": 1.672593116760254, + "learning_rate": 9.792860184528035e-05, + "loss": 0.09362624883651734, + "step": 14600 + }, + { + "epoch": 2.0738112136266857, + "grad_norm": 1.834715485572815, + "learning_rate": 9.792718239886444e-05, + "loss": 0.09099584221839904, + "step": 14610 + }, + { + "epoch": 2.0752306600425836, + "grad_norm": 2.21016001701355, + "learning_rate": 9.792576295244856e-05, + "loss": 0.12747323513031006, + "step": 14620 + }, + { + "epoch": 2.076650106458481, + "grad_norm": 2.8152081966400146, + "learning_rate": 9.792434350603265e-05, + "loss": 0.08871068954467773, + "step": 14630 + }, + { + "epoch": 2.078069552874379, + "grad_norm": 10.869599342346191, + "learning_rate": 9.792292405961675e-05, + "loss": 0.09311275482177735, + "step": 14640 + }, + { + "epoch": 2.0794889992902768, + "grad_norm": 7.580860614776611, + "learning_rate": 9.792150461320085e-05, + "loss": 0.10084123611450195, + "step": 14650 + }, + { + "epoch": 2.0809084457061746, + "grad_norm": 4.795779228210449, + "learning_rate": 9.792008516678496e-05, + "loss": 0.11776796579360962, + "step": 14660 + }, + { + "epoch": 2.0823278921220725, + "grad_norm": 8.302618980407715, + "learning_rate": 9.791866572036907e-05, + "loss": 0.1491849184036255, + "step": 14670 + }, + { + "epoch": 2.0837473385379703, + "grad_norm": 0.23616167902946472, + "learning_rate": 9.791724627395317e-05, + "loss": 0.09274361729621887, + "step": 14680 + }, + { + "epoch": 2.085166784953868, + "grad_norm": 4.930098056793213, + "learning_rate": 9.791582682753726e-05, + "loss": 0.10362660884857178, + "step": 14690 + }, + { + "epoch": 2.0865862313697656, + "grad_norm": 5.442007064819336, + "learning_rate": 9.791440738112136e-05, + "loss": 0.16730997562408448, + "step": 14700 + }, + { + "epoch": 2.0880056777856635, + "grad_norm": 2.312178134918213, + "learning_rate": 9.791298793470547e-05, + "loss": 0.09510490894317628, + "step": 14710 + }, + { + "epoch": 2.0894251242015613, + "grad_norm": 4.624721527099609, + "learning_rate": 9.791156848828957e-05, + "loss": 0.11144552230834961, + "step": 14720 + }, + { + "epoch": 2.090844570617459, + "grad_norm": 4.009274482727051, + "learning_rate": 9.791014904187368e-05, + "loss": 0.05063519477844238, + "step": 14730 + }, + { + "epoch": 2.092264017033357, + "grad_norm": 3.2653450965881348, + "learning_rate": 9.790872959545776e-05, + "loss": 0.08952829837799073, + "step": 14740 + }, + { + "epoch": 2.093683463449255, + "grad_norm": 5.824209690093994, + "learning_rate": 9.790731014904188e-05, + "loss": 0.15206855535507202, + "step": 14750 + }, + { + "epoch": 2.095102909865153, + "grad_norm": 9.619600296020508, + "learning_rate": 9.790589070262599e-05, + "loss": 0.09403921961784363, + "step": 14760 + }, + { + "epoch": 2.09652235628105, + "grad_norm": 9.709185600280762, + "learning_rate": 9.790447125621008e-05, + "loss": 0.14637627601623535, + "step": 14770 + }, + { + "epoch": 2.097941802696948, + "grad_norm": 5.918253421783447, + "learning_rate": 9.79030518097942e-05, + "loss": 0.1368915319442749, + "step": 14780 + }, + { + "epoch": 2.099361249112846, + "grad_norm": 4.801339626312256, + "learning_rate": 9.790163236337828e-05, + "loss": 0.12445158958435058, + "step": 14790 + }, + { + "epoch": 2.100780695528744, + "grad_norm": 4.204085826873779, + "learning_rate": 9.790021291696239e-05, + "loss": 0.10883952379226684, + "step": 14800 + }, + { + "epoch": 2.1022001419446417, + "grad_norm": 2.81545352935791, + "learning_rate": 9.789879347054649e-05, + "loss": 0.14513410329818727, + "step": 14810 + }, + { + "epoch": 2.1036195883605395, + "grad_norm": 10.400982856750488, + "learning_rate": 9.78973740241306e-05, + "loss": 0.1663369655609131, + "step": 14820 + }, + { + "epoch": 2.1050390347764374, + "grad_norm": 4.7983078956604, + "learning_rate": 9.78959545777147e-05, + "loss": 0.10346471071243286, + "step": 14830 + }, + { + "epoch": 2.106458481192335, + "grad_norm": 6.536756992340088, + "learning_rate": 9.78945351312988e-05, + "loss": 0.12118889093399048, + "step": 14840 + }, + { + "epoch": 2.1078779276082327, + "grad_norm": 4.13341760635376, + "learning_rate": 9.78931156848829e-05, + "loss": 0.09681417346000672, + "step": 14850 + }, + { + "epoch": 2.1092973740241305, + "grad_norm": 6.235330581665039, + "learning_rate": 9.7891696238467e-05, + "loss": 0.11153937578201294, + "step": 14860 + }, + { + "epoch": 2.1107168204400284, + "grad_norm": 4.928127765655518, + "learning_rate": 9.789027679205111e-05, + "loss": 0.07672246694564819, + "step": 14870 + }, + { + "epoch": 2.1121362668559263, + "grad_norm": 4.837932109832764, + "learning_rate": 9.788885734563521e-05, + "loss": 0.07635858654975891, + "step": 14880 + }, + { + "epoch": 2.113555713271824, + "grad_norm": 7.02380895614624, + "learning_rate": 9.788743789921932e-05, + "loss": 0.07125227451324463, + "step": 14890 + }, + { + "epoch": 2.114975159687722, + "grad_norm": 5.700672149658203, + "learning_rate": 9.78860184528034e-05, + "loss": 0.19001219272613526, + "step": 14900 + }, + { + "epoch": 2.1163946061036194, + "grad_norm": 8.149482727050781, + "learning_rate": 9.788459900638751e-05, + "loss": 0.13992477655410768, + "step": 14910 + }, + { + "epoch": 2.1178140525195173, + "grad_norm": 2.9586234092712402, + "learning_rate": 9.788317955997161e-05, + "loss": 0.12763415575027465, + "step": 14920 + }, + { + "epoch": 2.119233498935415, + "grad_norm": 8.272931098937988, + "learning_rate": 9.788176011355572e-05, + "loss": 0.14072943925857545, + "step": 14930 + }, + { + "epoch": 2.120652945351313, + "grad_norm": 10.288031578063965, + "learning_rate": 9.788034066713982e-05, + "loss": 0.12365868091583251, + "step": 14940 + }, + { + "epoch": 2.122072391767211, + "grad_norm": 3.203730821609497, + "learning_rate": 9.787892122072392e-05, + "loss": 0.16196365356445314, + "step": 14950 + }, + { + "epoch": 2.1234918381831087, + "grad_norm": 1.575235366821289, + "learning_rate": 9.787750177430803e-05, + "loss": 0.10702955722808838, + "step": 14960 + }, + { + "epoch": 2.1249112845990066, + "grad_norm": 3.2818377017974854, + "learning_rate": 9.787608232789213e-05, + "loss": 0.109703528881073, + "step": 14970 + }, + { + "epoch": 2.126330731014904, + "grad_norm": 2.6222288608551025, + "learning_rate": 9.787466288147624e-05, + "loss": 0.13249775171279907, + "step": 14980 + }, + { + "epoch": 2.127750177430802, + "grad_norm": 2.1232478618621826, + "learning_rate": 9.787324343506033e-05, + "loss": 0.07887126207351684, + "step": 14990 + }, + { + "epoch": 2.1291696238466997, + "grad_norm": 2.6810293197631836, + "learning_rate": 9.787182398864443e-05, + "loss": 0.07232893705368042, + "step": 15000 + }, + { + "epoch": 2.1291696238466997, + "eval_accuracy": 0.9323456476123864, + "eval_loss": 0.19697453081607819, + "eval_runtime": 33.1486, + "eval_samples_per_second": 474.44, + "eval_steps_per_second": 14.842, + "step": 15000 + }, + { + "epoch": 2.1305890702625976, + "grad_norm": 1.1261463165283203, + "learning_rate": 9.787040454222853e-05, + "loss": 0.15110697746276855, + "step": 15010 + }, + { + "epoch": 2.1320085166784954, + "grad_norm": 7.047489166259766, + "learning_rate": 9.786898509581264e-05, + "loss": 0.12342967987060546, + "step": 15020 + }, + { + "epoch": 2.1334279630943933, + "grad_norm": 2.4421699047088623, + "learning_rate": 9.786756564939674e-05, + "loss": 0.10898158550262452, + "step": 15030 + }, + { + "epoch": 2.134847409510291, + "grad_norm": 13.27920913696289, + "learning_rate": 9.786614620298085e-05, + "loss": 0.17320735454559327, + "step": 15040 + }, + { + "epoch": 2.1362668559261886, + "grad_norm": 2.1594645977020264, + "learning_rate": 9.786472675656495e-05, + "loss": 0.1370407223701477, + "step": 15050 + }, + { + "epoch": 2.1376863023420865, + "grad_norm": 3.3465182781219482, + "learning_rate": 9.786330731014904e-05, + "loss": 0.0927284300327301, + "step": 15060 + }, + { + "epoch": 2.1391057487579843, + "grad_norm": 4.845798015594482, + "learning_rate": 9.786188786373315e-05, + "loss": 0.09592834115028381, + "step": 15070 + }, + { + "epoch": 2.140525195173882, + "grad_norm": 5.797274112701416, + "learning_rate": 9.786046841731725e-05, + "loss": 0.09021830558776855, + "step": 15080 + }, + { + "epoch": 2.14194464158978, + "grad_norm": 6.726304054260254, + "learning_rate": 9.785904897090136e-05, + "loss": 0.08812606334686279, + "step": 15090 + }, + { + "epoch": 2.143364088005678, + "grad_norm": 11.3377046585083, + "learning_rate": 9.785762952448545e-05, + "loss": 0.17364519834518433, + "step": 15100 + }, + { + "epoch": 2.1447835344215758, + "grad_norm": 3.504915237426758, + "learning_rate": 9.785621007806956e-05, + "loss": 0.11160609722137452, + "step": 15110 + }, + { + "epoch": 2.146202980837473, + "grad_norm": 8.797595024108887, + "learning_rate": 9.785479063165365e-05, + "loss": 0.19877324104309083, + "step": 15120 + }, + { + "epoch": 2.147622427253371, + "grad_norm": 3.8671157360076904, + "learning_rate": 9.785337118523777e-05, + "loss": 0.1070638656616211, + "step": 15130 + }, + { + "epoch": 2.149041873669269, + "grad_norm": 1.9480023384094238, + "learning_rate": 9.785195173882186e-05, + "loss": 0.08838028907775879, + "step": 15140 + }, + { + "epoch": 2.1504613200851668, + "grad_norm": 0.8382003903388977, + "learning_rate": 9.785053229240596e-05, + "loss": 0.13476892709732055, + "step": 15150 + }, + { + "epoch": 2.1518807665010646, + "grad_norm": 1.5311458110809326, + "learning_rate": 9.784911284599007e-05, + "loss": 0.1371008038520813, + "step": 15160 + }, + { + "epoch": 2.1533002129169625, + "grad_norm": 4.248318672180176, + "learning_rate": 9.784769339957417e-05, + "loss": 0.142839252948761, + "step": 15170 + }, + { + "epoch": 2.1547196593328604, + "grad_norm": 5.336694717407227, + "learning_rate": 9.784627395315828e-05, + "loss": 0.15205401182174683, + "step": 15180 + }, + { + "epoch": 2.156139105748758, + "grad_norm": 1.6950732469558716, + "learning_rate": 9.784485450674238e-05, + "loss": 0.09157877564430236, + "step": 15190 + }, + { + "epoch": 2.1575585521646556, + "grad_norm": 0.8742321133613586, + "learning_rate": 9.784343506032649e-05, + "loss": 0.07795000672340394, + "step": 15200 + }, + { + "epoch": 2.1589779985805535, + "grad_norm": 9.622370719909668, + "learning_rate": 9.784201561391057e-05, + "loss": 0.12661195993423463, + "step": 15210 + }, + { + "epoch": 2.1603974449964514, + "grad_norm": 2.450603723526001, + "learning_rate": 9.784059616749468e-05, + "loss": 0.07968658804893494, + "step": 15220 + }, + { + "epoch": 2.1618168914123492, + "grad_norm": 6.467986583709717, + "learning_rate": 9.783917672107878e-05, + "loss": 0.09993529319763184, + "step": 15230 + }, + { + "epoch": 2.163236337828247, + "grad_norm": 4.023931980133057, + "learning_rate": 9.783775727466289e-05, + "loss": 0.13655495643615723, + "step": 15240 + }, + { + "epoch": 2.164655784244145, + "grad_norm": 6.877175807952881, + "learning_rate": 9.783633782824699e-05, + "loss": 0.11687321662902832, + "step": 15250 + }, + { + "epoch": 2.1660752306600424, + "grad_norm": 6.720952033996582, + "learning_rate": 9.783491838183109e-05, + "loss": 0.1210485816001892, + "step": 15260 + }, + { + "epoch": 2.1674946770759402, + "grad_norm": 3.8507208824157715, + "learning_rate": 9.78334989354152e-05, + "loss": 0.132388699054718, + "step": 15270 + }, + { + "epoch": 2.168914123491838, + "grad_norm": 1.8653970956802368, + "learning_rate": 9.78320794889993e-05, + "loss": 0.08510831594467164, + "step": 15280 + }, + { + "epoch": 2.170333569907736, + "grad_norm": 2.0540809631347656, + "learning_rate": 9.78306600425834e-05, + "loss": 0.07614290714263916, + "step": 15290 + }, + { + "epoch": 2.171753016323634, + "grad_norm": 3.400786876678467, + "learning_rate": 9.78292405961675e-05, + "loss": 0.1373605966567993, + "step": 15300 + }, + { + "epoch": 2.1731724627395317, + "grad_norm": 4.475280284881592, + "learning_rate": 9.78278211497516e-05, + "loss": 0.170183527469635, + "step": 15310 + }, + { + "epoch": 2.1745919091554295, + "grad_norm": 1.2852575778961182, + "learning_rate": 9.78264017033357e-05, + "loss": 0.09261202812194824, + "step": 15320 + }, + { + "epoch": 2.176011355571327, + "grad_norm": 2.492828369140625, + "learning_rate": 9.782498225691981e-05, + "loss": 0.1506461977958679, + "step": 15330 + }, + { + "epoch": 2.177430801987225, + "grad_norm": 1.1873884201049805, + "learning_rate": 9.78235628105039e-05, + "loss": 0.1407165050506592, + "step": 15340 + }, + { + "epoch": 2.1788502484031227, + "grad_norm": 6.442225933074951, + "learning_rate": 9.782214336408802e-05, + "loss": 0.10227712392807006, + "step": 15350 + }, + { + "epoch": 2.1802696948190206, + "grad_norm": 4.296558856964111, + "learning_rate": 9.782072391767211e-05, + "loss": 0.1007123589515686, + "step": 15360 + }, + { + "epoch": 2.1816891412349184, + "grad_norm": 5.814218044281006, + "learning_rate": 9.781944641589781e-05, + "loss": 0.19718022346496583, + "step": 15370 + }, + { + "epoch": 2.1831085876508163, + "grad_norm": 4.71889066696167, + "learning_rate": 9.78180269694819e-05, + "loss": 0.18047010898590088, + "step": 15380 + }, + { + "epoch": 2.184528034066714, + "grad_norm": 4.318767070770264, + "learning_rate": 9.7816607523066e-05, + "loss": 0.15934972763061522, + "step": 15390 + }, + { + "epoch": 2.1859474804826116, + "grad_norm": 5.206693172454834, + "learning_rate": 9.78151880766501e-05, + "loss": 0.16389219760894774, + "step": 15400 + }, + { + "epoch": 2.1873669268985094, + "grad_norm": 5.830376148223877, + "learning_rate": 9.781376863023421e-05, + "loss": 0.09744818210601806, + "step": 15410 + }, + { + "epoch": 2.1887863733144073, + "grad_norm": 3.7071948051452637, + "learning_rate": 9.781234918381833e-05, + "loss": 0.06997872591018676, + "step": 15420 + }, + { + "epoch": 2.190205819730305, + "grad_norm": 1.3492387533187866, + "learning_rate": 9.781092973740241e-05, + "loss": 0.12530778646469115, + "step": 15430 + }, + { + "epoch": 2.191625266146203, + "grad_norm": 4.588033199310303, + "learning_rate": 9.780951029098652e-05, + "loss": 0.09968525767326356, + "step": 15440 + }, + { + "epoch": 2.193044712562101, + "grad_norm": 7.795054912567139, + "learning_rate": 9.780809084457062e-05, + "loss": 0.14231202602386475, + "step": 15450 + }, + { + "epoch": 2.1944641589779987, + "grad_norm": 3.2043259143829346, + "learning_rate": 9.780667139815473e-05, + "loss": 0.13562475442886351, + "step": 15460 + }, + { + "epoch": 2.195883605393896, + "grad_norm": 4.458872318267822, + "learning_rate": 9.780525195173883e-05, + "loss": 0.12291073799133301, + "step": 15470 + }, + { + "epoch": 2.197303051809794, + "grad_norm": 0.49556025862693787, + "learning_rate": 9.780383250532294e-05, + "loss": 0.030131521821022033, + "step": 15480 + }, + { + "epoch": 2.198722498225692, + "grad_norm": 10.009795188903809, + "learning_rate": 9.780241305890702e-05, + "loss": 0.13439586162567138, + "step": 15490 + }, + { + "epoch": 2.2001419446415897, + "grad_norm": 9.650060653686523, + "learning_rate": 9.780099361249113e-05, + "loss": 0.15003018379211425, + "step": 15500 + }, + { + "epoch": 2.2001419446415897, + "eval_accuracy": 0.9476696127678514, + "eval_loss": 0.15076443552970886, + "eval_runtime": 33.5101, + "eval_samples_per_second": 469.322, + "eval_steps_per_second": 14.682, + "step": 15500 + }, + { + "epoch": 2.2015613910574876, + "grad_norm": 3.4228737354278564, + "learning_rate": 9.779957416607524e-05, + "loss": 0.08043778538703919, + "step": 15510 + }, + { + "epoch": 2.2029808374733855, + "grad_norm": 7.456453800201416, + "learning_rate": 9.779815471965934e-05, + "loss": 0.08067357540130615, + "step": 15520 + }, + { + "epoch": 2.2044002838892833, + "grad_norm": 7.92563533782959, + "learning_rate": 9.779673527324345e-05, + "loss": 0.15267107486724854, + "step": 15530 + }, + { + "epoch": 2.2058197303051807, + "grad_norm": 7.132428169250488, + "learning_rate": 9.779531582682753e-05, + "loss": 0.20551769733428954, + "step": 15540 + }, + { + "epoch": 2.2072391767210786, + "grad_norm": 5.588425636291504, + "learning_rate": 9.779389638041165e-05, + "loss": 0.0594519853591919, + "step": 15550 + }, + { + "epoch": 2.2086586231369765, + "grad_norm": 0.8327229619026184, + "learning_rate": 9.779247693399574e-05, + "loss": 0.09828418493270874, + "step": 15560 + }, + { + "epoch": 2.2100780695528743, + "grad_norm": 4.466777324676514, + "learning_rate": 9.779105748757985e-05, + "loss": 0.0886389136314392, + "step": 15570 + }, + { + "epoch": 2.211497515968772, + "grad_norm": 6.381712913513184, + "learning_rate": 9.778963804116395e-05, + "loss": 0.11927787065505982, + "step": 15580 + }, + { + "epoch": 2.21291696238467, + "grad_norm": 6.469443321228027, + "learning_rate": 9.778821859474805e-05, + "loss": 0.17326163053512572, + "step": 15590 + }, + { + "epoch": 2.214336408800568, + "grad_norm": 6.632884502410889, + "learning_rate": 9.778679914833216e-05, + "loss": 0.11724759340286255, + "step": 15600 + }, + { + "epoch": 2.215755855216466, + "grad_norm": 3.7693932056427, + "learning_rate": 9.778537970191626e-05, + "loss": 0.12318531274795533, + "step": 15610 + }, + { + "epoch": 2.217175301632363, + "grad_norm": 11.708182334899902, + "learning_rate": 9.778396025550037e-05, + "loss": 0.1665675401687622, + "step": 15620 + }, + { + "epoch": 2.218594748048261, + "grad_norm": 6.708708763122559, + "learning_rate": 9.778254080908447e-05, + "loss": 0.09552123546600341, + "step": 15630 + }, + { + "epoch": 2.220014194464159, + "grad_norm": 3.537140130996704, + "learning_rate": 9.778112136266856e-05, + "loss": 0.17162953615188598, + "step": 15640 + }, + { + "epoch": 2.221433640880057, + "grad_norm": 3.47255802154541, + "learning_rate": 9.777970191625266e-05, + "loss": 0.11431492567062378, + "step": 15650 + }, + { + "epoch": 2.2228530872959547, + "grad_norm": 2.390170097351074, + "learning_rate": 9.777828246983677e-05, + "loss": 0.1374788761138916, + "step": 15660 + }, + { + "epoch": 2.2242725337118525, + "grad_norm": 8.488000869750977, + "learning_rate": 9.777686302342087e-05, + "loss": 0.075135737657547, + "step": 15670 + }, + { + "epoch": 2.2256919801277504, + "grad_norm": 5.250071048736572, + "learning_rate": 9.777544357700498e-05, + "loss": 0.15566228628158568, + "step": 15680 + }, + { + "epoch": 2.227111426543648, + "grad_norm": 1.0439021587371826, + "learning_rate": 9.777402413058908e-05, + "loss": 0.08581479787826538, + "step": 15690 + }, + { + "epoch": 2.2285308729595457, + "grad_norm": 5.081490993499756, + "learning_rate": 9.777260468417317e-05, + "loss": 0.0691333532333374, + "step": 15700 + }, + { + "epoch": 2.2299503193754435, + "grad_norm": 4.931427478790283, + "learning_rate": 9.777118523775729e-05, + "loss": 0.08706582188606263, + "step": 15710 + }, + { + "epoch": 2.2313697657913414, + "grad_norm": 2.0620617866516113, + "learning_rate": 9.776976579134138e-05, + "loss": 0.09351248145103455, + "step": 15720 + }, + { + "epoch": 2.2327892122072392, + "grad_norm": 11.9086275100708, + "learning_rate": 9.77683463449255e-05, + "loss": 0.167766273021698, + "step": 15730 + }, + { + "epoch": 2.234208658623137, + "grad_norm": 7.802628993988037, + "learning_rate": 9.776692689850958e-05, + "loss": 0.08956191539764405, + "step": 15740 + }, + { + "epoch": 2.235628105039035, + "grad_norm": 6.4769134521484375, + "learning_rate": 9.776550745209369e-05, + "loss": 0.0949668049812317, + "step": 15750 + }, + { + "epoch": 2.2370475514549324, + "grad_norm": 5.48812198638916, + "learning_rate": 9.776408800567779e-05, + "loss": 0.10781463384628295, + "step": 15760 + }, + { + "epoch": 2.2384669978708303, + "grad_norm": 4.095717430114746, + "learning_rate": 9.77626685592619e-05, + "loss": 0.10710879564285278, + "step": 15770 + }, + { + "epoch": 2.239886444286728, + "grad_norm": 7.886163234710693, + "learning_rate": 9.7761249112846e-05, + "loss": 0.16387512683868408, + "step": 15780 + }, + { + "epoch": 2.241305890702626, + "grad_norm": 5.275144577026367, + "learning_rate": 9.775982966643009e-05, + "loss": 0.13074166774749757, + "step": 15790 + }, + { + "epoch": 2.242725337118524, + "grad_norm": 6.263736248016357, + "learning_rate": 9.77584102200142e-05, + "loss": 0.1308918595314026, + "step": 15800 + }, + { + "epoch": 2.2441447835344217, + "grad_norm": 8.084881782531738, + "learning_rate": 9.77569907735983e-05, + "loss": 0.15410442352294923, + "step": 15810 + }, + { + "epoch": 2.2455642299503196, + "grad_norm": 5.83068323135376, + "learning_rate": 9.775557132718241e-05, + "loss": 0.15612525939941407, + "step": 15820 + }, + { + "epoch": 2.246983676366217, + "grad_norm": 0.40490075945854187, + "learning_rate": 9.775415188076651e-05, + "loss": 0.05670689940452576, + "step": 15830 + }, + { + "epoch": 2.248403122782115, + "grad_norm": 9.664972305297852, + "learning_rate": 9.775273243435062e-05, + "loss": 0.08322632312774658, + "step": 15840 + }, + { + "epoch": 2.2498225691980127, + "grad_norm": 5.599974632263184, + "learning_rate": 9.77513129879347e-05, + "loss": 0.0942413330078125, + "step": 15850 + }, + { + "epoch": 2.2512420156139106, + "grad_norm": 4.52598762512207, + "learning_rate": 9.774989354151881e-05, + "loss": 0.1192929744720459, + "step": 15860 + }, + { + "epoch": 2.2526614620298084, + "grad_norm": 8.435208320617676, + "learning_rate": 9.774847409510291e-05, + "loss": 0.10933787822723388, + "step": 15870 + }, + { + "epoch": 2.2540809084457063, + "grad_norm": 6.769467353820801, + "learning_rate": 9.774705464868702e-05, + "loss": 0.08203907608985901, + "step": 15880 + }, + { + "epoch": 2.255500354861604, + "grad_norm": 7.499700546264648, + "learning_rate": 9.774563520227112e-05, + "loss": 0.1266704320907593, + "step": 15890 + }, + { + "epoch": 2.2569198012775016, + "grad_norm": 4.320639133453369, + "learning_rate": 9.774421575585522e-05, + "loss": 0.13925156593322754, + "step": 15900 + }, + { + "epoch": 2.2583392476933994, + "grad_norm": 5.2828168869018555, + "learning_rate": 9.774279630943933e-05, + "loss": 0.12411700487136841, + "step": 15910 + }, + { + "epoch": 2.2597586941092973, + "grad_norm": 7.704649448394775, + "learning_rate": 9.774137686302343e-05, + "loss": 0.17451765537261962, + "step": 15920 + }, + { + "epoch": 2.261178140525195, + "grad_norm": 10.570831298828125, + "learning_rate": 9.773995741660754e-05, + "loss": 0.14159404039382933, + "step": 15930 + }, + { + "epoch": 2.262597586941093, + "grad_norm": 1.9156538248062134, + "learning_rate": 9.773853797019163e-05, + "loss": 0.10246649980545045, + "step": 15940 + }, + { + "epoch": 2.264017033356991, + "grad_norm": 10.271675109863281, + "learning_rate": 9.773711852377573e-05, + "loss": 0.1498422145843506, + "step": 15950 + }, + { + "epoch": 2.2654364797728888, + "grad_norm": 2.2951345443725586, + "learning_rate": 9.773569907735983e-05, + "loss": 0.1181708812713623, + "step": 15960 + }, + { + "epoch": 2.2668559261887866, + "grad_norm": 7.073802471160889, + "learning_rate": 9.773427963094394e-05, + "loss": 0.13307657241821289, + "step": 15970 + }, + { + "epoch": 2.268275372604684, + "grad_norm": 3.94195556640625, + "learning_rate": 9.773286018452804e-05, + "loss": 0.06159374713897705, + "step": 15980 + }, + { + "epoch": 2.269694819020582, + "grad_norm": 0.3510136902332306, + "learning_rate": 9.773144073811215e-05, + "loss": 0.05166938900947571, + "step": 15990 + }, + { + "epoch": 2.2711142654364798, + "grad_norm": 2.7349507808685303, + "learning_rate": 9.773002129169624e-05, + "loss": 0.12719658613204957, + "step": 16000 + }, + { + "epoch": 2.2711142654364798, + "eval_accuracy": 0.9534558402746869, + "eval_loss": 0.12974673509597778, + "eval_runtime": 32.9436, + "eval_samples_per_second": 477.391, + "eval_steps_per_second": 14.935, + "step": 16000 + }, + { + "epoch": 2.2725337118523776, + "grad_norm": 3.8580965995788574, + "learning_rate": 9.772860184528034e-05, + "loss": 0.07134815454483032, + "step": 16010 + }, + { + "epoch": 2.2739531582682755, + "grad_norm": 10.57183837890625, + "learning_rate": 9.772718239886445e-05, + "loss": 0.11877801418304443, + "step": 16020 + }, + { + "epoch": 2.2753726046841733, + "grad_norm": 8.526998519897461, + "learning_rate": 9.772576295244855e-05, + "loss": 0.11683057546615601, + "step": 16030 + }, + { + "epoch": 2.276792051100071, + "grad_norm": 2.470162868499756, + "learning_rate": 9.772434350603266e-05, + "loss": 0.0911303460597992, + "step": 16040 + }, + { + "epoch": 2.2782114975159686, + "grad_norm": 8.097274780273438, + "learning_rate": 9.772292405961675e-05, + "loss": 0.1780623197555542, + "step": 16050 + }, + { + "epoch": 2.2796309439318665, + "grad_norm": 4.431247234344482, + "learning_rate": 9.772150461320086e-05, + "loss": 0.13148776292800904, + "step": 16060 + }, + { + "epoch": 2.2810503903477644, + "grad_norm": 7.473452568054199, + "learning_rate": 9.772008516678495e-05, + "loss": 0.09967323541641235, + "step": 16070 + }, + { + "epoch": 2.282469836763662, + "grad_norm": 2.283681869506836, + "learning_rate": 9.771866572036906e-05, + "loss": 0.06625600457191468, + "step": 16080 + }, + { + "epoch": 2.28388928317956, + "grad_norm": 0.9107749462127686, + "learning_rate": 9.771724627395316e-05, + "loss": 0.07753741145133972, + "step": 16090 + }, + { + "epoch": 2.285308729595458, + "grad_norm": 2.082306146621704, + "learning_rate": 9.771582682753726e-05, + "loss": 0.06911807656288146, + "step": 16100 + }, + { + "epoch": 2.286728176011356, + "grad_norm": 8.424261093139648, + "learning_rate": 9.771440738112137e-05, + "loss": 0.06900658011436463, + "step": 16110 + }, + { + "epoch": 2.2881476224272532, + "grad_norm": 2.821417808532715, + "learning_rate": 9.771298793470547e-05, + "loss": 0.10042606592178345, + "step": 16120 + }, + { + "epoch": 2.289567068843151, + "grad_norm": 4.486814975738525, + "learning_rate": 9.771156848828958e-05, + "loss": 0.1290997862815857, + "step": 16130 + }, + { + "epoch": 2.290986515259049, + "grad_norm": 8.3433198928833, + "learning_rate": 9.771014904187368e-05, + "loss": 0.14453980922698975, + "step": 16140 + }, + { + "epoch": 2.292405961674947, + "grad_norm": 9.422966003417969, + "learning_rate": 9.770872959545777e-05, + "loss": 0.13661658763885498, + "step": 16150 + }, + { + "epoch": 2.2938254080908447, + "grad_norm": 6.411171913146973, + "learning_rate": 9.770731014904187e-05, + "loss": 0.09912009239196777, + "step": 16160 + }, + { + "epoch": 2.2952448545067425, + "grad_norm": 4.763072490692139, + "learning_rate": 9.770589070262598e-05, + "loss": 0.10291681289672852, + "step": 16170 + }, + { + "epoch": 2.2966643009226404, + "grad_norm": 5.987633228302002, + "learning_rate": 9.770447125621008e-05, + "loss": 0.15251626968383789, + "step": 16180 + }, + { + "epoch": 2.298083747338538, + "grad_norm": 10.529451370239258, + "learning_rate": 9.770305180979419e-05, + "loss": 0.17285287380218506, + "step": 16190 + }, + { + "epoch": 2.2995031937544357, + "grad_norm": 1.2355297803878784, + "learning_rate": 9.770163236337829e-05, + "loss": 0.16878000497817994, + "step": 16200 + }, + { + "epoch": 2.3009226401703335, + "grad_norm": 2.409059762954712, + "learning_rate": 9.770021291696238e-05, + "loss": 0.08963816165924073, + "step": 16210 + }, + { + "epoch": 2.3023420865862314, + "grad_norm": 0.2473367154598236, + "learning_rate": 9.76987934705465e-05, + "loss": 0.07898592352867126, + "step": 16220 + }, + { + "epoch": 2.3037615330021293, + "grad_norm": 3.4052321910858154, + "learning_rate": 9.769737402413059e-05, + "loss": 0.13420867919921875, + "step": 16230 + }, + { + "epoch": 2.305180979418027, + "grad_norm": 2.8136518001556396, + "learning_rate": 9.76959545777147e-05, + "loss": 0.08897106051445007, + "step": 16240 + }, + { + "epoch": 2.306600425833925, + "grad_norm": 4.1067094802856445, + "learning_rate": 9.76945351312988e-05, + "loss": 0.11498106718063354, + "step": 16250 + }, + { + "epoch": 2.3080198722498224, + "grad_norm": 3.161066770553589, + "learning_rate": 9.76931156848829e-05, + "loss": 0.12238447666168213, + "step": 16260 + }, + { + "epoch": 2.3094393186657203, + "grad_norm": 8.762333869934082, + "learning_rate": 9.7691696238467e-05, + "loss": 0.07079674601554871, + "step": 16270 + }, + { + "epoch": 2.310858765081618, + "grad_norm": 2.2034451961517334, + "learning_rate": 9.76902767920511e-05, + "loss": 0.1289450168609619, + "step": 16280 + }, + { + "epoch": 2.312278211497516, + "grad_norm": 3.3836324214935303, + "learning_rate": 9.76888573456352e-05, + "loss": 0.08217411041259766, + "step": 16290 + }, + { + "epoch": 2.313697657913414, + "grad_norm": 2.8655858039855957, + "learning_rate": 9.768743789921932e-05, + "loss": 0.09491733908653259, + "step": 16300 + }, + { + "epoch": 2.3151171043293117, + "grad_norm": 4.423978805541992, + "learning_rate": 9.768601845280341e-05, + "loss": 0.13365116119384765, + "step": 16310 + }, + { + "epoch": 2.3165365507452096, + "grad_norm": 8.303816795349121, + "learning_rate": 9.768459900638751e-05, + "loss": 0.15843117237091064, + "step": 16320 + }, + { + "epoch": 2.317955997161107, + "grad_norm": 0.4200175404548645, + "learning_rate": 9.768317955997162e-05, + "loss": 0.12860283851623536, + "step": 16330 + }, + { + "epoch": 2.319375443577005, + "grad_norm": 0.9817140698432922, + "learning_rate": 9.768176011355572e-05, + "loss": 0.0771494209766388, + "step": 16340 + }, + { + "epoch": 2.3207948899929027, + "grad_norm": 5.904425144195557, + "learning_rate": 9.768034066713983e-05, + "loss": 0.09748343229293824, + "step": 16350 + }, + { + "epoch": 2.3222143364088006, + "grad_norm": 11.307563781738281, + "learning_rate": 9.767892122072391e-05, + "loss": 0.16953353881835936, + "step": 16360 + }, + { + "epoch": 2.3236337828246985, + "grad_norm": 4.251320838928223, + "learning_rate": 9.767750177430802e-05, + "loss": 0.10789685249328614, + "step": 16370 + }, + { + "epoch": 2.3250532292405963, + "grad_norm": 3.149813175201416, + "learning_rate": 9.767608232789212e-05, + "loss": 0.09740127325057983, + "step": 16380 + }, + { + "epoch": 2.326472675656494, + "grad_norm": 9.757298469543457, + "learning_rate": 9.767466288147623e-05, + "loss": 0.16251888275146484, + "step": 16390 + }, + { + "epoch": 2.3278921220723916, + "grad_norm": 3.574176073074341, + "learning_rate": 9.767324343506033e-05, + "loss": 0.08429834246635437, + "step": 16400 + }, + { + "epoch": 2.3293115684882895, + "grad_norm": 3.4276225566864014, + "learning_rate": 9.767182398864443e-05, + "loss": 0.08981868624687195, + "step": 16410 + }, + { + "epoch": 2.3307310149041873, + "grad_norm": 7.491410732269287, + "learning_rate": 9.767040454222854e-05, + "loss": 0.1534734010696411, + "step": 16420 + }, + { + "epoch": 2.332150461320085, + "grad_norm": 7.178809642791748, + "learning_rate": 9.766898509581264e-05, + "loss": 0.13512442111968995, + "step": 16430 + }, + { + "epoch": 2.333569907735983, + "grad_norm": 7.452297687530518, + "learning_rate": 9.766756564939675e-05, + "loss": 0.15903291702270508, + "step": 16440 + }, + { + "epoch": 2.334989354151881, + "grad_norm": 4.820403575897217, + "learning_rate": 9.766614620298084e-05, + "loss": 0.1309017300605774, + "step": 16450 + }, + { + "epoch": 2.3364088005677788, + "grad_norm": 7.638652801513672, + "learning_rate": 9.766472675656494e-05, + "loss": 0.12363828420639038, + "step": 16460 + }, + { + "epoch": 2.337828246983676, + "grad_norm": 9.250051498413086, + "learning_rate": 9.766330731014904e-05, + "loss": 0.15233538150787354, + "step": 16470 + }, + { + "epoch": 2.339247693399574, + "grad_norm": 4.459556579589844, + "learning_rate": 9.766188786373315e-05, + "loss": 0.16799700260162354, + "step": 16480 + }, + { + "epoch": 2.340667139815472, + "grad_norm": 5.2020955085754395, + "learning_rate": 9.766046841731725e-05, + "loss": 0.0790201485157013, + "step": 16490 + }, + { + "epoch": 2.34208658623137, + "grad_norm": 1.893151879310608, + "learning_rate": 9.765904897090136e-05, + "loss": 0.07257702350616455, + "step": 16500 + }, + { + "epoch": 2.34208658623137, + "eval_accuracy": 0.952883576015769, + "eval_loss": 0.14282935857772827, + "eval_runtime": 32.8134, + "eval_samples_per_second": 479.287, + "eval_steps_per_second": 14.994, + "step": 16500 + }, + { + "epoch": 2.3435060326472676, + "grad_norm": 2.2250635623931885, + "learning_rate": 9.765762952448545e-05, + "loss": 0.11261917352676391, + "step": 16510 + }, + { + "epoch": 2.3449254790631655, + "grad_norm": 1.5180538892745972, + "learning_rate": 9.765621007806955e-05, + "loss": 0.08184219598770141, + "step": 16520 + }, + { + "epoch": 2.3463449254790634, + "grad_norm": 3.8298745155334473, + "learning_rate": 9.765479063165366e-05, + "loss": 0.08645458817481995, + "step": 16530 + }, + { + "epoch": 2.347764371894961, + "grad_norm": 3.3084588050842285, + "learning_rate": 9.765337118523776e-05, + "loss": 0.06830872893333435, + "step": 16540 + }, + { + "epoch": 2.3491838183108587, + "grad_norm": 0.7720867395401001, + "learning_rate": 9.765195173882187e-05, + "loss": 0.1192325472831726, + "step": 16550 + }, + { + "epoch": 2.3506032647267565, + "grad_norm": 7.036698341369629, + "learning_rate": 9.765053229240597e-05, + "loss": 0.09893574118614197, + "step": 16560 + }, + { + "epoch": 2.3520227111426544, + "grad_norm": 7.439764499664307, + "learning_rate": 9.764911284599007e-05, + "loss": 0.09484468102455139, + "step": 16570 + }, + { + "epoch": 2.3534421575585522, + "grad_norm": 4.2301435470581055, + "learning_rate": 9.764769339957416e-05, + "loss": 0.11805753707885742, + "step": 16580 + }, + { + "epoch": 2.35486160397445, + "grad_norm": 6.39113712310791, + "learning_rate": 9.764627395315827e-05, + "loss": 0.09725428223609925, + "step": 16590 + }, + { + "epoch": 2.356281050390348, + "grad_norm": 6.1582841873168945, + "learning_rate": 9.764485450674237e-05, + "loss": 0.10667927265167236, + "step": 16600 + }, + { + "epoch": 2.3577004968062454, + "grad_norm": 3.7757277488708496, + "learning_rate": 9.764343506032648e-05, + "loss": 0.12746351957321167, + "step": 16610 + }, + { + "epoch": 2.3591199432221432, + "grad_norm": 5.895532131195068, + "learning_rate": 9.764201561391058e-05, + "loss": 0.13624510765075684, + "step": 16620 + }, + { + "epoch": 2.360539389638041, + "grad_norm": 3.6180717945098877, + "learning_rate": 9.764059616749468e-05, + "loss": 0.12134850025177002, + "step": 16630 + }, + { + "epoch": 2.361958836053939, + "grad_norm": 4.084766864776611, + "learning_rate": 9.763917672107879e-05, + "loss": 0.09110198616981506, + "step": 16640 + }, + { + "epoch": 2.363378282469837, + "grad_norm": 7.207777500152588, + "learning_rate": 9.763775727466289e-05, + "loss": 0.0987035095691681, + "step": 16650 + }, + { + "epoch": 2.3647977288857347, + "grad_norm": 7.370236396789551, + "learning_rate": 9.7636337828247e-05, + "loss": 0.15047061443328857, + "step": 16660 + }, + { + "epoch": 2.3662171753016326, + "grad_norm": 7.778202056884766, + "learning_rate": 9.763491838183108e-05, + "loss": 0.16292293071746827, + "step": 16670 + }, + { + "epoch": 2.36763662171753, + "grad_norm": 3.764970541000366, + "learning_rate": 9.763349893541519e-05, + "loss": 0.12445385456085205, + "step": 16680 + }, + { + "epoch": 2.369056068133428, + "grad_norm": 2.9177567958831787, + "learning_rate": 9.763207948899929e-05, + "loss": 0.12629375457763672, + "step": 16690 + }, + { + "epoch": 2.3704755145493257, + "grad_norm": 4.7777099609375, + "learning_rate": 9.76306600425834e-05, + "loss": 0.10483566522598267, + "step": 16700 + }, + { + "epoch": 2.3718949609652236, + "grad_norm": 2.476802349090576, + "learning_rate": 9.76292405961675e-05, + "loss": 0.07830199003219604, + "step": 16710 + }, + { + "epoch": 2.3733144073811214, + "grad_norm": 6.576395034790039, + "learning_rate": 9.76278211497516e-05, + "loss": 0.12722206115722656, + "step": 16720 + }, + { + "epoch": 2.3747338537970193, + "grad_norm": 1.5219242572784424, + "learning_rate": 9.76264017033357e-05, + "loss": 0.08835641741752624, + "step": 16730 + }, + { + "epoch": 2.376153300212917, + "grad_norm": 2.6990671157836914, + "learning_rate": 9.76249822569198e-05, + "loss": 0.10250411033630372, + "step": 16740 + }, + { + "epoch": 2.3775727466288146, + "grad_norm": 4.596541404724121, + "learning_rate": 9.762356281050391e-05, + "loss": 0.14535219669342042, + "step": 16750 + }, + { + "epoch": 2.3789921930447124, + "grad_norm": 2.865243434906006, + "learning_rate": 9.762214336408801e-05, + "loss": 0.061080020666122434, + "step": 16760 + }, + { + "epoch": 2.3804116394606103, + "grad_norm": 4.850032806396484, + "learning_rate": 9.762072391767211e-05, + "loss": 0.11783115863800049, + "step": 16770 + }, + { + "epoch": 2.381831085876508, + "grad_norm": 1.7372711896896362, + "learning_rate": 9.76193044712562e-05, + "loss": 0.09774195551872253, + "step": 16780 + }, + { + "epoch": 2.383250532292406, + "grad_norm": 7.511697769165039, + "learning_rate": 9.761788502484032e-05, + "loss": 0.1309769868850708, + "step": 16790 + }, + { + "epoch": 2.384669978708304, + "grad_norm": 8.27840805053711, + "learning_rate": 9.761646557842441e-05, + "loss": 0.17970755100250244, + "step": 16800 + }, + { + "epoch": 2.3860894251242017, + "grad_norm": 0.9087435603141785, + "learning_rate": 9.761504613200853e-05, + "loss": 0.07040458917617798, + "step": 16810 + }, + { + "epoch": 2.387508871540099, + "grad_norm": 3.8493130207061768, + "learning_rate": 9.761362668559262e-05, + "loss": 0.11651371717453003, + "step": 16820 + }, + { + "epoch": 2.388928317955997, + "grad_norm": 1.5010507106781006, + "learning_rate": 9.761220723917672e-05, + "loss": 0.08106373548507691, + "step": 16830 + }, + { + "epoch": 2.390347764371895, + "grad_norm": 6.315835475921631, + "learning_rate": 9.761078779276083e-05, + "loss": 0.11155383586883545, + "step": 16840 + }, + { + "epoch": 2.3917672107877928, + "grad_norm": 2.8264517784118652, + "learning_rate": 9.760936834634493e-05, + "loss": 0.12171386480331421, + "step": 16850 + }, + { + "epoch": 2.3931866572036906, + "grad_norm": 2.294635057449341, + "learning_rate": 9.760794889992904e-05, + "loss": 0.1550905466079712, + "step": 16860 + }, + { + "epoch": 2.3946061036195885, + "grad_norm": 4.574626445770264, + "learning_rate": 9.760652945351312e-05, + "loss": 0.11418824195861817, + "step": 16870 + }, + { + "epoch": 2.3960255500354863, + "grad_norm": 2.7776918411254883, + "learning_rate": 9.760511000709723e-05, + "loss": 0.12959576845169068, + "step": 16880 + }, + { + "epoch": 2.3974449964513838, + "grad_norm": 3.4543848037719727, + "learning_rate": 9.760369056068133e-05, + "loss": 0.11354950666427613, + "step": 16890 + }, + { + "epoch": 2.3988644428672816, + "grad_norm": 5.274985313415527, + "learning_rate": 9.760227111426544e-05, + "loss": 0.06138370633125305, + "step": 16900 + }, + { + "epoch": 2.4002838892831795, + "grad_norm": 6.934667110443115, + "learning_rate": 9.760085166784955e-05, + "loss": 0.1329074501991272, + "step": 16910 + }, + { + "epoch": 2.4017033356990773, + "grad_norm": 6.645686626434326, + "learning_rate": 9.759943222143365e-05, + "loss": 0.17836753129959107, + "step": 16920 + }, + { + "epoch": 2.403122782114975, + "grad_norm": 6.251645088195801, + "learning_rate": 9.759801277501775e-05, + "loss": 0.0962505280971527, + "step": 16930 + }, + { + "epoch": 2.404542228530873, + "grad_norm": 5.136745452880859, + "learning_rate": 9.759659332860185e-05, + "loss": 0.08273377418518066, + "step": 16940 + }, + { + "epoch": 2.405961674946771, + "grad_norm": 7.956725120544434, + "learning_rate": 9.759517388218596e-05, + "loss": 0.11856834888458252, + "step": 16950 + }, + { + "epoch": 2.4073811213626684, + "grad_norm": 2.631044387817383, + "learning_rate": 9.759375443577005e-05, + "loss": 0.11917568445205688, + "step": 16960 + }, + { + "epoch": 2.408800567778566, + "grad_norm": 5.937511444091797, + "learning_rate": 9.759233498935416e-05, + "loss": 0.07629096508026123, + "step": 16970 + }, + { + "epoch": 2.410220014194464, + "grad_norm": 5.794412612915039, + "learning_rate": 9.759091554293825e-05, + "loss": 0.1741081953048706, + "step": 16980 + }, + { + "epoch": 2.411639460610362, + "grad_norm": 6.313220977783203, + "learning_rate": 9.758949609652236e-05, + "loss": 0.07898733615875245, + "step": 16990 + }, + { + "epoch": 2.41305890702626, + "grad_norm": 7.137319087982178, + "learning_rate": 9.758807665010647e-05, + "loss": 0.11363914012908935, + "step": 17000 + }, + { + "epoch": 2.41305890702626, + "eval_accuracy": 0.9416926305080435, + "eval_loss": 0.1742754876613617, + "eval_runtime": 31.9943, + "eval_samples_per_second": 491.556, + "eval_steps_per_second": 15.378, + "step": 17000 + }, + { + "epoch": 2.4144783534421577, + "grad_norm": 5.010659217834473, + "learning_rate": 9.758665720369057e-05, + "loss": 0.15786590576171874, + "step": 17010 + }, + { + "epoch": 2.4158977998580555, + "grad_norm": 6.37407112121582, + "learning_rate": 9.758523775727468e-05, + "loss": 0.1406489849090576, + "step": 17020 + }, + { + "epoch": 2.417317246273953, + "grad_norm": 4.527013301849365, + "learning_rate": 9.758381831085876e-05, + "loss": 0.10702930688858033, + "step": 17030 + }, + { + "epoch": 2.418736692689851, + "grad_norm": 2.203209161758423, + "learning_rate": 9.758239886444287e-05, + "loss": 0.21100082397460937, + "step": 17040 + }, + { + "epoch": 2.4201561391057487, + "grad_norm": 2.5778391361236572, + "learning_rate": 9.758097941802697e-05, + "loss": 0.05981506705284119, + "step": 17050 + }, + { + "epoch": 2.4215755855216465, + "grad_norm": 6.347795486450195, + "learning_rate": 9.757955997161108e-05, + "loss": 0.12853623628616334, + "step": 17060 + }, + { + "epoch": 2.4229950319375444, + "grad_norm": 9.994209289550781, + "learning_rate": 9.757814052519518e-05, + "loss": 0.10259546041488647, + "step": 17070 + }, + { + "epoch": 2.4244144783534423, + "grad_norm": 3.367839813232422, + "learning_rate": 9.757672107877928e-05, + "loss": 0.06157753467559814, + "step": 17080 + }, + { + "epoch": 2.42583392476934, + "grad_norm": 3.509408473968506, + "learning_rate": 9.757530163236339e-05, + "loss": 0.08180438876152038, + "step": 17090 + }, + { + "epoch": 2.4272533711852375, + "grad_norm": 4.197175025939941, + "learning_rate": 9.757388218594748e-05, + "loss": 0.14403607845306396, + "step": 17100 + }, + { + "epoch": 2.4286728176011354, + "grad_norm": 4.370192527770996, + "learning_rate": 9.75724627395316e-05, + "loss": 0.16384668350219728, + "step": 17110 + }, + { + "epoch": 2.4300922640170333, + "grad_norm": 3.144803047180176, + "learning_rate": 9.757104329311569e-05, + "loss": 0.08878316283226013, + "step": 17120 + }, + { + "epoch": 2.431511710432931, + "grad_norm": 4.3488593101501465, + "learning_rate": 9.756962384669979e-05, + "loss": 0.17752463817596437, + "step": 17130 + }, + { + "epoch": 2.432931156848829, + "grad_norm": 9.861291885375977, + "learning_rate": 9.756820440028389e-05, + "loss": 0.10461457967758178, + "step": 17140 + }, + { + "epoch": 2.434350603264727, + "grad_norm": 2.252723217010498, + "learning_rate": 9.7566784953868e-05, + "loss": 0.09538206458091736, + "step": 17150 + }, + { + "epoch": 2.4357700496806247, + "grad_norm": 3.788640022277832, + "learning_rate": 9.75653655074521e-05, + "loss": 0.10890170335769653, + "step": 17160 + }, + { + "epoch": 2.437189496096522, + "grad_norm": 8.450477600097656, + "learning_rate": 9.756394606103621e-05, + "loss": 0.1873611330986023, + "step": 17170 + }, + { + "epoch": 2.43860894251242, + "grad_norm": 6.922235012054443, + "learning_rate": 9.75625266146203e-05, + "loss": 0.13029056787490845, + "step": 17180 + }, + { + "epoch": 2.440028388928318, + "grad_norm": 6.11525821685791, + "learning_rate": 9.75611071682044e-05, + "loss": 0.11692187786102295, + "step": 17190 + }, + { + "epoch": 2.4414478353442157, + "grad_norm": 7.727966785430908, + "learning_rate": 9.755968772178851e-05, + "loss": 0.18141931295394897, + "step": 17200 + }, + { + "epoch": 2.4428672817601136, + "grad_norm": 1.1188493967056274, + "learning_rate": 9.755826827537261e-05, + "loss": 0.14119462966918944, + "step": 17210 + }, + { + "epoch": 2.4442867281760114, + "grad_norm": 8.788047790527344, + "learning_rate": 9.755684882895672e-05, + "loss": 0.11063623428344727, + "step": 17220 + }, + { + "epoch": 2.4457061745919093, + "grad_norm": 4.968696117401123, + "learning_rate": 9.75554293825408e-05, + "loss": 0.11871033906936646, + "step": 17230 + }, + { + "epoch": 2.4471256210078067, + "grad_norm": 4.146373271942139, + "learning_rate": 9.755400993612492e-05, + "loss": 0.1038577675819397, + "step": 17240 + }, + { + "epoch": 2.4485450674237046, + "grad_norm": 4.578568458557129, + "learning_rate": 9.755259048970901e-05, + "loss": 0.1644783616065979, + "step": 17250 + }, + { + "epoch": 2.4499645138396025, + "grad_norm": 5.26609992980957, + "learning_rate": 9.755117104329312e-05, + "loss": 0.1413109540939331, + "step": 17260 + }, + { + "epoch": 2.4513839602555003, + "grad_norm": 5.410380840301514, + "learning_rate": 9.754975159687722e-05, + "loss": 0.10622183084487916, + "step": 17270 + }, + { + "epoch": 2.452803406671398, + "grad_norm": 8.643942832946777, + "learning_rate": 9.754833215046133e-05, + "loss": 0.12519901990890503, + "step": 17280 + }, + { + "epoch": 2.454222853087296, + "grad_norm": 5.121556282043457, + "learning_rate": 9.754691270404543e-05, + "loss": 0.1216310977935791, + "step": 17290 + }, + { + "epoch": 2.455642299503194, + "grad_norm": 4.879176139831543, + "learning_rate": 9.754549325762953e-05, + "loss": 0.07838413119316101, + "step": 17300 + }, + { + "epoch": 2.4570617459190913, + "grad_norm": 5.997292518615723, + "learning_rate": 9.754407381121364e-05, + "loss": 0.11862040758132934, + "step": 17310 + }, + { + "epoch": 2.458481192334989, + "grad_norm": 7.370124340057373, + "learning_rate": 9.754265436479774e-05, + "loss": 0.13782591819763185, + "step": 17320 + }, + { + "epoch": 2.459900638750887, + "grad_norm": 3.0784833431243896, + "learning_rate": 9.754123491838185e-05, + "loss": 0.12893285751342773, + "step": 17330 + }, + { + "epoch": 2.461320085166785, + "grad_norm": 4.132889747619629, + "learning_rate": 9.753981547196593e-05, + "loss": 0.1482453465461731, + "step": 17340 + }, + { + "epoch": 2.4627395315826828, + "grad_norm": 4.386025905609131, + "learning_rate": 9.753839602555004e-05, + "loss": 0.08701491355895996, + "step": 17350 + }, + { + "epoch": 2.4641589779985806, + "grad_norm": 7.536581516265869, + "learning_rate": 9.753697657913414e-05, + "loss": 0.1785440683364868, + "step": 17360 + }, + { + "epoch": 2.4655784244144785, + "grad_norm": 4.566206455230713, + "learning_rate": 9.753555713271825e-05, + "loss": 0.07483741641044617, + "step": 17370 + }, + { + "epoch": 2.466997870830376, + "grad_norm": 4.969336032867432, + "learning_rate": 9.753413768630235e-05, + "loss": 0.09664581418037414, + "step": 17380 + }, + { + "epoch": 2.468417317246274, + "grad_norm": 3.3608598709106445, + "learning_rate": 9.753271823988644e-05, + "loss": 0.08268053531646728, + "step": 17390 + }, + { + "epoch": 2.4698367636621716, + "grad_norm": 7.48677396774292, + "learning_rate": 9.753129879347055e-05, + "loss": 0.08111786842346191, + "step": 17400 + }, + { + "epoch": 2.4712562100780695, + "grad_norm": 2.8628151416778564, + "learning_rate": 9.752987934705465e-05, + "loss": 0.09410454630851746, + "step": 17410 + }, + { + "epoch": 2.4726756564939674, + "grad_norm": 5.564269065856934, + "learning_rate": 9.752845990063876e-05, + "loss": 0.09594557881355285, + "step": 17420 + }, + { + "epoch": 2.4740951029098652, + "grad_norm": 0.6636775135993958, + "learning_rate": 9.752704045422286e-05, + "loss": 0.09588454365730285, + "step": 17430 + }, + { + "epoch": 2.475514549325763, + "grad_norm": 6.354304313659668, + "learning_rate": 9.752562100780696e-05, + "loss": 0.10989620685577392, + "step": 17440 + }, + { + "epoch": 2.4769339957416605, + "grad_norm": 3.9579975605010986, + "learning_rate": 9.752434350603266e-05, + "loss": 0.10450366735458375, + "step": 17450 + }, + { + "epoch": 2.4783534421575584, + "grad_norm": 2.8820838928222656, + "learning_rate": 9.752292405961675e-05, + "loss": 0.09479145407676696, + "step": 17460 + }, + { + "epoch": 2.4797728885734562, + "grad_norm": 1.7476080656051636, + "learning_rate": 9.752150461320085e-05, + "loss": 0.12545448541641235, + "step": 17470 + }, + { + "epoch": 2.481192334989354, + "grad_norm": 7.19633150100708, + "learning_rate": 9.752008516678496e-05, + "loss": 0.0939016044139862, + "step": 17480 + }, + { + "epoch": 2.482611781405252, + "grad_norm": 11.924422264099121, + "learning_rate": 9.751866572036906e-05, + "loss": 0.16066315174102783, + "step": 17490 + }, + { + "epoch": 2.48403122782115, + "grad_norm": 2.1974613666534424, + "learning_rate": 9.751724627395317e-05, + "loss": 0.1423601746559143, + "step": 17500 + }, + { + "epoch": 2.48403122782115, + "eval_accuracy": 0.9445539518026325, + "eval_loss": 0.1727043092250824, + "eval_runtime": 32.757, + "eval_samples_per_second": 480.111, + "eval_steps_per_second": 15.02, + "step": 17500 + }, + { + "epoch": 2.4854506742370477, + "grad_norm": 6.336993217468262, + "learning_rate": 9.751582682753725e-05, + "loss": 0.13694591522216798, + "step": 17510 + }, + { + "epoch": 2.486870120652945, + "grad_norm": 4.340056896209717, + "learning_rate": 9.751440738112137e-05, + "loss": 0.1784249186515808, + "step": 17520 + }, + { + "epoch": 2.488289567068843, + "grad_norm": 4.247930526733398, + "learning_rate": 9.751298793470546e-05, + "loss": 0.10544888973236084, + "step": 17530 + }, + { + "epoch": 2.489709013484741, + "grad_norm": 2.6260440349578857, + "learning_rate": 9.751156848828957e-05, + "loss": 0.0672307550907135, + "step": 17540 + }, + { + "epoch": 2.4911284599006387, + "grad_norm": 1.9838597774505615, + "learning_rate": 9.751014904187367e-05, + "loss": 0.14270519018173217, + "step": 17550 + }, + { + "epoch": 2.4925479063165366, + "grad_norm": 1.2045660018920898, + "learning_rate": 9.750872959545777e-05, + "loss": 0.15043948888778685, + "step": 17560 + }, + { + "epoch": 2.4939673527324344, + "grad_norm": 7.862235069274902, + "learning_rate": 9.750731014904188e-05, + "loss": 0.07321544885635375, + "step": 17570 + }, + { + "epoch": 2.4953867991483323, + "grad_norm": 6.350536823272705, + "learning_rate": 9.750589070262598e-05, + "loss": 0.11304857730865478, + "step": 17580 + }, + { + "epoch": 2.49680624556423, + "grad_norm": 0.9608795046806335, + "learning_rate": 9.750447125621009e-05, + "loss": 0.08769638538360595, + "step": 17590 + }, + { + "epoch": 2.4982256919801276, + "grad_norm": 4.455130100250244, + "learning_rate": 9.750305180979418e-05, + "loss": 0.06901848912239075, + "step": 17600 + }, + { + "epoch": 2.4996451383960254, + "grad_norm": 3.236755132675171, + "learning_rate": 9.75016323633783e-05, + "loss": 0.10142921209335327, + "step": 17610 + }, + { + "epoch": 2.5010645848119233, + "grad_norm": 0.9103105068206787, + "learning_rate": 9.750021291696238e-05, + "loss": 0.12128010988235474, + "step": 17620 + }, + { + "epoch": 2.502484031227821, + "grad_norm": 3.3010218143463135, + "learning_rate": 9.749879347054649e-05, + "loss": 0.09445170164108277, + "step": 17630 + }, + { + "epoch": 2.503903477643719, + "grad_norm": 5.537515163421631, + "learning_rate": 9.749737402413059e-05, + "loss": 0.051540815830230714, + "step": 17640 + }, + { + "epoch": 2.505322924059617, + "grad_norm": 6.594273090362549, + "learning_rate": 9.74959545777147e-05, + "loss": 0.11053000688552857, + "step": 17650 + }, + { + "epoch": 2.5067423704755143, + "grad_norm": 6.973751068115234, + "learning_rate": 9.749453513129881e-05, + "loss": 0.17602165937423705, + "step": 17660 + }, + { + "epoch": 2.5081618168914126, + "grad_norm": 1.8898471593856812, + "learning_rate": 9.74931156848829e-05, + "loss": 0.09699593782424927, + "step": 17670 + }, + { + "epoch": 2.50958126330731, + "grad_norm": 8.757147789001465, + "learning_rate": 9.7491696238467e-05, + "loss": 0.09828613996505738, + "step": 17680 + }, + { + "epoch": 2.511000709723208, + "grad_norm": 5.698178291320801, + "learning_rate": 9.74902767920511e-05, + "loss": 0.09792088270187378, + "step": 17690 + }, + { + "epoch": 2.5124201561391057, + "grad_norm": 2.3245534896850586, + "learning_rate": 9.748885734563521e-05, + "loss": 0.08730112314224243, + "step": 17700 + }, + { + "epoch": 2.5138396025550036, + "grad_norm": 3.97782301902771, + "learning_rate": 9.748743789921931e-05, + "loss": 0.09204915165901184, + "step": 17710 + }, + { + "epoch": 2.5152590489709015, + "grad_norm": 2.635392904281616, + "learning_rate": 9.748601845280341e-05, + "loss": 0.08571889400482177, + "step": 17720 + }, + { + "epoch": 2.516678495386799, + "grad_norm": 4.555758476257324, + "learning_rate": 9.74845990063875e-05, + "loss": 0.10614382028579712, + "step": 17730 + }, + { + "epoch": 2.518097941802697, + "grad_norm": 6.458566665649414, + "learning_rate": 9.748317955997162e-05, + "loss": 0.1116061806678772, + "step": 17740 + }, + { + "epoch": 2.5195173882185946, + "grad_norm": 7.498642921447754, + "learning_rate": 9.748176011355573e-05, + "loss": 0.08102936148643494, + "step": 17750 + }, + { + "epoch": 2.5209368346344925, + "grad_norm": 8.974710464477539, + "learning_rate": 9.748034066713982e-05, + "loss": 0.15357725620269774, + "step": 17760 + }, + { + "epoch": 2.5223562810503903, + "grad_norm": 6.158868789672852, + "learning_rate": 9.747892122072392e-05, + "loss": 0.1006664514541626, + "step": 17770 + }, + { + "epoch": 2.523775727466288, + "grad_norm": 0.8831135630607605, + "learning_rate": 9.747750177430802e-05, + "loss": 0.07348037958145141, + "step": 17780 + }, + { + "epoch": 2.525195173882186, + "grad_norm": 8.365797996520996, + "learning_rate": 9.747608232789213e-05, + "loss": 0.09979128241539001, + "step": 17790 + }, + { + "epoch": 2.5266146202980835, + "grad_norm": 13.500819206237793, + "learning_rate": 9.747466288147623e-05, + "loss": 0.13896944522857665, + "step": 17800 + }, + { + "epoch": 2.528034066713982, + "grad_norm": 4.766392230987549, + "learning_rate": 9.747324343506034e-05, + "loss": 0.07932850122451782, + "step": 17810 + }, + { + "epoch": 2.529453513129879, + "grad_norm": 7.3413310050964355, + "learning_rate": 9.747182398864442e-05, + "loss": 0.07950088977813721, + "step": 17820 + }, + { + "epoch": 2.530872959545777, + "grad_norm": 3.8923566341400146, + "learning_rate": 9.747040454222853e-05, + "loss": 0.09398716688156128, + "step": 17830 + }, + { + "epoch": 2.532292405961675, + "grad_norm": 5.209949970245361, + "learning_rate": 9.746898509581264e-05, + "loss": 0.11348887681961059, + "step": 17840 + }, + { + "epoch": 2.533711852377573, + "grad_norm": 8.087526321411133, + "learning_rate": 9.746756564939674e-05, + "loss": 0.13804304599761963, + "step": 17850 + }, + { + "epoch": 2.5351312987934707, + "grad_norm": 4.874515056610107, + "learning_rate": 9.746614620298085e-05, + "loss": 0.12363841533660888, + "step": 17860 + }, + { + "epoch": 2.536550745209368, + "grad_norm": 9.139041900634766, + "learning_rate": 9.746472675656494e-05, + "loss": 0.09068549871444702, + "step": 17870 + }, + { + "epoch": 2.5379701916252664, + "grad_norm": 6.489454746246338, + "learning_rate": 9.746330731014905e-05, + "loss": 0.1587399125099182, + "step": 17880 + }, + { + "epoch": 2.539389638041164, + "grad_norm": 9.474618911743164, + "learning_rate": 9.746188786373314e-05, + "loss": 0.13566343784332274, + "step": 17890 + }, + { + "epoch": 2.5408090844570617, + "grad_norm": 3.8730716705322266, + "learning_rate": 9.746046841731726e-05, + "loss": 0.08422473669052125, + "step": 17900 + }, + { + "epoch": 2.5422285308729595, + "grad_norm": 2.2097864151000977, + "learning_rate": 9.745904897090135e-05, + "loss": 0.13542672395706176, + "step": 17910 + }, + { + "epoch": 2.5436479772888574, + "grad_norm": 15.095120429992676, + "learning_rate": 9.745762952448545e-05, + "loss": 0.14511890411376954, + "step": 17920 + }, + { + "epoch": 2.5450674237047552, + "grad_norm": 12.847689628601074, + "learning_rate": 9.745621007806956e-05, + "loss": 0.0919945478439331, + "step": 17930 + }, + { + "epoch": 2.5464868701206527, + "grad_norm": 2.031590223312378, + "learning_rate": 9.745479063165366e-05, + "loss": 0.13927642107009888, + "step": 17940 + }, + { + "epoch": 2.547906316536551, + "grad_norm": 4.216944694519043, + "learning_rate": 9.745337118523777e-05, + "loss": 0.10198723077774048, + "step": 17950 + }, + { + "epoch": 2.5493257629524484, + "grad_norm": 7.031200408935547, + "learning_rate": 9.745195173882187e-05, + "loss": 0.11566638946533203, + "step": 17960 + }, + { + "epoch": 2.5507452093683463, + "grad_norm": 5.59580135345459, + "learning_rate": 9.745053229240598e-05, + "loss": 0.0891038417816162, + "step": 17970 + }, + { + "epoch": 2.552164655784244, + "grad_norm": 8.706607818603516, + "learning_rate": 9.744911284599006e-05, + "loss": 0.09640666842460632, + "step": 17980 + }, + { + "epoch": 2.553584102200142, + "grad_norm": 3.204340934753418, + "learning_rate": 9.744769339957417e-05, + "loss": 0.10391557216644287, + "step": 17990 + }, + { + "epoch": 2.55500354861604, + "grad_norm": 6.2729573249816895, + "learning_rate": 9.744627395315827e-05, + "loss": 0.11966743469238281, + "step": 18000 + }, + { + "epoch": 2.55500354861604, + "eval_accuracy": 0.9338081007185096, + "eval_loss": 0.1904294788837433, + "eval_runtime": 32.5049, + "eval_samples_per_second": 483.835, + "eval_steps_per_second": 15.136, + "step": 18000 + }, + { + "epoch": 2.5564229950319377, + "grad_norm": 4.016758918762207, + "learning_rate": 9.744485450674238e-05, + "loss": 0.16458499431610107, + "step": 18010 + }, + { + "epoch": 2.5578424414478356, + "grad_norm": 9.767767906188965, + "learning_rate": 9.744343506032648e-05, + "loss": 0.11013137102127075, + "step": 18020 + }, + { + "epoch": 2.559261887863733, + "grad_norm": 10.628437042236328, + "learning_rate": 9.744201561391058e-05, + "loss": 0.1186720848083496, + "step": 18030 + }, + { + "epoch": 2.560681334279631, + "grad_norm": 4.2828545570373535, + "learning_rate": 9.744059616749469e-05, + "loss": 0.11388142108917236, + "step": 18040 + }, + { + "epoch": 2.5621007806955287, + "grad_norm": 5.870272636413574, + "learning_rate": 9.743917672107878e-05, + "loss": 0.09274822473526001, + "step": 18050 + }, + { + "epoch": 2.5635202271114266, + "grad_norm": 1.7781943082809448, + "learning_rate": 9.74377572746629e-05, + "loss": 0.10968050956726075, + "step": 18060 + }, + { + "epoch": 2.5649396735273244, + "grad_norm": 10.247567176818848, + "learning_rate": 9.743633782824699e-05, + "loss": 0.12503312826156615, + "step": 18070 + }, + { + "epoch": 2.5663591199432223, + "grad_norm": 5.602545261383057, + "learning_rate": 9.743491838183109e-05, + "loss": 0.09583965539932252, + "step": 18080 + }, + { + "epoch": 2.56777856635912, + "grad_norm": 1.3222918510437012, + "learning_rate": 9.743349893541519e-05, + "loss": 0.11057568788528442, + "step": 18090 + }, + { + "epoch": 2.5691980127750176, + "grad_norm": 2.3814685344696045, + "learning_rate": 9.74320794889993e-05, + "loss": 0.11936540603637695, + "step": 18100 + }, + { + "epoch": 2.5706174591909154, + "grad_norm": 2.4344863891601562, + "learning_rate": 9.74306600425834e-05, + "loss": 0.0944204032421112, + "step": 18110 + }, + { + "epoch": 2.5720369056068133, + "grad_norm": 8.206236839294434, + "learning_rate": 9.74292405961675e-05, + "loss": 0.08790295124053955, + "step": 18120 + }, + { + "epoch": 2.573456352022711, + "grad_norm": 6.2798566818237305, + "learning_rate": 9.74278211497516e-05, + "loss": 0.13661357164382934, + "step": 18130 + }, + { + "epoch": 2.574875798438609, + "grad_norm": 9.54171085357666, + "learning_rate": 9.74264017033357e-05, + "loss": 0.11890660524368286, + "step": 18140 + }, + { + "epoch": 2.576295244854507, + "grad_norm": 2.0758354663848877, + "learning_rate": 9.742498225691981e-05, + "loss": 0.14780707359313966, + "step": 18150 + }, + { + "epoch": 2.5777146912704048, + "grad_norm": 9.819342613220215, + "learning_rate": 9.742356281050391e-05, + "loss": 0.17009602785110473, + "step": 18160 + }, + { + "epoch": 2.579134137686302, + "grad_norm": 0.4771549105644226, + "learning_rate": 9.742214336408802e-05, + "loss": 0.09668282270431519, + "step": 18170 + }, + { + "epoch": 2.5805535841022, + "grad_norm": 3.620116710662842, + "learning_rate": 9.74207239176721e-05, + "loss": 0.09066780805587768, + "step": 18180 + }, + { + "epoch": 2.581973030518098, + "grad_norm": 2.4723594188690186, + "learning_rate": 9.741930447125621e-05, + "loss": 0.09381983876228332, + "step": 18190 + }, + { + "epoch": 2.5833924769339958, + "grad_norm": 8.35051441192627, + "learning_rate": 9.741788502484031e-05, + "loss": 0.15823612213134766, + "step": 18200 + }, + { + "epoch": 2.5848119233498936, + "grad_norm": 5.235237121582031, + "learning_rate": 9.741646557842442e-05, + "loss": 0.1483514666557312, + "step": 18210 + }, + { + "epoch": 2.5862313697657915, + "grad_norm": 4.181369781494141, + "learning_rate": 9.741504613200852e-05, + "loss": 0.07576992511749267, + "step": 18220 + }, + { + "epoch": 2.5876508161816894, + "grad_norm": 7.384850025177002, + "learning_rate": 9.741362668559262e-05, + "loss": 0.07849894762039185, + "step": 18230 + }, + { + "epoch": 2.5890702625975868, + "grad_norm": 2.344217300415039, + "learning_rate": 9.741220723917673e-05, + "loss": 0.10990880727767945, + "step": 18240 + }, + { + "epoch": 2.5904897090134846, + "grad_norm": 5.363242149353027, + "learning_rate": 9.741078779276083e-05, + "loss": 0.1412426710128784, + "step": 18250 + }, + { + "epoch": 2.5919091554293825, + "grad_norm": 3.7980527877807617, + "learning_rate": 9.740936834634494e-05, + "loss": 0.10421816110610962, + "step": 18260 + }, + { + "epoch": 2.5933286018452804, + "grad_norm": 9.759673118591309, + "learning_rate": 9.740794889992903e-05, + "loss": 0.11693978309631348, + "step": 18270 + }, + { + "epoch": 2.594748048261178, + "grad_norm": 2.0219240188598633, + "learning_rate": 9.740652945351315e-05, + "loss": 0.12884674072265626, + "step": 18280 + }, + { + "epoch": 2.596167494677076, + "grad_norm": 9.535964012145996, + "learning_rate": 9.740511000709723e-05, + "loss": 0.12031383514404297, + "step": 18290 + }, + { + "epoch": 2.597586941092974, + "grad_norm": 5.354515552520752, + "learning_rate": 9.740369056068134e-05, + "loss": 0.0845773994922638, + "step": 18300 + }, + { + "epoch": 2.5990063875088714, + "grad_norm": 1.1112140417099, + "learning_rate": 9.740227111426544e-05, + "loss": 0.1002803087234497, + "step": 18310 + }, + { + "epoch": 2.6004258339247692, + "grad_norm": 2.0215070247650146, + "learning_rate": 9.740085166784955e-05, + "loss": 0.10047941207885742, + "step": 18320 + }, + { + "epoch": 2.601845280340667, + "grad_norm": 6.67712926864624, + "learning_rate": 9.739943222143365e-05, + "loss": 0.13017858266830445, + "step": 18330 + }, + { + "epoch": 2.603264726756565, + "grad_norm": 10.4568452835083, + "learning_rate": 9.739801277501774e-05, + "loss": 0.19226794242858886, + "step": 18340 + }, + { + "epoch": 2.604684173172463, + "grad_norm": 6.936629772186279, + "learning_rate": 9.739659332860185e-05, + "loss": 0.1478518009185791, + "step": 18350 + }, + { + "epoch": 2.6061036195883607, + "grad_norm": 0.7439237236976624, + "learning_rate": 9.739517388218595e-05, + "loss": 0.11475565433502197, + "step": 18360 + }, + { + "epoch": 2.6075230660042585, + "grad_norm": 6.165897369384766, + "learning_rate": 9.739375443577006e-05, + "loss": 0.13509042263031007, + "step": 18370 + }, + { + "epoch": 2.608942512420156, + "grad_norm": 5.026000022888184, + "learning_rate": 9.739233498935416e-05, + "loss": 0.11895132064819336, + "step": 18380 + }, + { + "epoch": 2.610361958836054, + "grad_norm": 4.722821235656738, + "learning_rate": 9.739091554293826e-05, + "loss": 0.15483348369598388, + "step": 18390 + }, + { + "epoch": 2.6117814052519517, + "grad_norm": 4.340688705444336, + "learning_rate": 9.738949609652235e-05, + "loss": 0.09090102910995483, + "step": 18400 + }, + { + "epoch": 2.6132008516678495, + "grad_norm": 1.8677579164505005, + "learning_rate": 9.738807665010647e-05, + "loss": 0.12864718437194825, + "step": 18410 + }, + { + "epoch": 2.6146202980837474, + "grad_norm": 4.120899200439453, + "learning_rate": 9.738665720369056e-05, + "loss": 0.10905364751815796, + "step": 18420 + }, + { + "epoch": 2.6160397444996453, + "grad_norm": 2.1230714321136475, + "learning_rate": 9.738523775727467e-05, + "loss": 0.11330556869506836, + "step": 18430 + }, + { + "epoch": 2.617459190915543, + "grad_norm": 7.033359527587891, + "learning_rate": 9.738381831085877e-05, + "loss": 0.08752457499504089, + "step": 18440 + }, + { + "epoch": 2.6188786373314406, + "grad_norm": 5.958856105804443, + "learning_rate": 9.738239886444287e-05, + "loss": 0.07405679225921631, + "step": 18450 + }, + { + "epoch": 2.6202980837473384, + "grad_norm": 3.3164892196655273, + "learning_rate": 9.738097941802698e-05, + "loss": 0.049712374806404114, + "step": 18460 + }, + { + "epoch": 2.6217175301632363, + "grad_norm": 5.792750358581543, + "learning_rate": 9.737955997161108e-05, + "loss": 0.11241586208343506, + "step": 18470 + }, + { + "epoch": 2.623136976579134, + "grad_norm": 5.713932514190674, + "learning_rate": 9.737814052519519e-05, + "loss": 0.0947425127029419, + "step": 18480 + }, + { + "epoch": 2.624556422995032, + "grad_norm": 5.652758598327637, + "learning_rate": 9.737672107877927e-05, + "loss": 0.09331372976303101, + "step": 18490 + }, + { + "epoch": 2.62597586941093, + "grad_norm": 4.281705856323242, + "learning_rate": 9.737530163236338e-05, + "loss": 0.09365745782852172, + "step": 18500 + }, + { + "epoch": 2.62597586941093, + "eval_accuracy": 0.950721688815413, + "eval_loss": 0.1462646871805191, + "eval_runtime": 34.5176, + "eval_samples_per_second": 455.623, + "eval_steps_per_second": 14.254, + "step": 18500 + }, + { + "epoch": 2.6273953158268277, + "grad_norm": 4.376514911651611, + "learning_rate": 9.737388218594748e-05, + "loss": 0.06313493251800537, + "step": 18510 + }, + { + "epoch": 2.628814762242725, + "grad_norm": 7.006924629211426, + "learning_rate": 9.737246273953159e-05, + "loss": 0.1129868745803833, + "step": 18520 + }, + { + "epoch": 2.630234208658623, + "grad_norm": 6.207458972930908, + "learning_rate": 9.737104329311569e-05, + "loss": 0.15238604545593262, + "step": 18530 + }, + { + "epoch": 2.631653655074521, + "grad_norm": 0.35649651288986206, + "learning_rate": 9.736962384669979e-05, + "loss": 0.1252423644065857, + "step": 18540 + }, + { + "epoch": 2.6330731014904187, + "grad_norm": 4.224631309509277, + "learning_rate": 9.73682044002839e-05, + "loss": 0.11180676221847534, + "step": 18550 + }, + { + "epoch": 2.6344925479063166, + "grad_norm": 6.666781425476074, + "learning_rate": 9.7366784953868e-05, + "loss": 0.09207946062088013, + "step": 18560 + }, + { + "epoch": 2.6359119943222145, + "grad_norm": 5.663329124450684, + "learning_rate": 9.73653655074521e-05, + "loss": 0.09166657328605651, + "step": 18570 + }, + { + "epoch": 2.6373314407381123, + "grad_norm": 4.614907741546631, + "learning_rate": 9.73639460610362e-05, + "loss": 0.08460969924926758, + "step": 18580 + }, + { + "epoch": 2.6387508871540097, + "grad_norm": 4.568515300750732, + "learning_rate": 9.73625266146203e-05, + "loss": 0.0926063060760498, + "step": 18590 + }, + { + "epoch": 2.6401703335699076, + "grad_norm": 4.265593528747559, + "learning_rate": 9.73611071682044e-05, + "loss": 0.14236600399017335, + "step": 18600 + }, + { + "epoch": 2.6415897799858055, + "grad_norm": 3.393044948577881, + "learning_rate": 9.735968772178851e-05, + "loss": 0.06547205448150635, + "step": 18610 + }, + { + "epoch": 2.6430092264017033, + "grad_norm": 2.976576328277588, + "learning_rate": 9.73582682753726e-05, + "loss": 0.07752239108085632, + "step": 18620 + }, + { + "epoch": 2.644428672817601, + "grad_norm": 5.691226959228516, + "learning_rate": 9.735684882895672e-05, + "loss": 0.10452626943588257, + "step": 18630 + }, + { + "epoch": 2.645848119233499, + "grad_norm": 6.348296642303467, + "learning_rate": 9.735542938254081e-05, + "loss": 0.11550105810165405, + "step": 18640 + }, + { + "epoch": 2.647267565649397, + "grad_norm": 9.737822532653809, + "learning_rate": 9.735400993612491e-05, + "loss": 0.12678935527801513, + "step": 18650 + }, + { + "epoch": 2.6486870120652943, + "grad_norm": 1.7993618249893188, + "learning_rate": 9.735259048970902e-05, + "loss": 0.09803841710090637, + "step": 18660 + }, + { + "epoch": 2.650106458481192, + "grad_norm": 5.785006523132324, + "learning_rate": 9.735117104329312e-05, + "loss": 0.12399122714996338, + "step": 18670 + }, + { + "epoch": 2.65152590489709, + "grad_norm": 5.436007976531982, + "learning_rate": 9.734975159687723e-05, + "loss": 0.11214399337768555, + "step": 18680 + }, + { + "epoch": 2.652945351312988, + "grad_norm": 6.046454429626465, + "learning_rate": 9.734833215046133e-05, + "loss": 0.08356254100799561, + "step": 18690 + }, + { + "epoch": 2.654364797728886, + "grad_norm": 7.5290021896362305, + "learning_rate": 9.734691270404542e-05, + "loss": 0.101429283618927, + "step": 18700 + }, + { + "epoch": 2.6557842441447836, + "grad_norm": 3.0168631076812744, + "learning_rate": 9.734549325762952e-05, + "loss": 0.09058440327644349, + "step": 18710 + }, + { + "epoch": 2.6572036905606815, + "grad_norm": 8.676300048828125, + "learning_rate": 9.734407381121363e-05, + "loss": 0.13883825540542602, + "step": 18720 + }, + { + "epoch": 2.658623136976579, + "grad_norm": 10.840899467468262, + "learning_rate": 9.734265436479773e-05, + "loss": 0.11511178016662597, + "step": 18730 + }, + { + "epoch": 2.660042583392477, + "grad_norm": 2.287022113800049, + "learning_rate": 9.734123491838184e-05, + "loss": 0.10089895725250245, + "step": 18740 + }, + { + "epoch": 2.6614620298083747, + "grad_norm": 5.894728183746338, + "learning_rate": 9.733981547196594e-05, + "loss": 0.10052759647369384, + "step": 18750 + }, + { + "epoch": 2.6628814762242725, + "grad_norm": 3.954016923904419, + "learning_rate": 9.733839602555004e-05, + "loss": 0.1216499924659729, + "step": 18760 + }, + { + "epoch": 2.6643009226401704, + "grad_norm": 9.103641510009766, + "learning_rate": 9.733697657913415e-05, + "loss": 0.10710169076919555, + "step": 18770 + }, + { + "epoch": 2.6657203690560682, + "grad_norm": 0.37438610196113586, + "learning_rate": 9.733555713271824e-05, + "loss": 0.08723070025444031, + "step": 18780 + }, + { + "epoch": 2.667139815471966, + "grad_norm": 9.926944732666016, + "learning_rate": 9.733413768630236e-05, + "loss": 0.12807276248931884, + "step": 18790 + }, + { + "epoch": 2.6685592618878635, + "grad_norm": 9.92432689666748, + "learning_rate": 9.733271823988644e-05, + "loss": 0.18386597633361818, + "step": 18800 + }, + { + "epoch": 2.6699787083037614, + "grad_norm": 4.39555549621582, + "learning_rate": 9.733129879347055e-05, + "loss": 0.10847448110580445, + "step": 18810 + }, + { + "epoch": 2.6713981547196592, + "grad_norm": 4.371532440185547, + "learning_rate": 9.732987934705465e-05, + "loss": 0.11950172185897827, + "step": 18820 + }, + { + "epoch": 2.672817601135557, + "grad_norm": 3.7563788890838623, + "learning_rate": 9.732845990063876e-05, + "loss": 0.11064698696136474, + "step": 18830 + }, + { + "epoch": 2.674237047551455, + "grad_norm": 8.16103458404541, + "learning_rate": 9.732704045422286e-05, + "loss": 0.1522403836250305, + "step": 18840 + }, + { + "epoch": 2.675656493967353, + "grad_norm": 2.7513720989227295, + "learning_rate": 9.732562100780695e-05, + "loss": 0.14767955541610717, + "step": 18850 + }, + { + "epoch": 2.6770759403832507, + "grad_norm": 4.588718891143799, + "learning_rate": 9.732420156139106e-05, + "loss": 0.11084201335906982, + "step": 18860 + }, + { + "epoch": 2.678495386799148, + "grad_norm": 3.071213722229004, + "learning_rate": 9.732278211497516e-05, + "loss": 0.15097259283065795, + "step": 18870 + }, + { + "epoch": 2.679914833215046, + "grad_norm": 6.630822658538818, + "learning_rate": 9.732136266855927e-05, + "loss": 0.09166755676269531, + "step": 18880 + }, + { + "epoch": 2.681334279630944, + "grad_norm": 7.124295711517334, + "learning_rate": 9.731994322214337e-05, + "loss": 0.14961253404617308, + "step": 18890 + }, + { + "epoch": 2.6827537260468417, + "grad_norm": 8.885273933410645, + "learning_rate": 9.731852377572747e-05, + "loss": 0.1840854525566101, + "step": 18900 + }, + { + "epoch": 2.6841731724627396, + "grad_norm": 14.617013931274414, + "learning_rate": 9.731710432931156e-05, + "loss": 0.15676331520080566, + "step": 18910 + }, + { + "epoch": 2.6855926188786374, + "grad_norm": 5.9459452629089355, + "learning_rate": 9.731568488289568e-05, + "loss": 0.13418021202087402, + "step": 18920 + }, + { + "epoch": 2.6870120652945353, + "grad_norm": 1.0808570384979248, + "learning_rate": 9.731426543647977e-05, + "loss": 0.15757611989974976, + "step": 18930 + }, + { + "epoch": 2.6884315117104327, + "grad_norm": 1.862561583518982, + "learning_rate": 9.731284599006388e-05, + "loss": 0.09019602537155151, + "step": 18940 + }, + { + "epoch": 2.6898509581263306, + "grad_norm": 2.4577274322509766, + "learning_rate": 9.731142654364798e-05, + "loss": 0.06294019222259521, + "step": 18950 + }, + { + "epoch": 2.6912704045422284, + "grad_norm": 3.2663893699645996, + "learning_rate": 9.731000709723208e-05, + "loss": 0.06696848869323731, + "step": 18960 + }, + { + "epoch": 2.6926898509581263, + "grad_norm": 1.4709694385528564, + "learning_rate": 9.730858765081619e-05, + "loss": 0.061003082990646364, + "step": 18970 + }, + { + "epoch": 2.694109297374024, + "grad_norm": 2.4802117347717285, + "learning_rate": 9.730716820440029e-05, + "loss": 0.10601764917373657, + "step": 18980 + }, + { + "epoch": 2.695528743789922, + "grad_norm": 5.821985244750977, + "learning_rate": 9.73057487579844e-05, + "loss": 0.12596286535263063, + "step": 18990 + }, + { + "epoch": 2.69694819020582, + "grad_norm": 4.4037981033325195, + "learning_rate": 9.73043293115685e-05, + "loss": 0.08721169829368591, + "step": 19000 + }, + { + "epoch": 2.69694819020582, + "eval_accuracy": 0.9462071596617282, + "eval_loss": 0.1497952938079834, + "eval_runtime": 35.1407, + "eval_samples_per_second": 447.544, + "eval_steps_per_second": 14.001, + "step": 19000 + }, + { + "epoch": 2.6983676366217173, + "grad_norm": 5.155467987060547, + "learning_rate": 9.730290986515259e-05, + "loss": 0.16012940406799317, + "step": 19010 + }, + { + "epoch": 2.699787083037615, + "grad_norm": 6.539963245391846, + "learning_rate": 9.730149041873669e-05, + "loss": 0.13179491758346557, + "step": 19020 + }, + { + "epoch": 2.701206529453513, + "grad_norm": 5.117822647094727, + "learning_rate": 9.73000709723208e-05, + "loss": 0.11193997859954834, + "step": 19030 + }, + { + "epoch": 2.702625975869411, + "grad_norm": 13.319026947021484, + "learning_rate": 9.72986515259049e-05, + "loss": 0.06884243488311767, + "step": 19040 + }, + { + "epoch": 2.7040454222853088, + "grad_norm": 12.856066703796387, + "learning_rate": 9.729723207948901e-05, + "loss": 0.11155580282211304, + "step": 19050 + }, + { + "epoch": 2.7054648687012066, + "grad_norm": 3.3367395401000977, + "learning_rate": 9.72958126330731e-05, + "loss": 0.11018801927566528, + "step": 19060 + }, + { + "epoch": 2.7068843151171045, + "grad_norm": 2.5702414512634277, + "learning_rate": 9.72943931866572e-05, + "loss": 0.14847090244293212, + "step": 19070 + }, + { + "epoch": 2.708303761533002, + "grad_norm": 3.5079307556152344, + "learning_rate": 9.729297374024131e-05, + "loss": 0.12648016214370728, + "step": 19080 + }, + { + "epoch": 2.7097232079489, + "grad_norm": 7.1927642822265625, + "learning_rate": 9.729155429382541e-05, + "loss": 0.08001441359519959, + "step": 19090 + }, + { + "epoch": 2.7111426543647976, + "grad_norm": 2.3428845405578613, + "learning_rate": 9.729013484740952e-05, + "loss": 0.07565593719482422, + "step": 19100 + }, + { + "epoch": 2.7125621007806955, + "grad_norm": 5.344996929168701, + "learning_rate": 9.728871540099361e-05, + "loss": 0.06011520624160767, + "step": 19110 + }, + { + "epoch": 2.7139815471965933, + "grad_norm": 3.558228015899658, + "learning_rate": 9.728729595457772e-05, + "loss": 0.13906779289245605, + "step": 19120 + }, + { + "epoch": 2.715400993612491, + "grad_norm": 2.2271339893341064, + "learning_rate": 9.728587650816182e-05, + "loss": 0.06516092419624328, + "step": 19130 + }, + { + "epoch": 2.716820440028389, + "grad_norm": 6.620656490325928, + "learning_rate": 9.728445706174593e-05, + "loss": 0.08588937520980836, + "step": 19140 + }, + { + "epoch": 2.7182398864442865, + "grad_norm": 0.9995052218437195, + "learning_rate": 9.728303761533004e-05, + "loss": 0.07684165835380555, + "step": 19150 + }, + { + "epoch": 2.719659332860185, + "grad_norm": 2.3631653785705566, + "learning_rate": 9.728161816891412e-05, + "loss": 0.08287461400032044, + "step": 19160 + }, + { + "epoch": 2.721078779276082, + "grad_norm": 6.304315567016602, + "learning_rate": 9.728019872249823e-05, + "loss": 0.14411957263946534, + "step": 19170 + }, + { + "epoch": 2.72249822569198, + "grad_norm": 2.651029109954834, + "learning_rate": 9.727877927608233e-05, + "loss": 0.16562498807907106, + "step": 19180 + }, + { + "epoch": 2.723917672107878, + "grad_norm": 1.1602712869644165, + "learning_rate": 9.727735982966644e-05, + "loss": 0.0994363009929657, + "step": 19190 + }, + { + "epoch": 2.725337118523776, + "grad_norm": 2.081709384918213, + "learning_rate": 9.727594038325054e-05, + "loss": 0.1161266803741455, + "step": 19200 + }, + { + "epoch": 2.7267565649396737, + "grad_norm": 5.32574462890625, + "learning_rate": 9.727452093683463e-05, + "loss": 0.11266434192657471, + "step": 19210 + }, + { + "epoch": 2.728176011355571, + "grad_norm": 4.33624267578125, + "learning_rate": 9.727310149041873e-05, + "loss": 0.07457006573677064, + "step": 19220 + }, + { + "epoch": 2.7295954577714694, + "grad_norm": 9.516417503356934, + "learning_rate": 9.727168204400284e-05, + "loss": 0.06251566410064698, + "step": 19230 + }, + { + "epoch": 2.731014904187367, + "grad_norm": 7.441606044769287, + "learning_rate": 9.727026259758695e-05, + "loss": 0.11953941583633423, + "step": 19240 + }, + { + "epoch": 2.7324343506032647, + "grad_norm": 0.9915375113487244, + "learning_rate": 9.726884315117105e-05, + "loss": 0.10013129711151122, + "step": 19250 + }, + { + "epoch": 2.7338537970191625, + "grad_norm": 6.937955379486084, + "learning_rate": 9.726742370475515e-05, + "loss": 0.13717392683029175, + "step": 19260 + }, + { + "epoch": 2.7352732434350604, + "grad_norm": 6.149573802947998, + "learning_rate": 9.726600425833925e-05, + "loss": 0.11093438863754272, + "step": 19270 + }, + { + "epoch": 2.7366926898509583, + "grad_norm": 4.646894454956055, + "learning_rate": 9.726458481192336e-05, + "loss": 0.15733885765075684, + "step": 19280 + }, + { + "epoch": 2.7381121362668557, + "grad_norm": 5.516530513763428, + "learning_rate": 9.726316536550745e-05, + "loss": 0.06147825121879578, + "step": 19290 + }, + { + "epoch": 2.739531582682754, + "grad_norm": 3.121425151824951, + "learning_rate": 9.726174591909157e-05, + "loss": 0.06866928935050964, + "step": 19300 + }, + { + "epoch": 2.7409510290986514, + "grad_norm": 7.502362251281738, + "learning_rate": 9.726032647267565e-05, + "loss": 0.08418467044830322, + "step": 19310 + }, + { + "epoch": 2.7423704755145493, + "grad_norm": 2.791508436203003, + "learning_rate": 9.725890702625976e-05, + "loss": 0.11801939010620117, + "step": 19320 + }, + { + "epoch": 2.743789921930447, + "grad_norm": 7.064516544342041, + "learning_rate": 9.725748757984387e-05, + "loss": 0.13972241878509523, + "step": 19330 + }, + { + "epoch": 2.745209368346345, + "grad_norm": 5.9328932762146, + "learning_rate": 9.725606813342797e-05, + "loss": 0.12251147031784057, + "step": 19340 + }, + { + "epoch": 2.746628814762243, + "grad_norm": 6.175622940063477, + "learning_rate": 9.725464868701208e-05, + "loss": 0.06602987051010131, + "step": 19350 + }, + { + "epoch": 2.7480482611781403, + "grad_norm": 4.53786563873291, + "learning_rate": 9.725322924059618e-05, + "loss": 0.1297551393508911, + "step": 19360 + }, + { + "epoch": 2.7494677075940386, + "grad_norm": 3.098621368408203, + "learning_rate": 9.725180979418027e-05, + "loss": 0.1370749831199646, + "step": 19370 + }, + { + "epoch": 2.750887154009936, + "grad_norm": 3.015416383743286, + "learning_rate": 9.725039034776437e-05, + "loss": 0.12202317714691162, + "step": 19380 + }, + { + "epoch": 2.752306600425834, + "grad_norm": 2.518812656402588, + "learning_rate": 9.724897090134848e-05, + "loss": 0.08936739563941956, + "step": 19390 + }, + { + "epoch": 2.7537260468417317, + "grad_norm": 6.073837757110596, + "learning_rate": 9.724755145493258e-05, + "loss": 0.1370900511741638, + "step": 19400 + }, + { + "epoch": 2.7551454932576296, + "grad_norm": 5.372803211212158, + "learning_rate": 9.724613200851669e-05, + "loss": 0.16160420179367066, + "step": 19410 + }, + { + "epoch": 2.7565649396735274, + "grad_norm": 3.8927814960479736, + "learning_rate": 9.724471256210079e-05, + "loss": 0.18655315637588502, + "step": 19420 + }, + { + "epoch": 2.757984386089425, + "grad_norm": 6.601566314697266, + "learning_rate": 9.724329311568489e-05, + "loss": 0.06503421068191528, + "step": 19430 + }, + { + "epoch": 2.759403832505323, + "grad_norm": 8.965290069580078, + "learning_rate": 9.7241873669269e-05, + "loss": 0.15749263763427734, + "step": 19440 + }, + { + "epoch": 2.7608232789212206, + "grad_norm": 6.057149410247803, + "learning_rate": 9.72404542228531e-05, + "loss": 0.09035987257957459, + "step": 19450 + }, + { + "epoch": 2.7622427253371185, + "grad_norm": 3.8677871227264404, + "learning_rate": 9.72390347764372e-05, + "loss": 0.09661787152290344, + "step": 19460 + }, + { + "epoch": 2.7636621717530163, + "grad_norm": 1.7954285144805908, + "learning_rate": 9.723761533002129e-05, + "loss": 0.11428978443145751, + "step": 19470 + }, + { + "epoch": 2.765081618168914, + "grad_norm": 8.921133041381836, + "learning_rate": 9.72361958836054e-05, + "loss": 0.14268529415130615, + "step": 19480 + }, + { + "epoch": 2.766501064584812, + "grad_norm": 0.6554881930351257, + "learning_rate": 9.72347764371895e-05, + "loss": 0.05844693183898926, + "step": 19490 + }, + { + "epoch": 2.7679205110007095, + "grad_norm": 1.2021902799606323, + "learning_rate": 9.723335699077361e-05, + "loss": 0.048795363306999205, + "step": 19500 + }, + { + "epoch": 2.7679205110007095, + "eval_accuracy": 0.9642016913588097, + "eval_loss": 0.10457975417375565, + "eval_runtime": 32.1695, + "eval_samples_per_second": 488.879, + "eval_steps_per_second": 15.294, + "step": 19500 + }, + { + "epoch": 2.7693399574166078, + "grad_norm": 3.3482987880706787, + "learning_rate": 9.72319375443577e-05, + "loss": 0.08013315200805664, + "step": 19510 + }, + { + "epoch": 2.770759403832505, + "grad_norm": 7.4644036293029785, + "learning_rate": 9.72305180979418e-05, + "loss": 0.12772181034088134, + "step": 19520 + }, + { + "epoch": 2.772178850248403, + "grad_norm": 4.970337390899658, + "learning_rate": 9.722909865152591e-05, + "loss": 0.08325361609458923, + "step": 19530 + }, + { + "epoch": 2.773598296664301, + "grad_norm": 5.109130382537842, + "learning_rate": 9.722767920511001e-05, + "loss": 0.12823007106781006, + "step": 19540 + }, + { + "epoch": 2.7750177430801988, + "grad_norm": Infinity, + "learning_rate": 9.722625975869412e-05, + "loss": 0.07545018792152405, + "step": 19550 + }, + { + "epoch": 2.7764371894960966, + "grad_norm": 2.3274765014648438, + "learning_rate": 9.72249822569198e-05, + "loss": 0.09213562607765198, + "step": 19560 + }, + { + "epoch": 2.777856635911994, + "grad_norm": 1.3119785785675049, + "learning_rate": 9.72235628105039e-05, + "loss": 0.09134193658828735, + "step": 19570 + }, + { + "epoch": 2.7792760823278924, + "grad_norm": 1.7308454513549805, + "learning_rate": 9.722214336408801e-05, + "loss": 0.07336680889129639, + "step": 19580 + }, + { + "epoch": 2.78069552874379, + "grad_norm": 5.1270623207092285, + "learning_rate": 9.722072391767211e-05, + "loss": 0.10246673822402955, + "step": 19590 + }, + { + "epoch": 2.7821149751596876, + "grad_norm": 8.638457298278809, + "learning_rate": 9.721930447125621e-05, + "loss": 0.15175464153289794, + "step": 19600 + }, + { + "epoch": 2.7835344215755855, + "grad_norm": 2.7487826347351074, + "learning_rate": 9.721788502484032e-05, + "loss": 0.09026304483413697, + "step": 19610 + }, + { + "epoch": 2.7849538679914834, + "grad_norm": 1.0804003477096558, + "learning_rate": 9.721646557842442e-05, + "loss": 0.1334142804145813, + "step": 19620 + }, + { + "epoch": 2.7863733144073812, + "grad_norm": 4.871701717376709, + "learning_rate": 9.721504613200853e-05, + "loss": 0.0774698793888092, + "step": 19630 + }, + { + "epoch": 2.7877927608232786, + "grad_norm": 5.122735500335693, + "learning_rate": 9.721362668559261e-05, + "loss": 0.0750051498413086, + "step": 19640 + }, + { + "epoch": 2.789212207239177, + "grad_norm": 4.928715705871582, + "learning_rate": 9.721220723917672e-05, + "loss": 0.10383319854736328, + "step": 19650 + }, + { + "epoch": 2.7906316536550744, + "grad_norm": 4.654665470123291, + "learning_rate": 9.721078779276082e-05, + "loss": 0.07332990169525147, + "step": 19660 + }, + { + "epoch": 2.7920511000709722, + "grad_norm": 9.121614456176758, + "learning_rate": 9.720936834634493e-05, + "loss": 0.17799346446990966, + "step": 19670 + }, + { + "epoch": 2.79347054648687, + "grad_norm": 0.8097667694091797, + "learning_rate": 9.720794889992903e-05, + "loss": 0.13993927240371704, + "step": 19680 + }, + { + "epoch": 2.794889992902768, + "grad_norm": 6.301029682159424, + "learning_rate": 9.720652945351314e-05, + "loss": 0.049062016606330874, + "step": 19690 + }, + { + "epoch": 2.796309439318666, + "grad_norm": 7.916932582855225, + "learning_rate": 9.720511000709724e-05, + "loss": 0.13611079454421998, + "step": 19700 + }, + { + "epoch": 2.7977288857345637, + "grad_norm": 6.278209209442139, + "learning_rate": 9.720369056068134e-05, + "loss": 0.12774984836578368, + "step": 19710 + }, + { + "epoch": 2.7991483321504615, + "grad_norm": 8.645759582519531, + "learning_rate": 9.720227111426545e-05, + "loss": 0.09328774213790894, + "step": 19720 + }, + { + "epoch": 2.800567778566359, + "grad_norm": 3.0282325744628906, + "learning_rate": 9.720085166784954e-05, + "loss": 0.0923624575138092, + "step": 19730 + }, + { + "epoch": 2.801987224982257, + "grad_norm": 4.2578444480896, + "learning_rate": 9.719943222143365e-05, + "loss": 0.09177879095077515, + "step": 19740 + }, + { + "epoch": 2.8034066713981547, + "grad_norm": 7.6798996925354, + "learning_rate": 9.719801277501774e-05, + "loss": 0.12493581771850586, + "step": 19750 + }, + { + "epoch": 2.8048261178140526, + "grad_norm": 4.347507953643799, + "learning_rate": 9.719659332860185e-05, + "loss": 0.09963855147361755, + "step": 19760 + }, + { + "epoch": 2.8062455642299504, + "grad_norm": 4.931194305419922, + "learning_rate": 9.719517388218595e-05, + "loss": 0.07842986583709717, + "step": 19770 + }, + { + "epoch": 2.8076650106458483, + "grad_norm": 4.186477184295654, + "learning_rate": 9.719375443577006e-05, + "loss": 0.12233660221099854, + "step": 19780 + }, + { + "epoch": 2.809084457061746, + "grad_norm": 7.659719944000244, + "learning_rate": 9.719233498935415e-05, + "loss": 0.09655895829200745, + "step": 19790 + }, + { + "epoch": 2.8105039034776436, + "grad_norm": 0.47399571537971497, + "learning_rate": 9.719091554293825e-05, + "loss": 0.07599647045135498, + "step": 19800 + }, + { + "epoch": 2.8119233498935414, + "grad_norm": 4.59540319442749, + "learning_rate": 9.718949609652236e-05, + "loss": 0.07412179708480834, + "step": 19810 + }, + { + "epoch": 2.8133427963094393, + "grad_norm": 8.436945915222168, + "learning_rate": 9.718807665010646e-05, + "loss": 0.10687708854675293, + "step": 19820 + }, + { + "epoch": 2.814762242725337, + "grad_norm": 4.068880081176758, + "learning_rate": 9.718665720369057e-05, + "loss": 0.04072721004486084, + "step": 19830 + }, + { + "epoch": 2.816181689141235, + "grad_norm": 8.406689643859863, + "learning_rate": 9.718523775727467e-05, + "loss": 0.09728883504867554, + "step": 19840 + }, + { + "epoch": 2.817601135557133, + "grad_norm": 2.9611806869506836, + "learning_rate": 9.718381831085877e-05, + "loss": 0.0824375331401825, + "step": 19850 + }, + { + "epoch": 2.8190205819730307, + "grad_norm": 8.75788402557373, + "learning_rate": 9.718239886444286e-05, + "loss": 0.13575732707977295, + "step": 19860 + }, + { + "epoch": 2.820440028388928, + "grad_norm": 2.153355598449707, + "learning_rate": 9.718097941802697e-05, + "loss": 0.0826115369796753, + "step": 19870 + }, + { + "epoch": 2.821859474804826, + "grad_norm": 5.776090145111084, + "learning_rate": 9.717955997161107e-05, + "loss": 0.07727134227752686, + "step": 19880 + }, + { + "epoch": 2.823278921220724, + "grad_norm": 10.297713279724121, + "learning_rate": 9.717814052519518e-05, + "loss": 0.08978387117385864, + "step": 19890 + }, + { + "epoch": 2.8246983676366217, + "grad_norm": 4.710965156555176, + "learning_rate": 9.717672107877928e-05, + "loss": 0.14321819543838502, + "step": 19900 + }, + { + "epoch": 2.8261178140525196, + "grad_norm": 4.13072395324707, + "learning_rate": 9.717530163236338e-05, + "loss": 0.15760390758514403, + "step": 19910 + }, + { + "epoch": 2.8275372604684175, + "grad_norm": 0.497278094291687, + "learning_rate": 9.717388218594749e-05, + "loss": 0.08274838328361511, + "step": 19920 + }, + { + "epoch": 2.8289567068843153, + "grad_norm": 7.707274913787842, + "learning_rate": 9.717246273953159e-05, + "loss": 0.09570494294166565, + "step": 19930 + }, + { + "epoch": 2.8303761533002127, + "grad_norm": 5.368363857269287, + "learning_rate": 9.71710432931157e-05, + "loss": 0.07190582752227784, + "step": 19940 + }, + { + "epoch": 2.8317955997161106, + "grad_norm": 7.027709484100342, + "learning_rate": 9.716962384669978e-05, + "loss": 0.11582446098327637, + "step": 19950 + }, + { + "epoch": 2.8332150461320085, + "grad_norm": 12.213539123535156, + "learning_rate": 9.716820440028389e-05, + "loss": 0.10933125019073486, + "step": 19960 + }, + { + "epoch": 2.8346344925479063, + "grad_norm": 6.922082901000977, + "learning_rate": 9.716678495386799e-05, + "loss": 0.16551480293273926, + "step": 19970 + }, + { + "epoch": 2.836053938963804, + "grad_norm": 3.005093812942505, + "learning_rate": 9.71653655074521e-05, + "loss": 0.09381322860717774, + "step": 19980 + }, + { + "epoch": 2.837473385379702, + "grad_norm": 5.592711925506592, + "learning_rate": 9.716394606103621e-05, + "loss": 0.11934515237808227, + "step": 19990 + }, + { + "epoch": 2.8388928317956, + "grad_norm": 2.7002058029174805, + "learning_rate": 9.71625266146203e-05, + "loss": 0.06390081644058228, + "step": 20000 + }, + { + "epoch": 2.8388928317956, + "eval_accuracy": 0.9484326317797418, + "eval_loss": 0.14968876540660858, + "eval_runtime": 33.8107, + "eval_samples_per_second": 465.148, + "eval_steps_per_second": 14.552, + "step": 20000 + }, + { + "epoch": 2.8403122782114973, + "grad_norm": 4.695428371429443, + "learning_rate": 9.71611071682044e-05, + "loss": 0.11333894729614258, + "step": 20010 + }, + { + "epoch": 2.841731724627395, + "grad_norm": 0.6784132719039917, + "learning_rate": 9.71596877217885e-05, + "loss": 0.09425503015518188, + "step": 20020 + }, + { + "epoch": 2.843151171043293, + "grad_norm": 7.540246963500977, + "learning_rate": 9.715826827537261e-05, + "loss": 0.15037193298339843, + "step": 20030 + }, + { + "epoch": 2.844570617459191, + "grad_norm": 1.3910176753997803, + "learning_rate": 9.715684882895671e-05, + "loss": 0.10529568195343017, + "step": 20040 + }, + { + "epoch": 2.845990063875089, + "grad_norm": 10.363840103149414, + "learning_rate": 9.715542938254082e-05, + "loss": 0.13602850437164307, + "step": 20050 + }, + { + "epoch": 2.8474095102909867, + "grad_norm": 9.801745414733887, + "learning_rate": 9.71540099361249e-05, + "loss": 0.09394903779029846, + "step": 20060 + }, + { + "epoch": 2.8488289567068845, + "grad_norm": 4.273351192474365, + "learning_rate": 9.715259048970902e-05, + "loss": 0.12311586141586303, + "step": 20070 + }, + { + "epoch": 2.850248403122782, + "grad_norm": 11.77322006225586, + "learning_rate": 9.715117104329313e-05, + "loss": 0.12338924407958984, + "step": 20080 + }, + { + "epoch": 2.85166784953868, + "grad_norm": 2.7312419414520264, + "learning_rate": 9.714975159687723e-05, + "loss": 0.06953715085983277, + "step": 20090 + }, + { + "epoch": 2.8530872959545777, + "grad_norm": 5.562644958496094, + "learning_rate": 9.714833215046134e-05, + "loss": 0.06668174266815186, + "step": 20100 + }, + { + "epoch": 2.8545067423704755, + "grad_norm": 6.543910980224609, + "learning_rate": 9.714691270404542e-05, + "loss": 0.11938363313674927, + "step": 20110 + }, + { + "epoch": 2.8559261887863734, + "grad_norm": 1.5311610698699951, + "learning_rate": 9.714549325762953e-05, + "loss": 0.0953073263168335, + "step": 20120 + }, + { + "epoch": 2.8573456352022713, + "grad_norm": 10.13642406463623, + "learning_rate": 9.714407381121363e-05, + "loss": 0.10842293500900269, + "step": 20130 + }, + { + "epoch": 2.858765081618169, + "grad_norm": 6.405614376068115, + "learning_rate": 9.714265436479774e-05, + "loss": 0.18160440921783447, + "step": 20140 + }, + { + "epoch": 2.8601845280340665, + "grad_norm": 8.15994644165039, + "learning_rate": 9.714123491838184e-05, + "loss": 0.15880486965179444, + "step": 20150 + }, + { + "epoch": 2.8616039744499644, + "grad_norm": 9.660137176513672, + "learning_rate": 9.713981547196593e-05, + "loss": 0.1277371048927307, + "step": 20160 + }, + { + "epoch": 2.8630234208658623, + "grad_norm": 13.830092430114746, + "learning_rate": 9.713839602555004e-05, + "loss": 0.12971055507659912, + "step": 20170 + }, + { + "epoch": 2.86444286728176, + "grad_norm": 3.822737455368042, + "learning_rate": 9.713697657913414e-05, + "loss": 0.16139203310012817, + "step": 20180 + }, + { + "epoch": 2.865862313697658, + "grad_norm": 2.0092313289642334, + "learning_rate": 9.713555713271825e-05, + "loss": 0.06620528101921082, + "step": 20190 + }, + { + "epoch": 2.867281760113556, + "grad_norm": 3.479095458984375, + "learning_rate": 9.713413768630235e-05, + "loss": 0.10068619251251221, + "step": 20200 + }, + { + "epoch": 2.8687012065294537, + "grad_norm": 1.8399436473846436, + "learning_rate": 9.713271823988645e-05, + "loss": 0.07809083461761475, + "step": 20210 + }, + { + "epoch": 2.870120652945351, + "grad_norm": 1.2535580396652222, + "learning_rate": 9.713129879347055e-05, + "loss": 0.10528775453567504, + "step": 20220 + }, + { + "epoch": 2.871540099361249, + "grad_norm": 5.34690523147583, + "learning_rate": 9.712987934705466e-05, + "loss": 0.09714440107345582, + "step": 20230 + }, + { + "epoch": 2.872959545777147, + "grad_norm": 3.72548770904541, + "learning_rate": 9.712845990063875e-05, + "loss": 0.05409139394760132, + "step": 20240 + }, + { + "epoch": 2.8743789921930447, + "grad_norm": 4.422288417816162, + "learning_rate": 9.712704045422286e-05, + "loss": 0.0929717779159546, + "step": 20250 + }, + { + "epoch": 2.8757984386089426, + "grad_norm": 1.4169726371765137, + "learning_rate": 9.712562100780696e-05, + "loss": 0.04481082260608673, + "step": 20260 + }, + { + "epoch": 2.8772178850248404, + "grad_norm": 3.0234224796295166, + "learning_rate": 9.712420156139106e-05, + "loss": 0.14665982723236085, + "step": 20270 + }, + { + "epoch": 2.8786373314407383, + "grad_norm": 0.8741635680198669, + "learning_rate": 9.712278211497517e-05, + "loss": 0.057705503702163694, + "step": 20280 + }, + { + "epoch": 2.8800567778566357, + "grad_norm": 1.1250085830688477, + "learning_rate": 9.712136266855927e-05, + "loss": 0.1067537546157837, + "step": 20290 + }, + { + "epoch": 2.8814762242725336, + "grad_norm": 10.388190269470215, + "learning_rate": 9.711994322214338e-05, + "loss": 0.10462450981140137, + "step": 20300 + }, + { + "epoch": 2.8828956706884314, + "grad_norm": 3.0416109561920166, + "learning_rate": 9.711852377572746e-05, + "loss": 0.10544465780258179, + "step": 20310 + }, + { + "epoch": 2.8843151171043293, + "grad_norm": 5.297311782836914, + "learning_rate": 9.711710432931157e-05, + "loss": 0.06729884147644043, + "step": 20320 + }, + { + "epoch": 2.885734563520227, + "grad_norm": 2.5105323791503906, + "learning_rate": 9.711568488289567e-05, + "loss": 0.08199673295021057, + "step": 20330 + }, + { + "epoch": 2.887154009936125, + "grad_norm": 2.514965057373047, + "learning_rate": 9.711426543647978e-05, + "loss": 0.07696297764778137, + "step": 20340 + }, + { + "epoch": 2.888573456352023, + "grad_norm": 2.9623782634735107, + "learning_rate": 9.711284599006388e-05, + "loss": 0.06418653130531311, + "step": 20350 + }, + { + "epoch": 2.8899929027679203, + "grad_norm": 7.9242777824401855, + "learning_rate": 9.711142654364798e-05, + "loss": 0.10036368370056152, + "step": 20360 + }, + { + "epoch": 2.891412349183818, + "grad_norm": 0.3050519526004791, + "learning_rate": 9.711000709723209e-05, + "loss": 0.09710363149642945, + "step": 20370 + }, + { + "epoch": 2.892831795599716, + "grad_norm": 4.167988300323486, + "learning_rate": 9.710858765081618e-05, + "loss": 0.09933966994285584, + "step": 20380 + }, + { + "epoch": 2.894251242015614, + "grad_norm": 4.994990348815918, + "learning_rate": 9.71071682044003e-05, + "loss": 0.14826220273971558, + "step": 20390 + }, + { + "epoch": 2.8956706884315118, + "grad_norm": 5.276573657989502, + "learning_rate": 9.710574875798439e-05, + "loss": 0.06008061766624451, + "step": 20400 + }, + { + "epoch": 2.8970901348474096, + "grad_norm": 1.4481778144836426, + "learning_rate": 9.71043293115685e-05, + "loss": 0.07454321980476379, + "step": 20410 + }, + { + "epoch": 2.8985095812633075, + "grad_norm": 3.215022087097168, + "learning_rate": 9.710290986515259e-05, + "loss": 0.11371394395828247, + "step": 20420 + }, + { + "epoch": 2.899929027679205, + "grad_norm": 7.932292461395264, + "learning_rate": 9.71014904187367e-05, + "loss": 0.1307593822479248, + "step": 20430 + }, + { + "epoch": 2.9013484740951028, + "grad_norm": 3.419353723526001, + "learning_rate": 9.71000709723208e-05, + "loss": 0.11492658853530884, + "step": 20440 + }, + { + "epoch": 2.9027679205110006, + "grad_norm": 1.6420551538467407, + "learning_rate": 9.70986515259049e-05, + "loss": 0.09474117159843445, + "step": 20450 + }, + { + "epoch": 2.9041873669268985, + "grad_norm": 1.5180848836898804, + "learning_rate": 9.7097232079489e-05, + "loss": 0.16010476350784303, + "step": 20460 + }, + { + "epoch": 2.9056068133427964, + "grad_norm": 7.387273788452148, + "learning_rate": 9.70958126330731e-05, + "loss": 0.12979986667633056, + "step": 20470 + }, + { + "epoch": 2.907026259758694, + "grad_norm": 2.0460073947906494, + "learning_rate": 9.709439318665721e-05, + "loss": 0.09822458028793335, + "step": 20480 + }, + { + "epoch": 2.908445706174592, + "grad_norm": 8.7783784866333, + "learning_rate": 9.709297374024131e-05, + "loss": 0.10728850364685058, + "step": 20490 + }, + { + "epoch": 2.9098651525904895, + "grad_norm": 10.74223804473877, + "learning_rate": 9.709155429382542e-05, + "loss": 0.14357963800430298, + "step": 20500 + }, + { + "epoch": 2.9098651525904895, + "eval_accuracy": 0.9639473516881796, + "eval_loss": 0.10387223958969116, + "eval_runtime": 32.6025, + "eval_samples_per_second": 482.386, + "eval_steps_per_second": 15.091, + "step": 20500 + }, + { + "epoch": 2.9112845990063874, + "grad_norm": 6.561285495758057, + "learning_rate": 9.709013484740952e-05, + "loss": 0.09862427711486817, + "step": 20510 + }, + { + "epoch": 2.9127040454222852, + "grad_norm": 9.80976390838623, + "learning_rate": 9.708871540099362e-05, + "loss": 0.10329036712646485, + "step": 20520 + }, + { + "epoch": 2.914123491838183, + "grad_norm": 3.2249553203582764, + "learning_rate": 9.708729595457771e-05, + "loss": 0.09398200511932372, + "step": 20530 + }, + { + "epoch": 2.915542938254081, + "grad_norm": 3.0429465770721436, + "learning_rate": 9.708587650816182e-05, + "loss": 0.08952078223228455, + "step": 20540 + }, + { + "epoch": 2.916962384669979, + "grad_norm": 2.384573459625244, + "learning_rate": 9.708445706174592e-05, + "loss": 0.06835871934890747, + "step": 20550 + }, + { + "epoch": 2.9183818310858767, + "grad_norm": 2.1423826217651367, + "learning_rate": 9.708303761533003e-05, + "loss": 0.07619114518165589, + "step": 20560 + }, + { + "epoch": 2.919801277501774, + "grad_norm": 3.932051181793213, + "learning_rate": 9.708161816891413e-05, + "loss": 0.11714667081832886, + "step": 20570 + }, + { + "epoch": 2.921220723917672, + "grad_norm": 5.277032852172852, + "learning_rate": 9.708019872249823e-05, + "loss": 0.07001240253448486, + "step": 20580 + }, + { + "epoch": 2.92264017033357, + "grad_norm": 5.1413798332214355, + "learning_rate": 9.707877927608234e-05, + "loss": 0.09098179340362549, + "step": 20590 + }, + { + "epoch": 2.9240596167494677, + "grad_norm": 5.64100456237793, + "learning_rate": 9.707735982966644e-05, + "loss": 0.10666381120681763, + "step": 20600 + }, + { + "epoch": 2.9254790631653655, + "grad_norm": 9.501540184020996, + "learning_rate": 9.707594038325055e-05, + "loss": 0.05949283242225647, + "step": 20610 + }, + { + "epoch": 2.9268985095812634, + "grad_norm": 6.489498138427734, + "learning_rate": 9.707452093683463e-05, + "loss": 0.083852881193161, + "step": 20620 + }, + { + "epoch": 2.9283179559971613, + "grad_norm": 1.9999171495437622, + "learning_rate": 9.707310149041874e-05, + "loss": 0.08824072480201721, + "step": 20630 + }, + { + "epoch": 2.9297374024130587, + "grad_norm": 10.467041015625, + "learning_rate": 9.707168204400284e-05, + "loss": 0.22370665073394774, + "step": 20640 + }, + { + "epoch": 2.9311568488289566, + "grad_norm": 3.191193103790283, + "learning_rate": 9.707026259758695e-05, + "loss": 0.08672508597373962, + "step": 20650 + }, + { + "epoch": 2.9325762952448544, + "grad_norm": 8.910825729370117, + "learning_rate": 9.706884315117105e-05, + "loss": 0.09984519481658935, + "step": 20660 + }, + { + "epoch": 2.9339957416607523, + "grad_norm": 5.282776832580566, + "learning_rate": 9.706742370475514e-05, + "loss": 0.12132351398468018, + "step": 20670 + }, + { + "epoch": 2.93541518807665, + "grad_norm": 6.024061679840088, + "learning_rate": 9.706600425833925e-05, + "loss": 0.0702341616153717, + "step": 20680 + }, + { + "epoch": 2.936834634492548, + "grad_norm": 3.016757011413574, + "learning_rate": 9.706458481192335e-05, + "loss": 0.18172093629837036, + "step": 20690 + }, + { + "epoch": 2.938254080908446, + "grad_norm": 6.451714515686035, + "learning_rate": 9.706316536550746e-05, + "loss": 0.16414980888366698, + "step": 20700 + }, + { + "epoch": 2.9396735273243433, + "grad_norm": 3.6543655395507812, + "learning_rate": 9.706174591909156e-05, + "loss": 0.06588509678840637, + "step": 20710 + }, + { + "epoch": 2.941092973740241, + "grad_norm": 2.2044341564178467, + "learning_rate": 9.706032647267566e-05, + "loss": 0.10182955265045165, + "step": 20720 + }, + { + "epoch": 2.942512420156139, + "grad_norm": 4.035127639770508, + "learning_rate": 9.705890702625976e-05, + "loss": 0.08563597202301025, + "step": 20730 + }, + { + "epoch": 2.943931866572037, + "grad_norm": 0.5155683159828186, + "learning_rate": 9.705748757984387e-05, + "loss": 0.061300069093704224, + "step": 20740 + }, + { + "epoch": 2.9453513129879347, + "grad_norm": 5.438033103942871, + "learning_rate": 9.705606813342796e-05, + "loss": 0.10039635896682739, + "step": 20750 + }, + { + "epoch": 2.9467707594038326, + "grad_norm": 4.031142711639404, + "learning_rate": 9.705464868701207e-05, + "loss": 0.13106780052185057, + "step": 20760 + }, + { + "epoch": 2.9481902058197305, + "grad_norm": 1.6434075832366943, + "learning_rate": 9.705322924059617e-05, + "loss": 0.06390889883041381, + "step": 20770 + }, + { + "epoch": 2.949609652235628, + "grad_norm": 0.5606821775436401, + "learning_rate": 9.705180979418027e-05, + "loss": 0.1445988893508911, + "step": 20780 + }, + { + "epoch": 2.9510290986515257, + "grad_norm": 8.509517669677734, + "learning_rate": 9.705039034776438e-05, + "loss": 0.11696761846542358, + "step": 20790 + }, + { + "epoch": 2.9524485450674236, + "grad_norm": 1.219256043434143, + "learning_rate": 9.704897090134848e-05, + "loss": 0.1131407618522644, + "step": 20800 + }, + { + "epoch": 2.9538679914833215, + "grad_norm": 4.903664588928223, + "learning_rate": 9.704755145493259e-05, + "loss": 0.12911027669906616, + "step": 20810 + }, + { + "epoch": 2.9552874378992193, + "grad_norm": 2.6238746643066406, + "learning_rate": 9.704613200851669e-05, + "loss": 0.04189004898071289, + "step": 20820 + }, + { + "epoch": 2.956706884315117, + "grad_norm": 4.9772443771362305, + "learning_rate": 9.704471256210078e-05, + "loss": 0.1641558289527893, + "step": 20830 + }, + { + "epoch": 2.958126330731015, + "grad_norm": 3.766991376876831, + "learning_rate": 9.704329311568488e-05, + "loss": 0.15985740423202516, + "step": 20840 + }, + { + "epoch": 2.9595457771469125, + "grad_norm": 0.883904218673706, + "learning_rate": 9.704187366926899e-05, + "loss": 0.04713291525840759, + "step": 20850 + }, + { + "epoch": 2.9609652235628108, + "grad_norm": 2.7351174354553223, + "learning_rate": 9.704045422285309e-05, + "loss": 0.08329285383224487, + "step": 20860 + }, + { + "epoch": 2.962384669978708, + "grad_norm": 7.424506187438965, + "learning_rate": 9.70390347764372e-05, + "loss": 0.1191827893257141, + "step": 20870 + }, + { + "epoch": 2.963804116394606, + "grad_norm": 2.405928134918213, + "learning_rate": 9.70376153300213e-05, + "loss": 0.03490549027919769, + "step": 20880 + }, + { + "epoch": 2.965223562810504, + "grad_norm": 2.498183488845825, + "learning_rate": 9.70361958836054e-05, + "loss": 0.043746381998062134, + "step": 20890 + }, + { + "epoch": 2.966643009226402, + "grad_norm": 5.296067237854004, + "learning_rate": 9.70347764371895e-05, + "loss": 0.12182191610336304, + "step": 20900 + }, + { + "epoch": 2.9680624556422996, + "grad_norm": 5.240711688995361, + "learning_rate": 9.70333569907736e-05, + "loss": 0.05737144351005554, + "step": 20910 + }, + { + "epoch": 2.969481902058197, + "grad_norm": 9.032751083374023, + "learning_rate": 9.703193754435771e-05, + "loss": 0.10139278173446656, + "step": 20920 + }, + { + "epoch": 2.9709013484740954, + "grad_norm": 8.68384838104248, + "learning_rate": 9.70305180979418e-05, + "loss": 0.10488021373748779, + "step": 20930 + }, + { + "epoch": 2.972320794889993, + "grad_norm": 11.946162223815918, + "learning_rate": 9.702909865152591e-05, + "loss": 0.08643736839294433, + "step": 20940 + }, + { + "epoch": 2.9737402413058907, + "grad_norm": 7.999373435974121, + "learning_rate": 9.702767920511e-05, + "loss": 0.10784640312194824, + "step": 20950 + }, + { + "epoch": 2.9751596877217885, + "grad_norm": 10.503974914550781, + "learning_rate": 9.702625975869412e-05, + "loss": 0.14789512157440185, + "step": 20960 + }, + { + "epoch": 2.9765791341376864, + "grad_norm": 2.7038733959198, + "learning_rate": 9.702484031227821e-05, + "loss": 0.11931388378143311, + "step": 20970 + }, + { + "epoch": 2.9779985805535842, + "grad_norm": 4.435423374176025, + "learning_rate": 9.702342086586231e-05, + "loss": 0.1014961838722229, + "step": 20980 + }, + { + "epoch": 2.9794180269694817, + "grad_norm": 9.037029266357422, + "learning_rate": 9.702200141944642e-05, + "loss": 0.10802547931671143, + "step": 20990 + }, + { + "epoch": 2.98083747338538, + "grad_norm": 1.3593106269836426, + "learning_rate": 9.702058197303052e-05, + "loss": 0.06956174969673157, + "step": 21000 + }, + { + "epoch": 2.98083747338538, + "eval_accuracy": 0.9595599923698099, + "eval_loss": 0.12062688916921616, + "eval_runtime": 33.1203, + "eval_samples_per_second": 474.845, + "eval_steps_per_second": 14.855, + "step": 21000 + }, + { + "epoch": 2.9822569198012774, + "grad_norm": 0.6549391150474548, + "learning_rate": 9.701916252661463e-05, + "loss": 0.13815345764160156, + "step": 21010 + }, + { + "epoch": 2.9836763662171752, + "grad_norm": 6.318053722381592, + "learning_rate": 9.701774308019873e-05, + "loss": 0.126990008354187, + "step": 21020 + }, + { + "epoch": 2.985095812633073, + "grad_norm": 0.9818340539932251, + "learning_rate": 9.701632363378283e-05, + "loss": 0.1425946831703186, + "step": 21030 + }, + { + "epoch": 2.986515259048971, + "grad_norm": 7.161218643188477, + "learning_rate": 9.701490418736692e-05, + "loss": 0.10167466402053833, + "step": 21040 + }, + { + "epoch": 2.987934705464869, + "grad_norm": 2.8544816970825195, + "learning_rate": 9.701348474095103e-05, + "loss": 0.052589023113250734, + "step": 21050 + }, + { + "epoch": 2.9893541518807663, + "grad_norm": 3.788613796234131, + "learning_rate": 9.701206529453513e-05, + "loss": 0.0730807602405548, + "step": 21060 + }, + { + "epoch": 2.9907735982966646, + "grad_norm": 3.1659812927246094, + "learning_rate": 9.701064584811924e-05, + "loss": 0.0667772889137268, + "step": 21070 + }, + { + "epoch": 2.992193044712562, + "grad_norm": 3.7923996448516846, + "learning_rate": 9.700922640170334e-05, + "loss": 0.09958038330078126, + "step": 21080 + }, + { + "epoch": 2.99361249112846, + "grad_norm": 0.3780229985713959, + "learning_rate": 9.700780695528744e-05, + "loss": 0.04535020887851715, + "step": 21090 + }, + { + "epoch": 2.9950319375443577, + "grad_norm": 6.924422264099121, + "learning_rate": 9.700638750887155e-05, + "loss": 0.07231849431991577, + "step": 21100 + }, + { + "epoch": 2.9964513839602556, + "grad_norm": 4.052742958068848, + "learning_rate": 9.700496806245565e-05, + "loss": 0.06652356386184692, + "step": 21110 + }, + { + "epoch": 2.9978708303761534, + "grad_norm": 2.4228880405426025, + "learning_rate": 9.700354861603976e-05, + "loss": 0.13166139125823975, + "step": 21120 + }, + { + "epoch": 2.999290276792051, + "grad_norm": 1.3871126174926758, + "learning_rate": 9.700212916962385e-05, + "loss": 0.0858015775680542, + "step": 21130 + }, + { + "epoch": 3.0007097232079487, + "grad_norm": 4.722600936889648, + "learning_rate": 9.700070972320795e-05, + "loss": 0.12765930891036986, + "step": 21140 + }, + { + "epoch": 3.0021291696238466, + "grad_norm": 1.4345152378082275, + "learning_rate": 9.699929027679205e-05, + "loss": 0.11781737804412842, + "step": 21150 + }, + { + "epoch": 3.0035486160397444, + "grad_norm": 4.4884352684021, + "learning_rate": 9.699787083037616e-05, + "loss": 0.05820587873458862, + "step": 21160 + }, + { + "epoch": 3.0049680624556423, + "grad_norm": 2.4350528717041016, + "learning_rate": 9.699645138396026e-05, + "loss": 0.08642982244491577, + "step": 21170 + }, + { + "epoch": 3.00638750887154, + "grad_norm": 2.5722460746765137, + "learning_rate": 9.699503193754437e-05, + "loss": 0.13995343446731567, + "step": 21180 + }, + { + "epoch": 3.007806955287438, + "grad_norm": 8.12808895111084, + "learning_rate": 9.699361249112846e-05, + "loss": 0.10619027614593506, + "step": 21190 + }, + { + "epoch": 3.009226401703336, + "grad_norm": 9.416518211364746, + "learning_rate": 9.699219304471256e-05, + "loss": 0.10611592531204224, + "step": 21200 + }, + { + "epoch": 3.0106458481192333, + "grad_norm": 2.595517873764038, + "learning_rate": 9.699077359829667e-05, + "loss": 0.03944927752017975, + "step": 21210 + }, + { + "epoch": 3.012065294535131, + "grad_norm": 6.59434175491333, + "learning_rate": 9.698935415188077e-05, + "loss": 0.06297655701637268, + "step": 21220 + }, + { + "epoch": 3.013484740951029, + "grad_norm": 7.814486026763916, + "learning_rate": 9.698793470546488e-05, + "loss": 0.12234771251678467, + "step": 21230 + }, + { + "epoch": 3.014904187366927, + "grad_norm": 3.0475339889526367, + "learning_rate": 9.698651525904897e-05, + "loss": 0.10413910150527954, + "step": 21240 + }, + { + "epoch": 3.0163236337828248, + "grad_norm": 3.0739729404449463, + "learning_rate": 9.698509581263308e-05, + "loss": 0.048439356684684756, + "step": 21250 + }, + { + "epoch": 3.0177430801987226, + "grad_norm": 2.5247795581817627, + "learning_rate": 9.698367636621717e-05, + "loss": 0.08907513618469239, + "step": 21260 + }, + { + "epoch": 3.0191625266146205, + "grad_norm": 1.5360527038574219, + "learning_rate": 9.698225691980128e-05, + "loss": 0.08706284761428833, + "step": 21270 + }, + { + "epoch": 3.020581973030518, + "grad_norm": 2.9414641857147217, + "learning_rate": 9.698083747338538e-05, + "loss": 0.06573014259338379, + "step": 21280 + }, + { + "epoch": 3.0220014194464158, + "grad_norm": 4.994847297668457, + "learning_rate": 9.697941802696948e-05, + "loss": 0.13635185956954957, + "step": 21290 + }, + { + "epoch": 3.0234208658623136, + "grad_norm": 1.8111882209777832, + "learning_rate": 9.697799858055359e-05, + "loss": 0.09840369820594788, + "step": 21300 + }, + { + "epoch": 3.0248403122782115, + "grad_norm": 1.4137115478515625, + "learning_rate": 9.697657913413769e-05, + "loss": 0.136954402923584, + "step": 21310 + }, + { + "epoch": 3.0262597586941093, + "grad_norm": 2.749936819076538, + "learning_rate": 9.697530163236339e-05, + "loss": 0.10054677724838257, + "step": 21320 + }, + { + "epoch": 3.027679205110007, + "grad_norm": 4.701079368591309, + "learning_rate": 9.697388218594748e-05, + "loss": 0.07355481386184692, + "step": 21330 + }, + { + "epoch": 3.029098651525905, + "grad_norm": 4.2811408042907715, + "learning_rate": 9.69724627395316e-05, + "loss": 0.07188469767570496, + "step": 21340 + }, + { + "epoch": 3.0305180979418025, + "grad_norm": 7.573612213134766, + "learning_rate": 9.697104329311569e-05, + "loss": 0.05111314058303833, + "step": 21350 + }, + { + "epoch": 3.0319375443577004, + "grad_norm": 3.0801517963409424, + "learning_rate": 9.696962384669979e-05, + "loss": 0.0739107072353363, + "step": 21360 + }, + { + "epoch": 3.033356990773598, + "grad_norm": 14.997776985168457, + "learning_rate": 9.696820440028389e-05, + "loss": 0.18201708793640137, + "step": 21370 + }, + { + "epoch": 3.034776437189496, + "grad_norm": 8.705801963806152, + "learning_rate": 9.6966784953868e-05, + "loss": 0.09414076805114746, + "step": 21380 + }, + { + "epoch": 3.036195883605394, + "grad_norm": 2.687983751296997, + "learning_rate": 9.69653655074521e-05, + "loss": 0.10116174221038818, + "step": 21390 + }, + { + "epoch": 3.037615330021292, + "grad_norm": 3.300055503845215, + "learning_rate": 9.69639460610362e-05, + "loss": 0.05839415788650513, + "step": 21400 + }, + { + "epoch": 3.0390347764371897, + "grad_norm": 4.883892059326172, + "learning_rate": 9.69625266146203e-05, + "loss": 0.0997147798538208, + "step": 21410 + }, + { + "epoch": 3.040454222853087, + "grad_norm": 4.513243675231934, + "learning_rate": 9.69611071682044e-05, + "loss": 0.053650110960006714, + "step": 21420 + }, + { + "epoch": 3.041873669268985, + "grad_norm": 1.9839102029800415, + "learning_rate": 9.695968772178851e-05, + "loss": 0.1009899377822876, + "step": 21430 + }, + { + "epoch": 3.043293115684883, + "grad_norm": 3.678035259246826, + "learning_rate": 9.695826827537261e-05, + "loss": 0.09355159401893616, + "step": 21440 + }, + { + "epoch": 3.0447125621007807, + "grad_norm": 12.25532054901123, + "learning_rate": 9.695684882895672e-05, + "loss": 0.09784587025642395, + "step": 21450 + }, + { + "epoch": 3.0461320085166785, + "grad_norm": 10.08337688446045, + "learning_rate": 9.695542938254082e-05, + "loss": 0.22380528450012208, + "step": 21460 + }, + { + "epoch": 3.0475514549325764, + "grad_norm": 1.0703997611999512, + "learning_rate": 9.695400993612491e-05, + "loss": 0.03969487845897675, + "step": 21470 + }, + { + "epoch": 3.0489709013484743, + "grad_norm": 2.9388980865478516, + "learning_rate": 9.695259048970901e-05, + "loss": 0.07186501622200012, + "step": 21480 + }, + { + "epoch": 3.0503903477643717, + "grad_norm": 3.5290896892547607, + "learning_rate": 9.695117104329312e-05, + "loss": 0.07260159850120544, + "step": 21490 + }, + { + "epoch": 3.0518097941802695, + "grad_norm": 2.9938881397247314, + "learning_rate": 9.694975159687722e-05, + "loss": 0.09509387612342834, + "step": 21500 + }, + { + "epoch": 3.0518097941802695, + "eval_accuracy": 0.9589877281108921, + "eval_loss": 0.12867264449596405, + "eval_runtime": 31.3789, + "eval_samples_per_second": 501.196, + "eval_steps_per_second": 15.679, + "step": 21500 + }, + { + "epoch": 3.0532292405961674, + "grad_norm": 10.751752853393555, + "learning_rate": 9.694833215046133e-05, + "loss": 0.08080363273620605, + "step": 21510 + }, + { + "epoch": 3.0546486870120653, + "grad_norm": 2.299959659576416, + "learning_rate": 9.694691270404543e-05, + "loss": 0.14854525327682494, + "step": 21520 + }, + { + "epoch": 3.056068133427963, + "grad_norm": 4.220566272735596, + "learning_rate": 9.694549325762953e-05, + "loss": 0.09466566443443299, + "step": 21530 + }, + { + "epoch": 3.057487579843861, + "grad_norm": 6.087703704833984, + "learning_rate": 9.694407381121364e-05, + "loss": 0.09965238571166993, + "step": 21540 + }, + { + "epoch": 3.058907026259759, + "grad_norm": 8.385695457458496, + "learning_rate": 9.694265436479773e-05, + "loss": 0.10562925338745117, + "step": 21550 + }, + { + "epoch": 3.0603264726756567, + "grad_norm": 0.5750550031661987, + "learning_rate": 9.694123491838185e-05, + "loss": 0.07159033417701721, + "step": 21560 + }, + { + "epoch": 3.061745919091554, + "grad_norm": 5.470452308654785, + "learning_rate": 9.693981547196593e-05, + "loss": 0.1067430019378662, + "step": 21570 + }, + { + "epoch": 3.063165365507452, + "grad_norm": 1.6126492023468018, + "learning_rate": 9.693839602555004e-05, + "loss": 0.07778850793838502, + "step": 21580 + }, + { + "epoch": 3.06458481192335, + "grad_norm": 8.54702377319336, + "learning_rate": 9.693697657913414e-05, + "loss": 0.051190412044525145, + "step": 21590 + }, + { + "epoch": 3.0660042583392477, + "grad_norm": 5.9458818435668945, + "learning_rate": 9.693555713271825e-05, + "loss": 0.05976734161376953, + "step": 21600 + }, + { + "epoch": 3.0674237047551456, + "grad_norm": 11.962884902954102, + "learning_rate": 9.693413768630235e-05, + "loss": 0.08366570472717286, + "step": 21610 + }, + { + "epoch": 3.0688431511710434, + "grad_norm": 5.248124122619629, + "learning_rate": 9.693271823988644e-05, + "loss": 0.06071932911872864, + "step": 21620 + }, + { + "epoch": 3.0702625975869413, + "grad_norm": 3.1197493076324463, + "learning_rate": 9.693129879347055e-05, + "loss": 0.08671906590461731, + "step": 21630 + }, + { + "epoch": 3.0716820440028387, + "grad_norm": 6.69197940826416, + "learning_rate": 9.692987934705465e-05, + "loss": 0.0895846426486969, + "step": 21640 + }, + { + "epoch": 3.0731014904187366, + "grad_norm": 1.1883106231689453, + "learning_rate": 9.692845990063876e-05, + "loss": 0.09830948114395141, + "step": 21650 + }, + { + "epoch": 3.0745209368346345, + "grad_norm": 1.1830201148986816, + "learning_rate": 9.692704045422286e-05, + "loss": 0.09011884927749633, + "step": 21660 + }, + { + "epoch": 3.0759403832505323, + "grad_norm": 1.0241851806640625, + "learning_rate": 9.692562100780696e-05, + "loss": 0.11997926235198975, + "step": 21670 + }, + { + "epoch": 3.07735982966643, + "grad_norm": 5.068016052246094, + "learning_rate": 9.692420156139105e-05, + "loss": 0.11507253646850586, + "step": 21680 + }, + { + "epoch": 3.078779276082328, + "grad_norm": 3.562347173690796, + "learning_rate": 9.692278211497517e-05, + "loss": 0.07022674679756165, + "step": 21690 + }, + { + "epoch": 3.080198722498226, + "grad_norm": 7.2673163414001465, + "learning_rate": 9.692136266855926e-05, + "loss": 0.09197630882263183, + "step": 21700 + }, + { + "epoch": 3.0816181689141233, + "grad_norm": 2.2533631324768066, + "learning_rate": 9.691994322214337e-05, + "loss": 0.05809432864189148, + "step": 21710 + }, + { + "epoch": 3.083037615330021, + "grad_norm": 5.0073561668396, + "learning_rate": 9.691852377572747e-05, + "loss": 0.10983726978302003, + "step": 21720 + }, + { + "epoch": 3.084457061745919, + "grad_norm": 8.21857738494873, + "learning_rate": 9.691710432931157e-05, + "loss": 0.06723290681838989, + "step": 21730 + }, + { + "epoch": 3.085876508161817, + "grad_norm": 9.05629825592041, + "learning_rate": 9.691568488289568e-05, + "loss": 0.05822429656982422, + "step": 21740 + }, + { + "epoch": 3.0872959545777148, + "grad_norm": 2.9089202880859375, + "learning_rate": 9.691426543647978e-05, + "loss": 0.062278813123703, + "step": 21750 + }, + { + "epoch": 3.0887154009936126, + "grad_norm": 5.445140838623047, + "learning_rate": 9.691284599006389e-05, + "loss": 0.07242774367332458, + "step": 21760 + }, + { + "epoch": 3.0901348474095105, + "grad_norm": 5.643183708190918, + "learning_rate": 9.691142654364798e-05, + "loss": 0.17729694843292237, + "step": 21770 + }, + { + "epoch": 3.091554293825408, + "grad_norm": 1.2977749109268188, + "learning_rate": 9.691000709723208e-05, + "loss": 0.06676494479179382, + "step": 21780 + }, + { + "epoch": 3.092973740241306, + "grad_norm": 3.805422067642212, + "learning_rate": 9.690858765081618e-05, + "loss": 0.09304124712944031, + "step": 21790 + }, + { + "epoch": 3.0943931866572036, + "grad_norm": 6.814877510070801, + "learning_rate": 9.690716820440029e-05, + "loss": 0.08317658305168152, + "step": 21800 + }, + { + "epoch": 3.0958126330731015, + "grad_norm": 6.4380388259887695, + "learning_rate": 9.690574875798439e-05, + "loss": 0.11440763473510743, + "step": 21810 + }, + { + "epoch": 3.0972320794889994, + "grad_norm": 2.2712135314941406, + "learning_rate": 9.69043293115685e-05, + "loss": 0.05781182050704956, + "step": 21820 + }, + { + "epoch": 3.0986515259048972, + "grad_norm": 2.6996850967407227, + "learning_rate": 9.69029098651526e-05, + "loss": 0.09182395935058593, + "step": 21830 + }, + { + "epoch": 3.100070972320795, + "grad_norm": 3.8571221828460693, + "learning_rate": 9.69014904187367e-05, + "loss": 0.05620205998420715, + "step": 21840 + }, + { + "epoch": 3.1014904187366925, + "grad_norm": 2.1438169479370117, + "learning_rate": 9.69000709723208e-05, + "loss": 0.11742359399795532, + "step": 21850 + }, + { + "epoch": 3.1029098651525904, + "grad_norm": 0.5870881676673889, + "learning_rate": 9.68986515259049e-05, + "loss": 0.10411131381988525, + "step": 21860 + }, + { + "epoch": 3.1043293115684882, + "grad_norm": 3.8963239192962646, + "learning_rate": 9.689723207948901e-05, + "loss": 0.073959881067276, + "step": 21870 + }, + { + "epoch": 3.105748757984386, + "grad_norm": 1.869137167930603, + "learning_rate": 9.68958126330731e-05, + "loss": 0.09284948706626892, + "step": 21880 + }, + { + "epoch": 3.107168204400284, + "grad_norm": 7.974472522735596, + "learning_rate": 9.689439318665721e-05, + "loss": 0.08199034929275513, + "step": 21890 + }, + { + "epoch": 3.108587650816182, + "grad_norm": 5.112462520599365, + "learning_rate": 9.68929737402413e-05, + "loss": 0.04500599205493927, + "step": 21900 + }, + { + "epoch": 3.1100070972320797, + "grad_norm": 4.712485313415527, + "learning_rate": 9.689155429382542e-05, + "loss": 0.08608510494232177, + "step": 21910 + }, + { + "epoch": 3.111426543647977, + "grad_norm": 4.643701553344727, + "learning_rate": 9.689013484740951e-05, + "loss": 0.06371254920959472, + "step": 21920 + }, + { + "epoch": 3.112845990063875, + "grad_norm": 0.6126397252082825, + "learning_rate": 9.688871540099361e-05, + "loss": 0.06569015383720397, + "step": 21930 + }, + { + "epoch": 3.114265436479773, + "grad_norm": 0.9692607522010803, + "learning_rate": 9.688729595457772e-05, + "loss": 0.04018869698047638, + "step": 21940 + }, + { + "epoch": 3.1156848828956707, + "grad_norm": 1.4925132989883423, + "learning_rate": 9.688587650816182e-05, + "loss": 0.12035884857177734, + "step": 21950 + }, + { + "epoch": 3.1171043293115686, + "grad_norm": 8.849794387817383, + "learning_rate": 9.688445706174593e-05, + "loss": 0.10423930883407592, + "step": 21960 + }, + { + "epoch": 3.1185237757274664, + "grad_norm": 0.555972158908844, + "learning_rate": 9.688303761533003e-05, + "loss": 0.036292347311973575, + "step": 21970 + }, + { + "epoch": 3.1199432221433643, + "grad_norm": 1.3053301572799683, + "learning_rate": 9.688161816891412e-05, + "loss": 0.055543911457061765, + "step": 21980 + }, + { + "epoch": 3.1213626685592617, + "grad_norm": 5.318549633026123, + "learning_rate": 9.688019872249822e-05, + "loss": 0.06087319850921631, + "step": 21990 + }, + { + "epoch": 3.1227821149751596, + "grad_norm": 1.2716312408447266, + "learning_rate": 9.687877927608233e-05, + "loss": 0.05343518257141113, + "step": 22000 + }, + { + "epoch": 3.1227821149751596, + "eval_accuracy": 0.9604501812170153, + "eval_loss": 0.12624655663967133, + "eval_runtime": 31.6816, + "eval_samples_per_second": 496.408, + "eval_steps_per_second": 15.53, + "step": 22000 + }, + { + "epoch": 3.1242015613910574, + "grad_norm": 2.791890859603882, + "learning_rate": 9.687735982966643e-05, + "loss": 0.16994814872741698, + "step": 22010 + }, + { + "epoch": 3.1256210078069553, + "grad_norm": 0.727378249168396, + "learning_rate": 9.687594038325054e-05, + "loss": 0.09330202341079712, + "step": 22020 + }, + { + "epoch": 3.127040454222853, + "grad_norm": 2.6088101863861084, + "learning_rate": 9.687452093683464e-05, + "loss": 0.05271919369697571, + "step": 22030 + }, + { + "epoch": 3.128459900638751, + "grad_norm": 5.061529159545898, + "learning_rate": 9.687310149041874e-05, + "loss": 0.1032175898551941, + "step": 22040 + }, + { + "epoch": 3.129879347054649, + "grad_norm": 3.324045419692993, + "learning_rate": 9.687168204400285e-05, + "loss": 0.13030195236206055, + "step": 22050 + }, + { + "epoch": 3.1312987934705463, + "grad_norm": 2.8977231979370117, + "learning_rate": 9.687026259758694e-05, + "loss": 0.04515729248523712, + "step": 22060 + }, + { + "epoch": 3.132718239886444, + "grad_norm": 13.42546272277832, + "learning_rate": 9.686884315117106e-05, + "loss": 0.16047141551971436, + "step": 22070 + }, + { + "epoch": 3.134137686302342, + "grad_norm": 8.009624481201172, + "learning_rate": 9.686742370475514e-05, + "loss": 0.07332398891448974, + "step": 22080 + }, + { + "epoch": 3.13555713271824, + "grad_norm": 1.6250791549682617, + "learning_rate": 9.686600425833925e-05, + "loss": 0.08664785027503967, + "step": 22090 + }, + { + "epoch": 3.1369765791341377, + "grad_norm": 3.961372137069702, + "learning_rate": 9.686458481192335e-05, + "loss": 0.05184776782989502, + "step": 22100 + }, + { + "epoch": 3.1383960255500356, + "grad_norm": 3.3162078857421875, + "learning_rate": 9.686316536550746e-05, + "loss": 0.14172728061676027, + "step": 22110 + }, + { + "epoch": 3.1398154719659335, + "grad_norm": 2.8545219898223877, + "learning_rate": 9.686174591909156e-05, + "loss": 0.12487195730209351, + "step": 22120 + }, + { + "epoch": 3.141234918381831, + "grad_norm": 5.991825580596924, + "learning_rate": 9.686032647267567e-05, + "loss": 0.09468575716018676, + "step": 22130 + }, + { + "epoch": 3.1426543647977287, + "grad_norm": 3.7277402877807617, + "learning_rate": 9.685890702625976e-05, + "loss": 0.15779935121536254, + "step": 22140 + }, + { + "epoch": 3.1440738112136266, + "grad_norm": 5.867143630981445, + "learning_rate": 9.685748757984386e-05, + "loss": 0.06446941494941712, + "step": 22150 + }, + { + "epoch": 3.1454932576295245, + "grad_norm": 0.9702675342559814, + "learning_rate": 9.685606813342797e-05, + "loss": 0.10171631574630738, + "step": 22160 + }, + { + "epoch": 3.1469127040454223, + "grad_norm": 12.031753540039062, + "learning_rate": 9.685464868701207e-05, + "loss": 0.1400713086128235, + "step": 22170 + }, + { + "epoch": 3.14833215046132, + "grad_norm": 3.781707525253296, + "learning_rate": 9.685322924059618e-05, + "loss": 0.05259775519371033, + "step": 22180 + }, + { + "epoch": 3.149751596877218, + "grad_norm": 4.4153642654418945, + "learning_rate": 9.685180979418026e-05, + "loss": 0.10050948858261108, + "step": 22190 + }, + { + "epoch": 3.1511710432931155, + "grad_norm": 2.492379665374756, + "learning_rate": 9.685039034776438e-05, + "loss": 0.13373640775680543, + "step": 22200 + }, + { + "epoch": 3.1525904897090133, + "grad_norm": 8.212589263916016, + "learning_rate": 9.684897090134847e-05, + "loss": 0.0804680585861206, + "step": 22210 + }, + { + "epoch": 3.154009936124911, + "grad_norm": 7.918879508972168, + "learning_rate": 9.684755145493258e-05, + "loss": 0.04239166975021362, + "step": 22220 + }, + { + "epoch": 3.155429382540809, + "grad_norm": 0.38615530729293823, + "learning_rate": 9.68461320085167e-05, + "loss": 0.07814024686813355, + "step": 22230 + }, + { + "epoch": 3.156848828956707, + "grad_norm": 6.945682048797607, + "learning_rate": 9.684471256210078e-05, + "loss": 0.11140685081481934, + "step": 22240 + }, + { + "epoch": 3.158268275372605, + "grad_norm": 5.574148654937744, + "learning_rate": 9.684329311568489e-05, + "loss": 0.12524588108062745, + "step": 22250 + }, + { + "epoch": 3.1596877217885027, + "grad_norm": 2.4712400436401367, + "learning_rate": 9.684187366926899e-05, + "loss": 0.06859158277511597, + "step": 22260 + }, + { + "epoch": 3.1611071682044, + "grad_norm": 11.472119331359863, + "learning_rate": 9.68404542228531e-05, + "loss": 0.07999058961868286, + "step": 22270 + }, + { + "epoch": 3.162526614620298, + "grad_norm": 0.743500828742981, + "learning_rate": 9.68390347764372e-05, + "loss": 0.05272719860076904, + "step": 22280 + }, + { + "epoch": 3.163946061036196, + "grad_norm": 3.228672742843628, + "learning_rate": 9.683761533002129e-05, + "loss": 0.09461968541145324, + "step": 22290 + }, + { + "epoch": 3.1653655074520937, + "grad_norm": 9.705907821655273, + "learning_rate": 9.683619588360539e-05, + "loss": 0.08296184539794922, + "step": 22300 + }, + { + "epoch": 3.1667849538679915, + "grad_norm": 5.514443397521973, + "learning_rate": 9.68347764371895e-05, + "loss": 0.08486506342887878, + "step": 22310 + }, + { + "epoch": 3.1682044002838894, + "grad_norm": 10.679105758666992, + "learning_rate": 9.683335699077361e-05, + "loss": 0.1270732879638672, + "step": 22320 + }, + { + "epoch": 3.1696238466997873, + "grad_norm": 6.348006725311279, + "learning_rate": 9.683193754435771e-05, + "loss": 0.09326770305633544, + "step": 22330 + }, + { + "epoch": 3.1710432931156847, + "grad_norm": 0.7028082609176636, + "learning_rate": 9.68305180979418e-05, + "loss": 0.057895565032958986, + "step": 22340 + }, + { + "epoch": 3.1724627395315825, + "grad_norm": 2.103309392929077, + "learning_rate": 9.68290986515259e-05, + "loss": 0.08313475251197815, + "step": 22350 + }, + { + "epoch": 3.1738821859474804, + "grad_norm": 1.7693034410476685, + "learning_rate": 9.682767920511001e-05, + "loss": 0.07178552150726318, + "step": 22360 + }, + { + "epoch": 3.1753016323633783, + "grad_norm": 1.420407772064209, + "learning_rate": 9.682625975869411e-05, + "loss": 0.1434171199798584, + "step": 22370 + }, + { + "epoch": 3.176721078779276, + "grad_norm": 2.847599744796753, + "learning_rate": 9.682484031227822e-05, + "loss": 0.06267567276954651, + "step": 22380 + }, + { + "epoch": 3.178140525195174, + "grad_norm": 2.813729763031006, + "learning_rate": 9.68234208658623e-05, + "loss": 0.07424157261848449, + "step": 22390 + }, + { + "epoch": 3.179559971611072, + "grad_norm": 7.473203182220459, + "learning_rate": 9.682200141944642e-05, + "loss": 0.11200079917907715, + "step": 22400 + }, + { + "epoch": 3.1809794180269693, + "grad_norm": 6.4801177978515625, + "learning_rate": 9.682058197303053e-05, + "loss": 0.13543713092803955, + "step": 22410 + }, + { + "epoch": 3.182398864442867, + "grad_norm": 3.577303409576416, + "learning_rate": 9.681916252661463e-05, + "loss": 0.11488020420074463, + "step": 22420 + }, + { + "epoch": 3.183818310858765, + "grad_norm": 4.844555377960205, + "learning_rate": 9.681774308019874e-05, + "loss": 0.03927421867847443, + "step": 22430 + }, + { + "epoch": 3.185237757274663, + "grad_norm": 1.6158503293991089, + "learning_rate": 9.681632363378282e-05, + "loss": 0.09847801327705383, + "step": 22440 + }, + { + "epoch": 3.1866572036905607, + "grad_norm": 12.733912467956543, + "learning_rate": 9.681490418736693e-05, + "loss": 0.08998562097549438, + "step": 22450 + }, + { + "epoch": 3.1880766501064586, + "grad_norm": 0.760240912437439, + "learning_rate": 9.681348474095103e-05, + "loss": 0.07409574389457703, + "step": 22460 + }, + { + "epoch": 3.1894960965223564, + "grad_norm": 2.920081377029419, + "learning_rate": 9.681206529453514e-05, + "loss": 0.11183276176452636, + "step": 22470 + }, + { + "epoch": 3.190915542938254, + "grad_norm": 4.768205165863037, + "learning_rate": 9.681064584811924e-05, + "loss": 0.07697643041610717, + "step": 22480 + }, + { + "epoch": 3.1923349893541517, + "grad_norm": 3.8446145057678223, + "learning_rate": 9.680922640170335e-05, + "loss": 0.068821781873703, + "step": 22490 + }, + { + "epoch": 3.1937544357700496, + "grad_norm": 8.481558799743652, + "learning_rate": 9.680780695528745e-05, + "loss": 0.09039323329925537, + "step": 22500 + }, + { + "epoch": 3.1937544357700496, + "eval_accuracy": 0.954791123545495, + "eval_loss": 0.1401221603155136, + "eval_runtime": 31.488, + "eval_samples_per_second": 499.46, + "eval_steps_per_second": 15.625, + "step": 22500 + }, + { + "epoch": 3.1951738821859474, + "grad_norm": 5.633203983306885, + "learning_rate": 9.680638750887154e-05, + "loss": 0.07210381031036377, + "step": 22510 + }, + { + "epoch": 3.1965933286018453, + "grad_norm": 1.863991379737854, + "learning_rate": 9.680496806245565e-05, + "loss": 0.0704656958580017, + "step": 22520 + }, + { + "epoch": 3.198012775017743, + "grad_norm": 0.9419695734977722, + "learning_rate": 9.680354861603975e-05, + "loss": 0.08578440546989441, + "step": 22530 + }, + { + "epoch": 3.199432221433641, + "grad_norm": 9.0354642868042, + "learning_rate": 9.680212916962386e-05, + "loss": 0.10872071981430054, + "step": 22540 + }, + { + "epoch": 3.2008516678495385, + "grad_norm": 3.955871820449829, + "learning_rate": 9.680070972320795e-05, + "loss": 0.05301453471183777, + "step": 22550 + }, + { + "epoch": 3.2022711142654363, + "grad_norm": 9.719240188598633, + "learning_rate": 9.679929027679206e-05, + "loss": 0.1132009506225586, + "step": 22560 + }, + { + "epoch": 3.203690560681334, + "grad_norm": 8.175822257995605, + "learning_rate": 9.679787083037615e-05, + "loss": 0.03667646646499634, + "step": 22570 + }, + { + "epoch": 3.205110007097232, + "grad_norm": 1.2011351585388184, + "learning_rate": 9.679645138396027e-05, + "loss": 0.06343533992767333, + "step": 22580 + }, + { + "epoch": 3.20652945351313, + "grad_norm": 2.916196823120117, + "learning_rate": 9.679503193754436e-05, + "loss": 0.054550164937973024, + "step": 22590 + }, + { + "epoch": 3.2079488999290278, + "grad_norm": 11.839608192443848, + "learning_rate": 9.679361249112846e-05, + "loss": 0.11105455160140991, + "step": 22600 + }, + { + "epoch": 3.2093683463449256, + "grad_norm": 5.120648384094238, + "learning_rate": 9.679219304471257e-05, + "loss": 0.04551963210105896, + "step": 22610 + }, + { + "epoch": 3.210787792760823, + "grad_norm": 2.64894437789917, + "learning_rate": 9.679077359829667e-05, + "loss": 0.07367442846298218, + "step": 22620 + }, + { + "epoch": 3.212207239176721, + "grad_norm": 7.870187759399414, + "learning_rate": 9.678935415188078e-05, + "loss": 0.12482872009277343, + "step": 22630 + }, + { + "epoch": 3.2136266855926188, + "grad_norm": 1.49652898311615, + "learning_rate": 9.678793470546488e-05, + "loss": 0.11122183799743653, + "step": 22640 + }, + { + "epoch": 3.2150461320085166, + "grad_norm": 2.1385059356689453, + "learning_rate": 9.678651525904897e-05, + "loss": 0.08030745387077332, + "step": 22650 + }, + { + "epoch": 3.2164655784244145, + "grad_norm": 5.634016036987305, + "learning_rate": 9.678509581263307e-05, + "loss": 0.135706627368927, + "step": 22660 + }, + { + "epoch": 3.2178850248403124, + "grad_norm": 7.30700159072876, + "learning_rate": 9.678367636621718e-05, + "loss": 0.09824522137641907, + "step": 22670 + }, + { + "epoch": 3.21930447125621, + "grad_norm": 3.9598324298858643, + "learning_rate": 9.678225691980128e-05, + "loss": 0.0592613160610199, + "step": 22680 + }, + { + "epoch": 3.220723917672108, + "grad_norm": 3.0672085285186768, + "learning_rate": 9.678083747338539e-05, + "loss": 0.10512404441833496, + "step": 22690 + }, + { + "epoch": 3.2221433640880055, + "grad_norm": 3.729863405227661, + "learning_rate": 9.677941802696949e-05, + "loss": 0.08016419410705566, + "step": 22700 + }, + { + "epoch": 3.2235628105039034, + "grad_norm": 2.7525126934051514, + "learning_rate": 9.677799858055359e-05, + "loss": 0.062538743019104, + "step": 22710 + }, + { + "epoch": 3.2249822569198012, + "grad_norm": 2.103010892868042, + "learning_rate": 9.67765791341377e-05, + "loss": 0.07154433131217956, + "step": 22720 + }, + { + "epoch": 3.226401703335699, + "grad_norm": 1.3044795989990234, + "learning_rate": 9.67751596877218e-05, + "loss": 0.04868173897266388, + "step": 22730 + }, + { + "epoch": 3.227821149751597, + "grad_norm": 0.34033793210983276, + "learning_rate": 9.67737402413059e-05, + "loss": 0.06057687401771546, + "step": 22740 + }, + { + "epoch": 3.229240596167495, + "grad_norm": 14.895809173583984, + "learning_rate": 9.677232079488999e-05, + "loss": 0.14414306879043579, + "step": 22750 + }, + { + "epoch": 3.2306600425833927, + "grad_norm": 2.03631329536438, + "learning_rate": 9.67709013484741e-05, + "loss": 0.03532655239105224, + "step": 22760 + }, + { + "epoch": 3.23207948899929, + "grad_norm": 1.9289063215255737, + "learning_rate": 9.67694819020582e-05, + "loss": 0.04410083889961243, + "step": 22770 + }, + { + "epoch": 3.233498935415188, + "grad_norm": 8.339526176452637, + "learning_rate": 9.676806245564231e-05, + "loss": 0.07176212072372437, + "step": 22780 + }, + { + "epoch": 3.234918381831086, + "grad_norm": 6.541379928588867, + "learning_rate": 9.67666430092264e-05, + "loss": 0.08053820133209229, + "step": 22790 + }, + { + "epoch": 3.2363378282469837, + "grad_norm": 3.6586859226226807, + "learning_rate": 9.67652235628105e-05, + "loss": 0.04074668884277344, + "step": 22800 + }, + { + "epoch": 3.2377572746628815, + "grad_norm": 0.39181602001190186, + "learning_rate": 9.676380411639461e-05, + "loss": 0.06584768891334533, + "step": 22810 + }, + { + "epoch": 3.2391767210787794, + "grad_norm": 4.53519868850708, + "learning_rate": 9.676238466997871e-05, + "loss": 0.10924329757690429, + "step": 22820 + }, + { + "epoch": 3.2405961674946773, + "grad_norm": 5.562971591949463, + "learning_rate": 9.676096522356282e-05, + "loss": 0.12216780185699463, + "step": 22830 + }, + { + "epoch": 3.2420156139105747, + "grad_norm": 9.106098175048828, + "learning_rate": 9.675954577714692e-05, + "loss": 0.09589399695396424, + "step": 22840 + }, + { + "epoch": 3.2434350603264726, + "grad_norm": 8.574522972106934, + "learning_rate": 9.675812633073103e-05, + "loss": 0.0823745608329773, + "step": 22850 + }, + { + "epoch": 3.2448545067423704, + "grad_norm": 8.706705093383789, + "learning_rate": 9.675670688431511e-05, + "loss": 0.14104554653167725, + "step": 22860 + }, + { + "epoch": 3.2462739531582683, + "grad_norm": 8.810419082641602, + "learning_rate": 9.675528743789922e-05, + "loss": 0.05990390777587891, + "step": 22870 + }, + { + "epoch": 3.247693399574166, + "grad_norm": 4.165992736816406, + "learning_rate": 9.675386799148332e-05, + "loss": 0.0668636441230774, + "step": 22880 + }, + { + "epoch": 3.249112845990064, + "grad_norm": 9.099569320678711, + "learning_rate": 9.675244854506743e-05, + "loss": 0.06936246156692505, + "step": 22890 + }, + { + "epoch": 3.250532292405962, + "grad_norm": 4.4353132247924805, + "learning_rate": 9.675102909865153e-05, + "loss": 0.06273015737533569, + "step": 22900 + }, + { + "epoch": 3.2519517388218593, + "grad_norm": 1.2650339603424072, + "learning_rate": 9.674960965223563e-05, + "loss": 0.06168818473815918, + "step": 22910 + }, + { + "epoch": 3.253371185237757, + "grad_norm": 4.567782402038574, + "learning_rate": 9.674819020581974e-05, + "loss": 0.10136575698852539, + "step": 22920 + }, + { + "epoch": 3.254790631653655, + "grad_norm": 6.448585510253906, + "learning_rate": 9.674677075940384e-05, + "loss": 0.07393231987953186, + "step": 22930 + }, + { + "epoch": 3.256210078069553, + "grad_norm": 10.017446517944336, + "learning_rate": 9.674535131298795e-05, + "loss": 0.10242644548416138, + "step": 22940 + }, + { + "epoch": 3.2576295244854507, + "grad_norm": 3.191063404083252, + "learning_rate": 9.674393186657204e-05, + "loss": 0.047987133264541626, + "step": 22950 + }, + { + "epoch": 3.2590489709013486, + "grad_norm": 3.556180477142334, + "learning_rate": 9.674251242015614e-05, + "loss": 0.047191986441612245, + "step": 22960 + }, + { + "epoch": 3.2604684173172465, + "grad_norm": 1.7208983898162842, + "learning_rate": 9.674109297374024e-05, + "loss": 0.08717820644378663, + "step": 22970 + }, + { + "epoch": 3.2618878637331443, + "grad_norm": 5.613543510437012, + "learning_rate": 9.673967352732435e-05, + "loss": 0.11286189556121826, + "step": 22980 + }, + { + "epoch": 3.2633073101490417, + "grad_norm": 5.163478374481201, + "learning_rate": 9.673825408090845e-05, + "loss": 0.11744798421859741, + "step": 22990 + }, + { + "epoch": 3.2647267565649396, + "grad_norm": 3.8311023712158203, + "learning_rate": 9.673683463449256e-05, + "loss": 0.0839583694934845, + "step": 23000 + }, + { + "epoch": 3.2647267565649396, + "eval_accuracy": 0.9688433903478095, + "eval_loss": 0.09206999838352203, + "eval_runtime": 32.6805, + "eval_samples_per_second": 481.235, + "eval_steps_per_second": 15.055, + "step": 23000 + }, + { + "epoch": 3.2661462029808375, + "grad_norm": 6.961423873901367, + "learning_rate": 9.673541518807666e-05, + "loss": 0.060645246505737306, + "step": 23010 + }, + { + "epoch": 3.2675656493967353, + "grad_norm": 4.491827011108398, + "learning_rate": 9.673399574166075e-05, + "loss": 0.060946452617645266, + "step": 23020 + }, + { + "epoch": 3.268985095812633, + "grad_norm": 8.529021263122559, + "learning_rate": 9.673257629524486e-05, + "loss": 0.0623835563659668, + "step": 23030 + }, + { + "epoch": 3.270404542228531, + "grad_norm": 7.560174942016602, + "learning_rate": 9.673115684882896e-05, + "loss": 0.05246782898902893, + "step": 23040 + }, + { + "epoch": 3.271823988644429, + "grad_norm": 5.852350234985352, + "learning_rate": 9.672973740241307e-05, + "loss": 0.1177408218383789, + "step": 23050 + }, + { + "epoch": 3.2732434350603263, + "grad_norm": 2.9898064136505127, + "learning_rate": 9.672831795599716e-05, + "loss": 0.10251556634902954, + "step": 23060 + }, + { + "epoch": 3.274662881476224, + "grad_norm": 0.7350359559059143, + "learning_rate": 9.672689850958127e-05, + "loss": 0.08793265223503113, + "step": 23070 + }, + { + "epoch": 3.276082327892122, + "grad_norm": 7.976613998413086, + "learning_rate": 9.672547906316536e-05, + "loss": 0.11746323108673096, + "step": 23080 + }, + { + "epoch": 3.27750177430802, + "grad_norm": 5.30941915512085, + "learning_rate": 9.672405961674948e-05, + "loss": 0.1818032145500183, + "step": 23090 + }, + { + "epoch": 3.278921220723918, + "grad_norm": 4.999229907989502, + "learning_rate": 9.672264017033357e-05, + "loss": 0.05894123911857605, + "step": 23100 + }, + { + "epoch": 3.2803406671398156, + "grad_norm": 5.794082164764404, + "learning_rate": 9.672122072391767e-05, + "loss": 0.081751549243927, + "step": 23110 + }, + { + "epoch": 3.2817601135557135, + "grad_norm": 2.565143346786499, + "learning_rate": 9.671980127750178e-05, + "loss": 0.07524069547653198, + "step": 23120 + }, + { + "epoch": 3.283179559971611, + "grad_norm": 4.894937038421631, + "learning_rate": 9.671838183108588e-05, + "loss": 0.12828075885772705, + "step": 23130 + }, + { + "epoch": 3.284599006387509, + "grad_norm": 6.212746620178223, + "learning_rate": 9.671696238466999e-05, + "loss": 0.1400521755218506, + "step": 23140 + }, + { + "epoch": 3.2860184528034067, + "grad_norm": 4.2761921882629395, + "learning_rate": 9.671554293825409e-05, + "loss": 0.09644685983657837, + "step": 23150 + }, + { + "epoch": 3.2874378992193045, + "grad_norm": 16.000354766845703, + "learning_rate": 9.671412349183818e-05, + "loss": 0.11152185201644897, + "step": 23160 + }, + { + "epoch": 3.2888573456352024, + "grad_norm": 6.135869026184082, + "learning_rate": 9.671270404542228e-05, + "loss": 0.0771405816078186, + "step": 23170 + }, + { + "epoch": 3.2902767920511002, + "grad_norm": 1.4707847833633423, + "learning_rate": 9.671128459900639e-05, + "loss": 0.09533407092094422, + "step": 23180 + }, + { + "epoch": 3.291696238466998, + "grad_norm": 1.1678895950317383, + "learning_rate": 9.670986515259049e-05, + "loss": 0.09652703404426574, + "step": 23190 + }, + { + "epoch": 3.2931156848828955, + "grad_norm": 3.4155921936035156, + "learning_rate": 9.67084457061746e-05, + "loss": 0.04331456124782562, + "step": 23200 + }, + { + "epoch": 3.2945351312987934, + "grad_norm": 3.263784408569336, + "learning_rate": 9.67070262597587e-05, + "loss": 0.12196718454360962, + "step": 23210 + }, + { + "epoch": 3.2959545777146912, + "grad_norm": 0.8338903188705444, + "learning_rate": 9.67056068133428e-05, + "loss": 0.08930212855339051, + "step": 23220 + }, + { + "epoch": 3.297374024130589, + "grad_norm": 2.3964731693267822, + "learning_rate": 9.67041873669269e-05, + "loss": 0.061741960048675534, + "step": 23230 + }, + { + "epoch": 3.298793470546487, + "grad_norm": 9.600022315979004, + "learning_rate": 9.6702767920511e-05, + "loss": 0.1284554719924927, + "step": 23240 + }, + { + "epoch": 3.300212916962385, + "grad_norm": 1.0024387836456299, + "learning_rate": 9.670134847409511e-05, + "loss": 0.09108211994171142, + "step": 23250 + }, + { + "epoch": 3.3016323633782827, + "grad_norm": 4.58043098449707, + "learning_rate": 9.669992902767921e-05, + "loss": 0.10650498867034912, + "step": 23260 + }, + { + "epoch": 3.30305180979418, + "grad_norm": 3.778592824935913, + "learning_rate": 9.669850958126331e-05, + "loss": 0.0809212327003479, + "step": 23270 + }, + { + "epoch": 3.304471256210078, + "grad_norm": 2.984292984008789, + "learning_rate": 9.669709013484741e-05, + "loss": 0.0674120306968689, + "step": 23280 + }, + { + "epoch": 3.305890702625976, + "grad_norm": 2.295304298400879, + "learning_rate": 9.669567068843152e-05, + "loss": 0.04605483114719391, + "step": 23290 + }, + { + "epoch": 3.3073101490418737, + "grad_norm": 5.067991256713867, + "learning_rate": 9.669425124201562e-05, + "loss": 0.07464765906333923, + "step": 23300 + }, + { + "epoch": 3.3087295954577716, + "grad_norm": 0.5175068974494934, + "learning_rate": 9.669283179559973e-05, + "loss": 0.10126523971557617, + "step": 23310 + }, + { + "epoch": 3.3101490418736694, + "grad_norm": 0.7718493938446045, + "learning_rate": 9.669141234918382e-05, + "loss": 0.1306004047393799, + "step": 23320 + }, + { + "epoch": 3.3115684882895673, + "grad_norm": 0.4733130931854248, + "learning_rate": 9.668999290276792e-05, + "loss": 0.07524165511131287, + "step": 23330 + }, + { + "epoch": 3.3129879347054647, + "grad_norm": 1.91227388381958, + "learning_rate": 9.668857345635203e-05, + "loss": 0.10234876871109008, + "step": 23340 + }, + { + "epoch": 3.3144073811213626, + "grad_norm": 3.8604981899261475, + "learning_rate": 9.668715400993613e-05, + "loss": 0.08232152462005615, + "step": 23350 + }, + { + "epoch": 3.3158268275372604, + "grad_norm": 4.264747619628906, + "learning_rate": 9.668573456352024e-05, + "loss": 0.08970657587051392, + "step": 23360 + }, + { + "epoch": 3.3172462739531583, + "grad_norm": 8.413162231445312, + "learning_rate": 9.668431511710432e-05, + "loss": 0.0798837423324585, + "step": 23370 + }, + { + "epoch": 3.318665720369056, + "grad_norm": 6.562158107757568, + "learning_rate": 9.668289567068843e-05, + "loss": 0.1796337842941284, + "step": 23380 + }, + { + "epoch": 3.320085166784954, + "grad_norm": 6.798343658447266, + "learning_rate": 9.668147622427253e-05, + "loss": 0.13204431533813477, + "step": 23390 + }, + { + "epoch": 3.321504613200852, + "grad_norm": 7.170462131500244, + "learning_rate": 9.668005677785664e-05, + "loss": 0.082490473985672, + "step": 23400 + }, + { + "epoch": 3.3229240596167493, + "grad_norm": 1.1640955209732056, + "learning_rate": 9.667863733144074e-05, + "loss": 0.11552011966705322, + "step": 23410 + }, + { + "epoch": 3.324343506032647, + "grad_norm": 3.5345652103424072, + "learning_rate": 9.667721788502484e-05, + "loss": 0.07584733963012695, + "step": 23420 + }, + { + "epoch": 3.325762952448545, + "grad_norm": 1.844787836074829, + "learning_rate": 9.667579843860895e-05, + "loss": 0.09344690442085266, + "step": 23430 + }, + { + "epoch": 3.327182398864443, + "grad_norm": 2.403691053390503, + "learning_rate": 9.667437899219305e-05, + "loss": 0.057882833480834964, + "step": 23440 + }, + { + "epoch": 3.3286018452803408, + "grad_norm": 2.586052894592285, + "learning_rate": 9.667295954577716e-05, + "loss": 0.07656934261322021, + "step": 23450 + }, + { + "epoch": 3.3300212916962386, + "grad_norm": 0.33396223187446594, + "learning_rate": 9.667154009936125e-05, + "loss": 0.08143852353096008, + "step": 23460 + }, + { + "epoch": 3.3314407381121365, + "grad_norm": 0.9797456860542297, + "learning_rate": 9.667012065294535e-05, + "loss": 0.032908812165260315, + "step": 23470 + }, + { + "epoch": 3.332860184528034, + "grad_norm": 0.3462522625923157, + "learning_rate": 9.666870120652945e-05, + "loss": 0.05224289894104004, + "step": 23480 + }, + { + "epoch": 3.3342796309439318, + "grad_norm": 5.588517189025879, + "learning_rate": 9.666728176011356e-05, + "loss": 0.08177621364593506, + "step": 23490 + }, + { + "epoch": 3.3356990773598296, + "grad_norm": 6.037621021270752, + "learning_rate": 9.666586231369766e-05, + "loss": 0.06431897282600403, + "step": 23500 + }, + { + "epoch": 3.3356990773598296, + "eval_accuracy": 0.9582247090990017, + "eval_loss": 0.1211514100432396, + "eval_runtime": 32.6171, + "eval_samples_per_second": 482.17, + "eval_steps_per_second": 15.084, + "step": 23500 + }, + { + "epoch": 3.3371185237757275, + "grad_norm": 4.2738142013549805, + "learning_rate": 9.666444286728177e-05, + "loss": 0.07732362151145936, + "step": 23510 + }, + { + "epoch": 3.3385379701916253, + "grad_norm": 5.357970237731934, + "learning_rate": 9.666302342086587e-05, + "loss": 0.057775235176086424, + "step": 23520 + }, + { + "epoch": 3.339957416607523, + "grad_norm": 2.4043660163879395, + "learning_rate": 9.666160397444996e-05, + "loss": 0.10017684698104859, + "step": 23530 + }, + { + "epoch": 3.341376863023421, + "grad_norm": 7.4561381340026855, + "learning_rate": 9.666018452803407e-05, + "loss": 0.14003334045410157, + "step": 23540 + }, + { + "epoch": 3.3427963094393185, + "grad_norm": 2.9771358966827393, + "learning_rate": 9.665876508161817e-05, + "loss": 0.11144789457321166, + "step": 23550 + }, + { + "epoch": 3.3442157558552164, + "grad_norm": 5.861306190490723, + "learning_rate": 9.665734563520228e-05, + "loss": 0.1083723783493042, + "step": 23560 + }, + { + "epoch": 3.345635202271114, + "grad_norm": 1.0332176685333252, + "learning_rate": 9.665592618878638e-05, + "loss": 0.08513032793998718, + "step": 23570 + }, + { + "epoch": 3.347054648687012, + "grad_norm": 6.1437177658081055, + "learning_rate": 9.665450674237048e-05, + "loss": 0.08397155404090881, + "step": 23580 + }, + { + "epoch": 3.34847409510291, + "grad_norm": 4.794635772705078, + "learning_rate": 9.665308729595457e-05, + "loss": 0.042923647165298465, + "step": 23590 + }, + { + "epoch": 3.349893541518808, + "grad_norm": 3.806190252304077, + "learning_rate": 9.665166784953869e-05, + "loss": 0.08098719120025635, + "step": 23600 + }, + { + "epoch": 3.3513129879347057, + "grad_norm": 0.2237672656774521, + "learning_rate": 9.665024840312278e-05, + "loss": 0.07011445760726928, + "step": 23610 + }, + { + "epoch": 3.352732434350603, + "grad_norm": 3.0982532501220703, + "learning_rate": 9.66488289567069e-05, + "loss": 0.061842381954193115, + "step": 23620 + }, + { + "epoch": 3.354151880766501, + "grad_norm": 2.976536512374878, + "learning_rate": 9.664740951029099e-05, + "loss": 0.10006380081176758, + "step": 23630 + }, + { + "epoch": 3.355571327182399, + "grad_norm": 4.319900035858154, + "learning_rate": 9.664599006387509e-05, + "loss": 0.13653013706207276, + "step": 23640 + }, + { + "epoch": 3.3569907735982967, + "grad_norm": 1.9102489948272705, + "learning_rate": 9.66445706174592e-05, + "loss": 0.038610780239105226, + "step": 23650 + }, + { + "epoch": 3.3584102200141945, + "grad_norm": 6.633970737457275, + "learning_rate": 9.66431511710433e-05, + "loss": 0.06831348538398743, + "step": 23660 + }, + { + "epoch": 3.3598296664300924, + "grad_norm": 1.1184673309326172, + "learning_rate": 9.664173172462741e-05, + "loss": 0.06864879727363586, + "step": 23670 + }, + { + "epoch": 3.3612491128459903, + "grad_norm": 0.8485651612281799, + "learning_rate": 9.664031227821149e-05, + "loss": 0.08388459086418151, + "step": 23680 + }, + { + "epoch": 3.3626685592618877, + "grad_norm": 1.4212796688079834, + "learning_rate": 9.66388928317956e-05, + "loss": 0.20324900150299072, + "step": 23690 + }, + { + "epoch": 3.3640880056777855, + "grad_norm": 0.2244710922241211, + "learning_rate": 9.66374733853797e-05, + "loss": 0.07268852591514588, + "step": 23700 + }, + { + "epoch": 3.3655074520936834, + "grad_norm": 0.2561863660812378, + "learning_rate": 9.663605393896381e-05, + "loss": 0.036457425355911253, + "step": 23710 + }, + { + "epoch": 3.3669268985095813, + "grad_norm": 2.078640937805176, + "learning_rate": 9.663463449254792e-05, + "loss": 0.07209231853485107, + "step": 23720 + }, + { + "epoch": 3.368346344925479, + "grad_norm": 4.892085552215576, + "learning_rate": 9.6633215046132e-05, + "loss": 0.1211774468421936, + "step": 23730 + }, + { + "epoch": 3.369765791341377, + "grad_norm": 1.651289939880371, + "learning_rate": 9.663179559971612e-05, + "loss": 0.08962616324424744, + "step": 23740 + }, + { + "epoch": 3.371185237757275, + "grad_norm": 1.4341058731079102, + "learning_rate": 9.663037615330021e-05, + "loss": 0.06757261753082275, + "step": 23750 + }, + { + "epoch": 3.3726046841731723, + "grad_norm": 0.5684829354286194, + "learning_rate": 9.662895670688432e-05, + "loss": 0.04020809531211853, + "step": 23760 + }, + { + "epoch": 3.37402413058907, + "grad_norm": 2.886730194091797, + "learning_rate": 9.662753726046842e-05, + "loss": 0.07528796195983886, + "step": 23770 + }, + { + "epoch": 3.375443577004968, + "grad_norm": 7.543295383453369, + "learning_rate": 9.662611781405252e-05, + "loss": 0.2501710891723633, + "step": 23780 + }, + { + "epoch": 3.376863023420866, + "grad_norm": 6.99386739730835, + "learning_rate": 9.662469836763662e-05, + "loss": 0.07813713550567628, + "step": 23790 + }, + { + "epoch": 3.3782824698367637, + "grad_norm": 6.142605781555176, + "learning_rate": 9.662327892122073e-05, + "loss": 0.0971024513244629, + "step": 23800 + }, + { + "epoch": 3.3797019162526616, + "grad_norm": 5.846232891082764, + "learning_rate": 9.662185947480484e-05, + "loss": 0.11326665878295898, + "step": 23810 + }, + { + "epoch": 3.3811213626685594, + "grad_norm": 3.8466222286224365, + "learning_rate": 9.662044002838894e-05, + "loss": 0.09037129282951355, + "step": 23820 + }, + { + "epoch": 3.382540809084457, + "grad_norm": 1.8509072065353394, + "learning_rate": 9.661902058197303e-05, + "loss": 0.08938190340995789, + "step": 23830 + }, + { + "epoch": 3.3839602555003547, + "grad_norm": 8.372735023498535, + "learning_rate": 9.661760113555713e-05, + "loss": 0.10955497026443481, + "step": 23840 + }, + { + "epoch": 3.3853797019162526, + "grad_norm": 10.327803611755371, + "learning_rate": 9.661618168914124e-05, + "loss": 0.10603039264678955, + "step": 23850 + }, + { + "epoch": 3.3867991483321505, + "grad_norm": 2.4464328289031982, + "learning_rate": 9.661476224272534e-05, + "loss": 0.07022827863693237, + "step": 23860 + }, + { + "epoch": 3.3882185947480483, + "grad_norm": 4.964604377746582, + "learning_rate": 9.661334279630945e-05, + "loss": 0.10754181146621704, + "step": 23870 + }, + { + "epoch": 3.389638041163946, + "grad_norm": 2.0936126708984375, + "learning_rate": 9.661192334989355e-05, + "loss": 0.07387771010398865, + "step": 23880 + }, + { + "epoch": 3.391057487579844, + "grad_norm": 1.5606902837753296, + "learning_rate": 9.661050390347764e-05, + "loss": 0.06499841809272766, + "step": 23890 + }, + { + "epoch": 3.3924769339957415, + "grad_norm": 0.09581028670072556, + "learning_rate": 9.660908445706176e-05, + "loss": 0.09357624053955078, + "step": 23900 + }, + { + "epoch": 3.3938963804116393, + "grad_norm": 2.011545181274414, + "learning_rate": 9.660766501064585e-05, + "loss": 0.04169844388961792, + "step": 23910 + }, + { + "epoch": 3.395315826827537, + "grad_norm": 0.6940661668777466, + "learning_rate": 9.660624556422996e-05, + "loss": 0.05995774269104004, + "step": 23920 + }, + { + "epoch": 3.396735273243435, + "grad_norm": 2.8684120178222656, + "learning_rate": 9.660482611781406e-05, + "loss": 0.05829171538352966, + "step": 23930 + }, + { + "epoch": 3.398154719659333, + "grad_norm": 5.727314472198486, + "learning_rate": 9.660340667139816e-05, + "loss": 0.0676846444606781, + "step": 23940 + }, + { + "epoch": 3.3995741660752308, + "grad_norm": 3.3505942821502686, + "learning_rate": 9.660198722498226e-05, + "loss": 0.12202495336532593, + "step": 23950 + }, + { + "epoch": 3.4009936124911286, + "grad_norm": 1.6798441410064697, + "learning_rate": 9.660056777856637e-05, + "loss": 0.10003808736801148, + "step": 23960 + }, + { + "epoch": 3.402413058907026, + "grad_norm": 2.8134841918945312, + "learning_rate": 9.659914833215046e-05, + "loss": 0.053173118829727174, + "step": 23970 + }, + { + "epoch": 3.403832505322924, + "grad_norm": 9.647566795349121, + "learning_rate": 9.659772888573458e-05, + "loss": 0.09169653654098511, + "step": 23980 + }, + { + "epoch": 3.405251951738822, + "grad_norm": 2.525071620941162, + "learning_rate": 9.659630943931867e-05, + "loss": 0.05470997095108032, + "step": 23990 + }, + { + "epoch": 3.4066713981547196, + "grad_norm": 7.918493270874023, + "learning_rate": 9.659488999290277e-05, + "loss": 0.12718768119812013, + "step": 24000 + }, + { + "epoch": 3.4066713981547196, + "eval_accuracy": 0.9593056526991798, + "eval_loss": 0.11784256994724274, + "eval_runtime": 34.1419, + "eval_samples_per_second": 460.636, + "eval_steps_per_second": 14.41, + "step": 24000 + }, + { + "epoch": 3.4080908445706175, + "grad_norm": 8.479427337646484, + "learning_rate": 9.659347054648688e-05, + "loss": 0.12565889358520507, + "step": 24010 + }, + { + "epoch": 3.4095102909865154, + "grad_norm": 1.4310401678085327, + "learning_rate": 9.659205110007098e-05, + "loss": 0.07409765720367431, + "step": 24020 + }, + { + "epoch": 3.4109297374024132, + "grad_norm": 1.3293160200119019, + "learning_rate": 9.659063165365509e-05, + "loss": 0.06405404210090637, + "step": 24030 + }, + { + "epoch": 3.4123491838183106, + "grad_norm": 2.3439300060272217, + "learning_rate": 9.658921220723917e-05, + "loss": 0.07644574642181397, + "step": 24040 + }, + { + "epoch": 3.4137686302342085, + "grad_norm": 5.991164207458496, + "learning_rate": 9.658779276082328e-05, + "loss": 0.1311761498451233, + "step": 24050 + }, + { + "epoch": 3.4151880766501064, + "grad_norm": 4.515506267547607, + "learning_rate": 9.658637331440738e-05, + "loss": 0.080819970369339, + "step": 24060 + }, + { + "epoch": 3.4166075230660042, + "grad_norm": 3.080458402633667, + "learning_rate": 9.658495386799149e-05, + "loss": 0.07668147087097169, + "step": 24070 + }, + { + "epoch": 3.418026969481902, + "grad_norm": 6.942470550537109, + "learning_rate": 9.658353442157559e-05, + "loss": 0.07289301753044128, + "step": 24080 + }, + { + "epoch": 3.4194464158978, + "grad_norm": 9.14225959777832, + "learning_rate": 9.658211497515969e-05, + "loss": 0.07435898780822754, + "step": 24090 + }, + { + "epoch": 3.420865862313698, + "grad_norm": 7.3029704093933105, + "learning_rate": 9.65806955287438e-05, + "loss": 0.12275665998458862, + "step": 24100 + }, + { + "epoch": 3.4222853087295952, + "grad_norm": 1.066394567489624, + "learning_rate": 9.65792760823279e-05, + "loss": 0.12547402381896972, + "step": 24110 + }, + { + "epoch": 3.423704755145493, + "grad_norm": 2.095668315887451, + "learning_rate": 9.6577856635912e-05, + "loss": 0.08885858654975891, + "step": 24120 + }, + { + "epoch": 3.425124201561391, + "grad_norm": 10.10063648223877, + "learning_rate": 9.65764371894961e-05, + "loss": 0.08219894766807556, + "step": 24130 + }, + { + "epoch": 3.426543647977289, + "grad_norm": 0.24362793564796448, + "learning_rate": 9.65750177430802e-05, + "loss": 0.07828856706619262, + "step": 24140 + }, + { + "epoch": 3.4279630943931867, + "grad_norm": 3.3321142196655273, + "learning_rate": 9.65735982966643e-05, + "loss": 0.052914398908615115, + "step": 24150 + }, + { + "epoch": 3.4293825408090846, + "grad_norm": 6.5169291496276855, + "learning_rate": 9.657217885024841e-05, + "loss": 0.0918683409690857, + "step": 24160 + }, + { + "epoch": 3.4308019872249824, + "grad_norm": 1.8033021688461304, + "learning_rate": 9.657075940383251e-05, + "loss": 0.07939133048057556, + "step": 24170 + }, + { + "epoch": 3.43222143364088, + "grad_norm": 0.5477492213249207, + "learning_rate": 9.656933995741662e-05, + "loss": 0.08349984288215637, + "step": 24180 + }, + { + "epoch": 3.4336408800567777, + "grad_norm": 5.996103763580322, + "learning_rate": 9.656792051100072e-05, + "loss": 0.05642620325088501, + "step": 24190 + }, + { + "epoch": 3.4350603264726756, + "grad_norm": 10.91261100769043, + "learning_rate": 9.656650106458481e-05, + "loss": 0.10933701992034912, + "step": 24200 + }, + { + "epoch": 3.4364797728885734, + "grad_norm": 2.225350856781006, + "learning_rate": 9.656508161816892e-05, + "loss": 0.09172443151474, + "step": 24210 + }, + { + "epoch": 3.4378992193044713, + "grad_norm": 12.634965896606445, + "learning_rate": 9.656366217175302e-05, + "loss": 0.11917402744293212, + "step": 24220 + }, + { + "epoch": 3.439318665720369, + "grad_norm": 1.6125768423080444, + "learning_rate": 9.656224272533713e-05, + "loss": 0.06305748820304871, + "step": 24230 + }, + { + "epoch": 3.440738112136267, + "grad_norm": 0.32264623045921326, + "learning_rate": 9.656082327892123e-05, + "loss": 0.053128784894943236, + "step": 24240 + }, + { + "epoch": 3.4421575585521644, + "grad_norm": 1.5485633611679077, + "learning_rate": 9.655940383250533e-05, + "loss": 0.09052397012710571, + "step": 24250 + }, + { + "epoch": 3.4435770049680623, + "grad_norm": 8.407336235046387, + "learning_rate": 9.655798438608942e-05, + "loss": 0.0869211733341217, + "step": 24260 + }, + { + "epoch": 3.44499645138396, + "grad_norm": 4.730905532836914, + "learning_rate": 9.655656493967353e-05, + "loss": 0.07399642467498779, + "step": 24270 + }, + { + "epoch": 3.446415897799858, + "grad_norm": 3.4000537395477295, + "learning_rate": 9.655514549325763e-05, + "loss": 0.047950705885887145, + "step": 24280 + }, + { + "epoch": 3.447835344215756, + "grad_norm": 1.1020469665527344, + "learning_rate": 9.655372604684174e-05, + "loss": 0.06868406534194946, + "step": 24290 + }, + { + "epoch": 3.4492547906316537, + "grad_norm": 7.190598964691162, + "learning_rate": 9.655230660042584e-05, + "loss": 0.0772173523902893, + "step": 24300 + }, + { + "epoch": 3.4506742370475516, + "grad_norm": 0.16195560991764069, + "learning_rate": 9.655088715400994e-05, + "loss": 0.05037579536437988, + "step": 24310 + }, + { + "epoch": 3.452093683463449, + "grad_norm": 5.206357955932617, + "learning_rate": 9.654946770759405e-05, + "loss": 0.05795242190361023, + "step": 24320 + }, + { + "epoch": 3.453513129879347, + "grad_norm": 3.8032917976379395, + "learning_rate": 9.654804826117815e-05, + "loss": 0.06166144609451294, + "step": 24330 + }, + { + "epoch": 3.4549325762952448, + "grad_norm": 7.195924282073975, + "learning_rate": 9.654662881476226e-05, + "loss": 0.07787706851959228, + "step": 24340 + }, + { + "epoch": 3.4563520227111426, + "grad_norm": 16.916200637817383, + "learning_rate": 9.654520936834634e-05, + "loss": 0.12234679460525513, + "step": 24350 + }, + { + "epoch": 3.4577714691270405, + "grad_norm": 8.12978458404541, + "learning_rate": 9.654378992193045e-05, + "loss": 0.08913070559501649, + "step": 24360 + }, + { + "epoch": 3.4591909155429383, + "grad_norm": 5.649082660675049, + "learning_rate": 9.654237047551455e-05, + "loss": 0.06851221919059754, + "step": 24370 + }, + { + "epoch": 3.460610361958836, + "grad_norm": 9.085246086120605, + "learning_rate": 9.654095102909866e-05, + "loss": 0.07936888933181763, + "step": 24380 + }, + { + "epoch": 3.4620298083747336, + "grad_norm": 6.739210605621338, + "learning_rate": 9.653953158268276e-05, + "loss": 0.0680124282836914, + "step": 24390 + }, + { + "epoch": 3.4634492547906315, + "grad_norm": 4.914496421813965, + "learning_rate": 9.653811213626686e-05, + "loss": 0.07251676321029663, + "step": 24400 + }, + { + "epoch": 3.4648687012065293, + "grad_norm": 3.8612000942230225, + "learning_rate": 9.653669268985097e-05, + "loss": 0.08312456011772155, + "step": 24410 + }, + { + "epoch": 3.466288147622427, + "grad_norm": 5.080418109893799, + "learning_rate": 9.653527324343506e-05, + "loss": 0.08824545741081238, + "step": 24420 + }, + { + "epoch": 3.467707594038325, + "grad_norm": 2.7461204528808594, + "learning_rate": 9.653385379701917e-05, + "loss": 0.04693276584148407, + "step": 24430 + }, + { + "epoch": 3.469127040454223, + "grad_norm": 2.284554958343506, + "learning_rate": 9.653243435060327e-05, + "loss": 0.10196805000305176, + "step": 24440 + }, + { + "epoch": 3.470546486870121, + "grad_norm": 6.074938774108887, + "learning_rate": 9.653101490418737e-05, + "loss": 0.12747013568878174, + "step": 24450 + }, + { + "epoch": 3.471965933286018, + "grad_norm": 4.511362075805664, + "learning_rate": 9.652959545777147e-05, + "loss": 0.115402352809906, + "step": 24460 + }, + { + "epoch": 3.473385379701916, + "grad_norm": 2.1728434562683105, + "learning_rate": 9.652817601135558e-05, + "loss": 0.050969237089157106, + "step": 24470 + }, + { + "epoch": 3.474804826117814, + "grad_norm": 5.665693283081055, + "learning_rate": 9.652675656493967e-05, + "loss": 0.06567577123641968, + "step": 24480 + }, + { + "epoch": 3.476224272533712, + "grad_norm": 1.5518124103546143, + "learning_rate": 9.652533711852379e-05, + "loss": 0.05959618091583252, + "step": 24490 + }, + { + "epoch": 3.4776437189496097, + "grad_norm": 8.269552230834961, + "learning_rate": 9.652391767210788e-05, + "loss": 0.13263360261917115, + "step": 24500 + }, + { + "epoch": 3.4776437189496097, + "eval_accuracy": 0.9656641444649329, + "eval_loss": 0.10235972702503204, + "eval_runtime": 33.7613, + "eval_samples_per_second": 465.829, + "eval_steps_per_second": 14.573, + "step": 24500 + } + ], + "logging_steps": 10, + "max_steps": 704500, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}