{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9985520994557892, "eval_steps": 1000, "global_step": 22500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00044380093309146184, "grad_norm": 725.3771362304688, "learning_rate": 9.999982664038074e-06, "loss": 85.6201, "step": 10 }, { "epoch": 0.0008876018661829237, "grad_norm": 471.7138977050781, "learning_rate": 9.999965328076145e-06, "loss": 58.4931, "step": 20 }, { "epoch": 0.0013314027992743854, "grad_norm": 330.69317626953125, "learning_rate": 9.999947992114218e-06, "loss": 39.9778, "step": 30 }, { "epoch": 0.0017752037323658474, "grad_norm": 283.2129821777344, "learning_rate": 9.999930656152291e-06, "loss": 30.2775, "step": 40 }, { "epoch": 0.002219004665457309, "grad_norm": 283.95977783203125, "learning_rate": 9.999913320190363e-06, "loss": 26.528, "step": 50 }, { "epoch": 0.0026628055985487707, "grad_norm": 327.2074890136719, "learning_rate": 9.999895984228436e-06, "loss": 25.2187, "step": 60 }, { "epoch": 0.0031066065316402327, "grad_norm": 290.6508483886719, "learning_rate": 9.999878648266509e-06, "loss": 23.4217, "step": 70 }, { "epoch": 0.0035504074647316947, "grad_norm": 268.8506774902344, "learning_rate": 9.99986131230458e-06, "loss": 22.3678, "step": 80 }, { "epoch": 0.003994208397823156, "grad_norm": 303.6665954589844, "learning_rate": 9.999843976342653e-06, "loss": 21.8689, "step": 90 }, { "epoch": 0.004438009330914618, "grad_norm": 221.8241424560547, "learning_rate": 9.999826640380727e-06, "loss": 21.7776, "step": 100 }, { "epoch": 0.00488181026400608, "grad_norm": 208.48538208007812, "learning_rate": 9.9998093044188e-06, "loss": 21.2487, "step": 110 }, { "epoch": 0.0053256111970975415, "grad_norm": 195.47113037109375, "learning_rate": 9.999791968456871e-06, "loss": 20.7403, "step": 120 }, { "epoch": 0.0057694121301890035, "grad_norm": 289.3889465332031, "learning_rate": 9.999774632494944e-06, "loss": 20.443, "step": 130 }, { "epoch": 0.0062132130632804655, "grad_norm": 321.8573303222656, "learning_rate": 9.999757296533017e-06, "loss": 20.8289, "step": 140 }, { "epoch": 0.0066570139963719275, "grad_norm": 297.0621337890625, "learning_rate": 9.999739960571089e-06, "loss": 19.7784, "step": 150 }, { "epoch": 0.0071008149294633895, "grad_norm": 240.9821014404297, "learning_rate": 9.999722624609162e-06, "loss": 19.0094, "step": 160 }, { "epoch": 0.007544615862554851, "grad_norm": 261.113037109375, "learning_rate": 9.999705288647235e-06, "loss": 19.1363, "step": 170 }, { "epoch": 0.007988416795646313, "grad_norm": 218.0191650390625, "learning_rate": 9.999687952685306e-06, "loss": 18.8289, "step": 180 }, { "epoch": 0.008432217728737776, "grad_norm": 310.36712646484375, "learning_rate": 9.999670616723379e-06, "loss": 19.4811, "step": 190 }, { "epoch": 0.008876018661829237, "grad_norm": 217.39454650878906, "learning_rate": 9.999653280761452e-06, "loss": 18.593, "step": 200 }, { "epoch": 0.009319819594920698, "grad_norm": 221.60879516601562, "learning_rate": 9.999635944799524e-06, "loss": 18.8921, "step": 210 }, { "epoch": 0.00976362052801216, "grad_norm": 254.957763671875, "learning_rate": 9.999618608837597e-06, "loss": 18.9504, "step": 220 }, { "epoch": 0.010207421461103622, "grad_norm": 230.76243591308594, "learning_rate": 9.99960127287567e-06, "loss": 18.8949, "step": 230 }, { "epoch": 0.010651222394195083, "grad_norm": 316.453857421875, "learning_rate": 9.999583936913741e-06, "loss": 18.6263, "step": 240 }, { "epoch": 0.011095023327286546, "grad_norm": 275.41265869140625, "learning_rate": 9.999566600951814e-06, "loss": 18.6316, "step": 250 }, { "epoch": 0.011538824260378007, "grad_norm": 275.3472595214844, "learning_rate": 9.999549264989887e-06, "loss": 18.6499, "step": 260 }, { "epoch": 0.01198262519346947, "grad_norm": 239.0996551513672, "learning_rate": 9.999531929027959e-06, "loss": 18.9865, "step": 270 }, { "epoch": 0.012426426126560931, "grad_norm": 176.33468627929688, "learning_rate": 9.999514593066032e-06, "loss": 18.2206, "step": 280 }, { "epoch": 0.012870227059652392, "grad_norm": 199.18878173828125, "learning_rate": 9.999497257104105e-06, "loss": 17.4809, "step": 290 }, { "epoch": 0.013314027992743855, "grad_norm": 212.03488159179688, "learning_rate": 9.999479921142176e-06, "loss": 17.9092, "step": 300 }, { "epoch": 0.013757828925835316, "grad_norm": 244.9204559326172, "learning_rate": 9.99946258518025e-06, "loss": 17.5332, "step": 310 }, { "epoch": 0.014201629858926779, "grad_norm": 215.74754333496094, "learning_rate": 9.999445249218322e-06, "loss": 18.2855, "step": 320 }, { "epoch": 0.01464543079201824, "grad_norm": 226.8377227783203, "learning_rate": 9.999427913256395e-06, "loss": 17.7032, "step": 330 }, { "epoch": 0.015089231725109701, "grad_norm": 242.4842987060547, "learning_rate": 9.999410577294467e-06, "loss": 17.7969, "step": 340 }, { "epoch": 0.015533032658201164, "grad_norm": 233.77847290039062, "learning_rate": 9.99939324133254e-06, "loss": 17.4911, "step": 350 }, { "epoch": 0.015976833591292625, "grad_norm": 291.48065185546875, "learning_rate": 9.999375905370613e-06, "loss": 17.5533, "step": 360 }, { "epoch": 0.016420634524384088, "grad_norm": 219.2302703857422, "learning_rate": 9.999358569408684e-06, "loss": 17.3406, "step": 370 }, { "epoch": 0.01686443545747555, "grad_norm": 219.14234924316406, "learning_rate": 9.999341233446757e-06, "loss": 17.208, "step": 380 }, { "epoch": 0.01730823639056701, "grad_norm": 224.26629638671875, "learning_rate": 9.99932389748483e-06, "loss": 17.2251, "step": 390 }, { "epoch": 0.017752037323658473, "grad_norm": 269.49090576171875, "learning_rate": 9.999306561522902e-06, "loss": 17.1339, "step": 400 }, { "epoch": 0.018195838256749936, "grad_norm": 268.406494140625, "learning_rate": 9.999289225560975e-06, "loss": 17.0892, "step": 410 }, { "epoch": 0.018639639189841396, "grad_norm": 319.0367431640625, "learning_rate": 9.999271889599048e-06, "loss": 17.3344, "step": 420 }, { "epoch": 0.01908344012293286, "grad_norm": 259.054443359375, "learning_rate": 9.99925455363712e-06, "loss": 17.0951, "step": 430 }, { "epoch": 0.01952724105602432, "grad_norm": 191.9784698486328, "learning_rate": 9.999237217675193e-06, "loss": 16.1768, "step": 440 }, { "epoch": 0.01997104198911578, "grad_norm": 214.4703826904297, "learning_rate": 9.999219881713266e-06, "loss": 17.0767, "step": 450 }, { "epoch": 0.020414842922207244, "grad_norm": 271.8641662597656, "learning_rate": 9.999202545751337e-06, "loss": 16.9147, "step": 460 }, { "epoch": 0.020858643855298706, "grad_norm": 293.4891357421875, "learning_rate": 9.99918520978941e-06, "loss": 17.3521, "step": 470 }, { "epoch": 0.021302444788390166, "grad_norm": 200.57672119140625, "learning_rate": 9.999167873827483e-06, "loss": 16.7094, "step": 480 }, { "epoch": 0.02174624572148163, "grad_norm": 167.0054931640625, "learning_rate": 9.999150537865555e-06, "loss": 16.874, "step": 490 }, { "epoch": 0.02219004665457309, "grad_norm": 155.45993041992188, "learning_rate": 9.999133201903628e-06, "loss": 16.6857, "step": 500 }, { "epoch": 0.022633847587664555, "grad_norm": 169.74298095703125, "learning_rate": 9.9991158659417e-06, "loss": 16.9436, "step": 510 }, { "epoch": 0.023077648520756014, "grad_norm": 175.02944946289062, "learning_rate": 9.999098529979772e-06, "loss": 16.4027, "step": 520 }, { "epoch": 0.023521449453847477, "grad_norm": 206.35028076171875, "learning_rate": 9.999081194017845e-06, "loss": 17.196, "step": 530 }, { "epoch": 0.02396525038693894, "grad_norm": 175.73011779785156, "learning_rate": 9.999063858055918e-06, "loss": 16.2829, "step": 540 }, { "epoch": 0.0244090513200304, "grad_norm": 193.17225646972656, "learning_rate": 9.999046522093991e-06, "loss": 16.3072, "step": 550 }, { "epoch": 0.024852852253121862, "grad_norm": 232.3939971923828, "learning_rate": 9.999029186132063e-06, "loss": 16.2658, "step": 560 }, { "epoch": 0.025296653186213325, "grad_norm": 200.2445526123047, "learning_rate": 9.999011850170136e-06, "loss": 16.8102, "step": 570 }, { "epoch": 0.025740454119304784, "grad_norm": 235.8720703125, "learning_rate": 9.998994514208209e-06, "loss": 16.5103, "step": 580 }, { "epoch": 0.026184255052396247, "grad_norm": 174.5557403564453, "learning_rate": 9.99897717824628e-06, "loss": 16.4585, "step": 590 }, { "epoch": 0.02662805598548771, "grad_norm": 211.39187622070312, "learning_rate": 9.998959842284353e-06, "loss": 16.0689, "step": 600 }, { "epoch": 0.027071856918579173, "grad_norm": 274.6959228515625, "learning_rate": 9.998942506322426e-06, "loss": 15.9399, "step": 610 }, { "epoch": 0.027515657851670632, "grad_norm": 220.5891571044922, "learning_rate": 9.998925170360498e-06, "loss": 16.2098, "step": 620 }, { "epoch": 0.027959458784762095, "grad_norm": 239.71881103515625, "learning_rate": 9.998907834398571e-06, "loss": 16.6608, "step": 630 }, { "epoch": 0.028403259717853558, "grad_norm": 192.038818359375, "learning_rate": 9.998890498436644e-06, "loss": 16.0857, "step": 640 }, { "epoch": 0.028847060650945017, "grad_norm": 197.78492736816406, "learning_rate": 9.998873162474715e-06, "loss": 16.4402, "step": 650 }, { "epoch": 0.02929086158403648, "grad_norm": 180.5525360107422, "learning_rate": 9.998855826512788e-06, "loss": 15.6789, "step": 660 }, { "epoch": 0.029734662517127943, "grad_norm": 190.83297729492188, "learning_rate": 9.998838490550861e-06, "loss": 16.3187, "step": 670 }, { "epoch": 0.030178463450219403, "grad_norm": 259.5210266113281, "learning_rate": 9.998821154588933e-06, "loss": 15.9802, "step": 680 }, { "epoch": 0.030622264383310865, "grad_norm": 193.27146911621094, "learning_rate": 9.998803818627006e-06, "loss": 16.3872, "step": 690 }, { "epoch": 0.03106606531640233, "grad_norm": 187.4614715576172, "learning_rate": 9.998786482665079e-06, "loss": 15.7645, "step": 700 }, { "epoch": 0.03150986624949379, "grad_norm": 201.60263061523438, "learning_rate": 9.99876914670315e-06, "loss": 15.6817, "step": 710 }, { "epoch": 0.03195366718258525, "grad_norm": 170.6359100341797, "learning_rate": 9.998751810741223e-06, "loss": 15.8353, "step": 720 }, { "epoch": 0.03239746811567671, "grad_norm": 239.1863555908203, "learning_rate": 9.998734474779297e-06, "loss": 15.6572, "step": 730 }, { "epoch": 0.032841269048768176, "grad_norm": 174.40811157226562, "learning_rate": 9.998717138817368e-06, "loss": 16.2995, "step": 740 }, { "epoch": 0.033285069981859636, "grad_norm": 208.86151123046875, "learning_rate": 9.998699802855441e-06, "loss": 15.5835, "step": 750 }, { "epoch": 0.0337288709149511, "grad_norm": 186.65267944335938, "learning_rate": 9.998682466893514e-06, "loss": 15.7859, "step": 760 }, { "epoch": 0.03417267184804256, "grad_norm": 178.12464904785156, "learning_rate": 9.998665130931587e-06, "loss": 15.9824, "step": 770 }, { "epoch": 0.03461647278113402, "grad_norm": 213.6521759033203, "learning_rate": 9.998647794969659e-06, "loss": 15.8112, "step": 780 }, { "epoch": 0.03506027371422549, "grad_norm": 158.35516357421875, "learning_rate": 9.998630459007732e-06, "loss": 15.7298, "step": 790 }, { "epoch": 0.03550407464731695, "grad_norm": 221.17010498046875, "learning_rate": 9.998613123045805e-06, "loss": 15.4734, "step": 800 }, { "epoch": 0.035947875580408406, "grad_norm": 218.5003204345703, "learning_rate": 9.998595787083876e-06, "loss": 16.2934, "step": 810 }, { "epoch": 0.03639167651349987, "grad_norm": 162.29537963867188, "learning_rate": 9.99857845112195e-06, "loss": 15.4424, "step": 820 }, { "epoch": 0.03683547744659133, "grad_norm": 233.5081787109375, "learning_rate": 9.998561115160022e-06, "loss": 15.3749, "step": 830 }, { "epoch": 0.03727927837968279, "grad_norm": 230.38172912597656, "learning_rate": 9.998543779198094e-06, "loss": 15.4327, "step": 840 }, { "epoch": 0.03772307931277426, "grad_norm": 187.3807373046875, "learning_rate": 9.998526443236167e-06, "loss": 15.2466, "step": 850 }, { "epoch": 0.03816688024586572, "grad_norm": 287.6422119140625, "learning_rate": 9.99850910727424e-06, "loss": 16.137, "step": 860 }, { "epoch": 0.038610681178957176, "grad_norm": 144.71505737304688, "learning_rate": 9.998491771312311e-06, "loss": 15.2375, "step": 870 }, { "epoch": 0.03905448211204864, "grad_norm": 195.58685302734375, "learning_rate": 9.998474435350384e-06, "loss": 15.2691, "step": 880 }, { "epoch": 0.0394982830451401, "grad_norm": 156.4313201904297, "learning_rate": 9.998457099388457e-06, "loss": 15.098, "step": 890 }, { "epoch": 0.03994208397823156, "grad_norm": 211.72653198242188, "learning_rate": 9.998439763426529e-06, "loss": 15.3399, "step": 900 }, { "epoch": 0.04038588491132303, "grad_norm": 188.71702575683594, "learning_rate": 9.998422427464602e-06, "loss": 15.563, "step": 910 }, { "epoch": 0.04082968584441449, "grad_norm": 183.34080505371094, "learning_rate": 9.998405091502675e-06, "loss": 15.077, "step": 920 }, { "epoch": 0.04127348677750595, "grad_norm": 195.2317657470703, "learning_rate": 9.998387755540748e-06, "loss": 15.5238, "step": 930 }, { "epoch": 0.04171728771059741, "grad_norm": 165.41636657714844, "learning_rate": 9.99837041957882e-06, "loss": 15.4294, "step": 940 }, { "epoch": 0.04216108864368887, "grad_norm": 161.98556518554688, "learning_rate": 9.998353083616892e-06, "loss": 15.3307, "step": 950 }, { "epoch": 0.04260488957678033, "grad_norm": 204.99436950683594, "learning_rate": 9.998335747654965e-06, "loss": 15.0829, "step": 960 }, { "epoch": 0.0430486905098718, "grad_norm": 153.96363830566406, "learning_rate": 9.998318411693037e-06, "loss": 14.9974, "step": 970 }, { "epoch": 0.04349249144296326, "grad_norm": 212.83502197265625, "learning_rate": 9.99830107573111e-06, "loss": 14.8463, "step": 980 }, { "epoch": 0.043936292376054724, "grad_norm": 171.2718505859375, "learning_rate": 9.998283739769183e-06, "loss": 15.22, "step": 990 }, { "epoch": 0.04438009330914618, "grad_norm": 159.00628662109375, "learning_rate": 9.998266403807254e-06, "loss": 15.608, "step": 1000 }, { "epoch": 0.04438009330914618, "eval_loss": 0.47256025671958923, "eval_runtime": 675.519, "eval_samples_per_second": 1797.715, "eval_steps_per_second": 56.179, "step": 1000 }, { "epoch": 0.04482389424223764, "grad_norm": 268.4474792480469, "learning_rate": 9.998249067845328e-06, "loss": 15.0797, "step": 1010 }, { "epoch": 0.04526769517532911, "grad_norm": 195.7564697265625, "learning_rate": 9.9982317318834e-06, "loss": 15.2636, "step": 1020 }, { "epoch": 0.04571149610842057, "grad_norm": 156.26730346679688, "learning_rate": 9.998214395921472e-06, "loss": 15.5175, "step": 1030 }, { "epoch": 0.04615529704151203, "grad_norm": 171.27798461914062, "learning_rate": 9.998197059959545e-06, "loss": 15.1707, "step": 1040 }, { "epoch": 0.046599097974603494, "grad_norm": 180.56385803222656, "learning_rate": 9.998179723997618e-06, "loss": 15.7545, "step": 1050 }, { "epoch": 0.047042898907694954, "grad_norm": 165.36468505859375, "learning_rate": 9.998162388035691e-06, "loss": 14.6627, "step": 1060 }, { "epoch": 0.04748669984078641, "grad_norm": 189.90077209472656, "learning_rate": 9.998145052073763e-06, "loss": 14.6988, "step": 1070 }, { "epoch": 0.04793050077387788, "grad_norm": 225.11740112304688, "learning_rate": 9.998127716111836e-06, "loss": 15.1959, "step": 1080 }, { "epoch": 0.04837430170696934, "grad_norm": 174.7201690673828, "learning_rate": 9.998110380149909e-06, "loss": 15.2072, "step": 1090 }, { "epoch": 0.0488181026400608, "grad_norm": 182.18833923339844, "learning_rate": 9.99809304418798e-06, "loss": 15.2541, "step": 1100 }, { "epoch": 0.049261903573152264, "grad_norm": 166.11903381347656, "learning_rate": 9.998075708226053e-06, "loss": 14.5574, "step": 1110 }, { "epoch": 0.049705704506243724, "grad_norm": 154.42739868164062, "learning_rate": 9.998058372264126e-06, "loss": 14.5568, "step": 1120 }, { "epoch": 0.05014950543933518, "grad_norm": 204.05294799804688, "learning_rate": 9.998041036302198e-06, "loss": 15.9922, "step": 1130 }, { "epoch": 0.05059330637242665, "grad_norm": 150.65687561035156, "learning_rate": 9.99802370034027e-06, "loss": 14.9494, "step": 1140 }, { "epoch": 0.05103710730551811, "grad_norm": 214.66461181640625, "learning_rate": 9.998006364378344e-06, "loss": 14.969, "step": 1150 }, { "epoch": 0.05148090823860957, "grad_norm": 150.3427734375, "learning_rate": 9.997989028416417e-06, "loss": 15.8738, "step": 1160 }, { "epoch": 0.051924709171701035, "grad_norm": 200.99319458007812, "learning_rate": 9.997971692454488e-06, "loss": 15.0359, "step": 1170 }, { "epoch": 0.052368510104792494, "grad_norm": 160.50755310058594, "learning_rate": 9.997954356492561e-06, "loss": 14.3384, "step": 1180 }, { "epoch": 0.05281231103788396, "grad_norm": 198.73057556152344, "learning_rate": 9.997937020530634e-06, "loss": 14.6252, "step": 1190 }, { "epoch": 0.05325611197097542, "grad_norm": 284.5395202636719, "learning_rate": 9.997919684568706e-06, "loss": 14.4951, "step": 1200 }, { "epoch": 0.05369991290406688, "grad_norm": 197.24905395507812, "learning_rate": 9.997902348606779e-06, "loss": 14.0584, "step": 1210 }, { "epoch": 0.054143713837158346, "grad_norm": 210.77964782714844, "learning_rate": 9.997885012644852e-06, "loss": 14.7978, "step": 1220 }, { "epoch": 0.054587514770249805, "grad_norm": 227.98147583007812, "learning_rate": 9.997867676682923e-06, "loss": 14.5782, "step": 1230 }, { "epoch": 0.055031315703341264, "grad_norm": 174.67752075195312, "learning_rate": 9.997850340720996e-06, "loss": 15.018, "step": 1240 }, { "epoch": 0.05547511663643273, "grad_norm": 211.7549285888672, "learning_rate": 9.99783300475907e-06, "loss": 14.7918, "step": 1250 }, { "epoch": 0.05591891756952419, "grad_norm": 170.83355712890625, "learning_rate": 9.997815668797141e-06, "loss": 14.4284, "step": 1260 }, { "epoch": 0.05636271850261565, "grad_norm": 200.0304412841797, "learning_rate": 9.997798332835214e-06, "loss": 14.2359, "step": 1270 }, { "epoch": 0.056806519435707116, "grad_norm": 198.56309509277344, "learning_rate": 9.997780996873287e-06, "loss": 15.2276, "step": 1280 }, { "epoch": 0.057250320368798575, "grad_norm": 193.32701110839844, "learning_rate": 9.99776366091136e-06, "loss": 14.369, "step": 1290 }, { "epoch": 0.057694121301890035, "grad_norm": 167.3224639892578, "learning_rate": 9.997746324949432e-06, "loss": 14.5175, "step": 1300 }, { "epoch": 0.0581379222349815, "grad_norm": 222.6139678955078, "learning_rate": 9.997728988987505e-06, "loss": 14.3153, "step": 1310 }, { "epoch": 0.05858172316807296, "grad_norm": 200.5888671875, "learning_rate": 9.997711653025578e-06, "loss": 15.0348, "step": 1320 }, { "epoch": 0.05902552410116442, "grad_norm": 152.54669189453125, "learning_rate": 9.997694317063649e-06, "loss": 14.2657, "step": 1330 }, { "epoch": 0.059469325034255886, "grad_norm": 208.6412811279297, "learning_rate": 9.997676981101722e-06, "loss": 14.4811, "step": 1340 }, { "epoch": 0.059913125967347346, "grad_norm": 163.86343383789062, "learning_rate": 9.997659645139795e-06, "loss": 14.364, "step": 1350 }, { "epoch": 0.060356926900438805, "grad_norm": 196.13272094726562, "learning_rate": 9.997642309177867e-06, "loss": 14.9816, "step": 1360 }, { "epoch": 0.06080072783353027, "grad_norm": 184.44252014160156, "learning_rate": 9.99762497321594e-06, "loss": 14.3186, "step": 1370 }, { "epoch": 0.06124452876662173, "grad_norm": 188.05926513671875, "learning_rate": 9.997607637254013e-06, "loss": 14.0344, "step": 1380 }, { "epoch": 0.06168832969971319, "grad_norm": 145.69866943359375, "learning_rate": 9.997590301292084e-06, "loss": 14.584, "step": 1390 }, { "epoch": 0.06213213063280466, "grad_norm": 158.0360107421875, "learning_rate": 9.997572965330157e-06, "loss": 14.2096, "step": 1400 }, { "epoch": 0.06257593156589612, "grad_norm": 197.41757202148438, "learning_rate": 9.99755562936823e-06, "loss": 14.7819, "step": 1410 }, { "epoch": 0.06301973249898758, "grad_norm": 164.63351440429688, "learning_rate": 9.997538293406303e-06, "loss": 14.3291, "step": 1420 }, { "epoch": 0.06346353343207904, "grad_norm": 186.9270782470703, "learning_rate": 9.997520957444375e-06, "loss": 14.3004, "step": 1430 }, { "epoch": 0.0639073343651705, "grad_norm": 167.5779266357422, "learning_rate": 9.997503621482448e-06, "loss": 14.0414, "step": 1440 }, { "epoch": 0.06435113529826196, "grad_norm": 148.85032653808594, "learning_rate": 9.997486285520521e-06, "loss": 14.3065, "step": 1450 }, { "epoch": 0.06479493623135342, "grad_norm": 222.2272491455078, "learning_rate": 9.997468949558592e-06, "loss": 14.6169, "step": 1460 }, { "epoch": 0.0652387371644449, "grad_norm": 144.12142944335938, "learning_rate": 9.997451613596665e-06, "loss": 15.0054, "step": 1470 }, { "epoch": 0.06568253809753635, "grad_norm": 205.4581298828125, "learning_rate": 9.997434277634738e-06, "loss": 14.3197, "step": 1480 }, { "epoch": 0.06612633903062781, "grad_norm": 183.8268280029297, "learning_rate": 9.99741694167281e-06, "loss": 14.046, "step": 1490 }, { "epoch": 0.06657013996371927, "grad_norm": 131.39759826660156, "learning_rate": 9.997399605710883e-06, "loss": 14.4233, "step": 1500 }, { "epoch": 0.06701394089681073, "grad_norm": 160.77137756347656, "learning_rate": 9.997382269748956e-06, "loss": 13.7336, "step": 1510 }, { "epoch": 0.0674577418299022, "grad_norm": 137.20225524902344, "learning_rate": 9.997364933787027e-06, "loss": 13.8353, "step": 1520 }, { "epoch": 0.06790154276299366, "grad_norm": 214.53268432617188, "learning_rate": 9.9973475978251e-06, "loss": 14.3842, "step": 1530 }, { "epoch": 0.06834534369608512, "grad_norm": 162.3836669921875, "learning_rate": 9.997330261863174e-06, "loss": 14.3846, "step": 1540 }, { "epoch": 0.06878914462917658, "grad_norm": 151.75082397460938, "learning_rate": 9.997312925901247e-06, "loss": 14.2962, "step": 1550 }, { "epoch": 0.06923294556226804, "grad_norm": 201.07388305664062, "learning_rate": 9.997295589939318e-06, "loss": 14.6052, "step": 1560 }, { "epoch": 0.0696767464953595, "grad_norm": 135.7477264404297, "learning_rate": 9.997278253977391e-06, "loss": 14.0033, "step": 1570 }, { "epoch": 0.07012054742845097, "grad_norm": 157.89686584472656, "learning_rate": 9.997260918015464e-06, "loss": 13.8008, "step": 1580 }, { "epoch": 0.07056434836154243, "grad_norm": 169.57518005371094, "learning_rate": 9.997243582053536e-06, "loss": 13.9898, "step": 1590 }, { "epoch": 0.0710081492946339, "grad_norm": 182.89100646972656, "learning_rate": 9.997226246091609e-06, "loss": 14.0484, "step": 1600 }, { "epoch": 0.07145195022772535, "grad_norm": 170.64639282226562, "learning_rate": 9.997208910129682e-06, "loss": 14.1716, "step": 1610 }, { "epoch": 0.07189575116081681, "grad_norm": 179.93946838378906, "learning_rate": 9.997191574167753e-06, "loss": 13.6771, "step": 1620 }, { "epoch": 0.07233955209390827, "grad_norm": 145.3518829345703, "learning_rate": 9.997174238205826e-06, "loss": 13.9909, "step": 1630 }, { "epoch": 0.07278335302699974, "grad_norm": 153.2678680419922, "learning_rate": 9.9971569022439e-06, "loss": 13.6993, "step": 1640 }, { "epoch": 0.0732271539600912, "grad_norm": 170.7726287841797, "learning_rate": 9.99713956628197e-06, "loss": 13.7267, "step": 1650 }, { "epoch": 0.07367095489318266, "grad_norm": 137.22825622558594, "learning_rate": 9.997122230320044e-06, "loss": 13.9653, "step": 1660 }, { "epoch": 0.07411475582627412, "grad_norm": 184.2819366455078, "learning_rate": 9.997104894358117e-06, "loss": 14.0155, "step": 1670 }, { "epoch": 0.07455855675936558, "grad_norm": 186.31808471679688, "learning_rate": 9.99708755839619e-06, "loss": 14.1664, "step": 1680 }, { "epoch": 0.07500235769245704, "grad_norm": 159.4346160888672, "learning_rate": 9.997070222434261e-06, "loss": 13.9029, "step": 1690 }, { "epoch": 0.07544615862554852, "grad_norm": 148.49266052246094, "learning_rate": 9.997052886472334e-06, "loss": 14.4053, "step": 1700 }, { "epoch": 0.07588995955863997, "grad_norm": 154.17291259765625, "learning_rate": 9.997035550510407e-06, "loss": 14.4536, "step": 1710 }, { "epoch": 0.07633376049173143, "grad_norm": 164.39495849609375, "learning_rate": 9.997018214548479e-06, "loss": 14.1982, "step": 1720 }, { "epoch": 0.0767775614248229, "grad_norm": 203.73599243164062, "learning_rate": 9.997000878586552e-06, "loss": 13.7706, "step": 1730 }, { "epoch": 0.07722136235791435, "grad_norm": 201.3606719970703, "learning_rate": 9.996983542624625e-06, "loss": 14.2293, "step": 1740 }, { "epoch": 0.07766516329100583, "grad_norm": 158.88832092285156, "learning_rate": 9.996966206662696e-06, "loss": 13.9448, "step": 1750 }, { "epoch": 0.07810896422409729, "grad_norm": 193.7799530029297, "learning_rate": 9.99694887070077e-06, "loss": 13.4655, "step": 1760 }, { "epoch": 0.07855276515718874, "grad_norm": 189.9005889892578, "learning_rate": 9.996931534738842e-06, "loss": 13.5407, "step": 1770 }, { "epoch": 0.0789965660902802, "grad_norm": 161.69204711914062, "learning_rate": 9.996914198776914e-06, "loss": 13.9047, "step": 1780 }, { "epoch": 0.07944036702337166, "grad_norm": 168.77694702148438, "learning_rate": 9.996896862814987e-06, "loss": 13.7892, "step": 1790 }, { "epoch": 0.07988416795646312, "grad_norm": 168.03192138671875, "learning_rate": 9.99687952685306e-06, "loss": 13.3482, "step": 1800 }, { "epoch": 0.0803279688895546, "grad_norm": 140.20399475097656, "learning_rate": 9.996862190891131e-06, "loss": 13.9784, "step": 1810 }, { "epoch": 0.08077176982264606, "grad_norm": 157.78277587890625, "learning_rate": 9.996844854929204e-06, "loss": 14.3328, "step": 1820 }, { "epoch": 0.08121557075573752, "grad_norm": 155.1420135498047, "learning_rate": 9.996827518967278e-06, "loss": 13.6884, "step": 1830 }, { "epoch": 0.08165937168882897, "grad_norm": 135.9541473388672, "learning_rate": 9.996810183005349e-06, "loss": 13.5477, "step": 1840 }, { "epoch": 0.08210317262192043, "grad_norm": 163.42974853515625, "learning_rate": 9.996792847043422e-06, "loss": 13.2751, "step": 1850 }, { "epoch": 0.0825469735550119, "grad_norm": 168.12744140625, "learning_rate": 9.996775511081495e-06, "loss": 13.8876, "step": 1860 }, { "epoch": 0.08299077448810337, "grad_norm": 173.35897827148438, "learning_rate": 9.996758175119568e-06, "loss": 14.0019, "step": 1870 }, { "epoch": 0.08343457542119483, "grad_norm": 160.2526397705078, "learning_rate": 9.99674083915764e-06, "loss": 13.6193, "step": 1880 }, { "epoch": 0.08387837635428629, "grad_norm": 140.53231811523438, "learning_rate": 9.996723503195713e-06, "loss": 13.6471, "step": 1890 }, { "epoch": 0.08432217728737774, "grad_norm": 134.29800415039062, "learning_rate": 9.996706167233786e-06, "loss": 13.6341, "step": 1900 }, { "epoch": 0.0847659782204692, "grad_norm": 176.61903381347656, "learning_rate": 9.996688831271857e-06, "loss": 13.7481, "step": 1910 }, { "epoch": 0.08520977915356066, "grad_norm": 141.41427612304688, "learning_rate": 9.99667149530993e-06, "loss": 13.5171, "step": 1920 }, { "epoch": 0.08565358008665214, "grad_norm": 155.45187377929688, "learning_rate": 9.996654159348003e-06, "loss": 13.2516, "step": 1930 }, { "epoch": 0.0860973810197436, "grad_norm": 130.82913208007812, "learning_rate": 9.996636823386075e-06, "loss": 13.1989, "step": 1940 }, { "epoch": 0.08654118195283506, "grad_norm": 158.8917694091797, "learning_rate": 9.996619487424148e-06, "loss": 13.1475, "step": 1950 }, { "epoch": 0.08698498288592652, "grad_norm": 175.22116088867188, "learning_rate": 9.99660215146222e-06, "loss": 13.5781, "step": 1960 }, { "epoch": 0.08742878381901797, "grad_norm": 152.9134979248047, "learning_rate": 9.996584815500292e-06, "loss": 13.7951, "step": 1970 }, { "epoch": 0.08787258475210945, "grad_norm": 139.04403686523438, "learning_rate": 9.996567479538365e-06, "loss": 13.6352, "step": 1980 }, { "epoch": 0.08831638568520091, "grad_norm": 166.22174072265625, "learning_rate": 9.996550143576438e-06, "loss": 14.035, "step": 1990 }, { "epoch": 0.08876018661829237, "grad_norm": 148.52734375, "learning_rate": 9.99653280761451e-06, "loss": 14.207, "step": 2000 }, { "epoch": 0.08876018661829237, "eval_loss": 0.4256989657878876, "eval_runtime": 673.4968, "eval_samples_per_second": 1803.113, "eval_steps_per_second": 56.348, "step": 2000 }, { "epoch": 0.08920398755138383, "grad_norm": 155.57510375976562, "learning_rate": 9.996515471652583e-06, "loss": 13.5324, "step": 2010 }, { "epoch": 0.08964778848447529, "grad_norm": 172.14755249023438, "learning_rate": 9.996498135690656e-06, "loss": 13.7738, "step": 2020 }, { "epoch": 0.09009158941756674, "grad_norm": 175.1804656982422, "learning_rate": 9.996480799728727e-06, "loss": 13.6314, "step": 2030 }, { "epoch": 0.09053539035065822, "grad_norm": 157.45474243164062, "learning_rate": 9.9964634637668e-06, "loss": 13.6318, "step": 2040 }, { "epoch": 0.09097919128374968, "grad_norm": 149.86843872070312, "learning_rate": 9.996446127804873e-06, "loss": 13.761, "step": 2050 }, { "epoch": 0.09142299221684114, "grad_norm": 139.11117553710938, "learning_rate": 9.996428791842945e-06, "loss": 13.7724, "step": 2060 }, { "epoch": 0.0918667931499326, "grad_norm": 207.54212951660156, "learning_rate": 9.996411455881018e-06, "loss": 13.5893, "step": 2070 }, { "epoch": 0.09231059408302406, "grad_norm": 160.33975219726562, "learning_rate": 9.996394119919091e-06, "loss": 13.624, "step": 2080 }, { "epoch": 0.09275439501611552, "grad_norm": 130.01426696777344, "learning_rate": 9.996376783957164e-06, "loss": 13.8974, "step": 2090 }, { "epoch": 0.09319819594920699, "grad_norm": 132.64810180664062, "learning_rate": 9.996359447995235e-06, "loss": 13.4078, "step": 2100 }, { "epoch": 0.09364199688229845, "grad_norm": 148.302734375, "learning_rate": 9.996342112033308e-06, "loss": 14.1865, "step": 2110 }, { "epoch": 0.09408579781538991, "grad_norm": 185.2609405517578, "learning_rate": 9.996324776071382e-06, "loss": 13.5624, "step": 2120 }, { "epoch": 0.09452959874848137, "grad_norm": 159.40829467773438, "learning_rate": 9.996307440109453e-06, "loss": 13.428, "step": 2130 }, { "epoch": 0.09497339968157283, "grad_norm": 162.26693725585938, "learning_rate": 9.996290104147526e-06, "loss": 14.0764, "step": 2140 }, { "epoch": 0.0954172006146643, "grad_norm": 144.18853759765625, "learning_rate": 9.996272768185599e-06, "loss": 13.581, "step": 2150 }, { "epoch": 0.09586100154775576, "grad_norm": 141.41928100585938, "learning_rate": 9.99625543222367e-06, "loss": 14.1607, "step": 2160 }, { "epoch": 0.09630480248084722, "grad_norm": 144.21542358398438, "learning_rate": 9.996238096261744e-06, "loss": 13.6053, "step": 2170 }, { "epoch": 0.09674860341393868, "grad_norm": 146.44114685058594, "learning_rate": 9.996220760299817e-06, "loss": 13.3627, "step": 2180 }, { "epoch": 0.09719240434703014, "grad_norm": 142.94996643066406, "learning_rate": 9.996203424337888e-06, "loss": 13.0106, "step": 2190 }, { "epoch": 0.0976362052801216, "grad_norm": 137.17572021484375, "learning_rate": 9.996186088375961e-06, "loss": 13.2873, "step": 2200 }, { "epoch": 0.09808000621321307, "grad_norm": 165.0562744140625, "learning_rate": 9.996168752414034e-06, "loss": 14.0056, "step": 2210 }, { "epoch": 0.09852380714630453, "grad_norm": 154.87319946289062, "learning_rate": 9.996151416452106e-06, "loss": 13.6771, "step": 2220 }, { "epoch": 0.09896760807939599, "grad_norm": 145.29185485839844, "learning_rate": 9.996134080490179e-06, "loss": 13.2175, "step": 2230 }, { "epoch": 0.09941140901248745, "grad_norm": 138.25790405273438, "learning_rate": 9.996116744528252e-06, "loss": 13.4874, "step": 2240 }, { "epoch": 0.09985520994557891, "grad_norm": 129.90760803222656, "learning_rate": 9.996099408566323e-06, "loss": 13.2961, "step": 2250 }, { "epoch": 0.10029901087867037, "grad_norm": 167.95828247070312, "learning_rate": 9.996082072604396e-06, "loss": 13.8555, "step": 2260 }, { "epoch": 0.10074281181176184, "grad_norm": 171.57943725585938, "learning_rate": 9.99606473664247e-06, "loss": 13.7328, "step": 2270 }, { "epoch": 0.1011866127448533, "grad_norm": 129.10914611816406, "learning_rate": 9.99604740068054e-06, "loss": 13.1205, "step": 2280 }, { "epoch": 0.10163041367794476, "grad_norm": 142.1722869873047, "learning_rate": 9.996030064718614e-06, "loss": 13.4908, "step": 2290 }, { "epoch": 0.10207421461103622, "grad_norm": 174.66026306152344, "learning_rate": 9.996012728756687e-06, "loss": 12.8445, "step": 2300 }, { "epoch": 0.10251801554412768, "grad_norm": 159.54783630371094, "learning_rate": 9.99599539279476e-06, "loss": 12.9509, "step": 2310 }, { "epoch": 0.10296181647721914, "grad_norm": 119.40794372558594, "learning_rate": 9.995978056832831e-06, "loss": 13.689, "step": 2320 }, { "epoch": 0.10340561741031061, "grad_norm": 114.7558364868164, "learning_rate": 9.995960720870904e-06, "loss": 12.7327, "step": 2330 }, { "epoch": 0.10384941834340207, "grad_norm": 163.34475708007812, "learning_rate": 9.995943384908977e-06, "loss": 13.5477, "step": 2340 }, { "epoch": 0.10429321927649353, "grad_norm": 169.064208984375, "learning_rate": 9.995926048947049e-06, "loss": 13.7607, "step": 2350 }, { "epoch": 0.10473702020958499, "grad_norm": 128.2354736328125, "learning_rate": 9.995908712985122e-06, "loss": 13.8691, "step": 2360 }, { "epoch": 0.10518082114267645, "grad_norm": 184.0960235595703, "learning_rate": 9.995891377023195e-06, "loss": 13.9122, "step": 2370 }, { "epoch": 0.10562462207576792, "grad_norm": 143.75611877441406, "learning_rate": 9.995874041061266e-06, "loss": 13.1847, "step": 2380 }, { "epoch": 0.10606842300885938, "grad_norm": 160.75901794433594, "learning_rate": 9.99585670509934e-06, "loss": 13.3183, "step": 2390 }, { "epoch": 0.10651222394195084, "grad_norm": 178.99951171875, "learning_rate": 9.995839369137413e-06, "loss": 12.9939, "step": 2400 }, { "epoch": 0.1069560248750423, "grad_norm": 161.1572723388672, "learning_rate": 9.995822033175484e-06, "loss": 12.9724, "step": 2410 }, { "epoch": 0.10739982580813376, "grad_norm": 138.41233825683594, "learning_rate": 9.995804697213557e-06, "loss": 13.3155, "step": 2420 }, { "epoch": 0.10784362674122522, "grad_norm": 186.98597717285156, "learning_rate": 9.99578736125163e-06, "loss": 13.385, "step": 2430 }, { "epoch": 0.10828742767431669, "grad_norm": 148.47291564941406, "learning_rate": 9.995770025289701e-06, "loss": 12.6492, "step": 2440 }, { "epoch": 0.10873122860740815, "grad_norm": 187.78671264648438, "learning_rate": 9.995752689327775e-06, "loss": 13.181, "step": 2450 }, { "epoch": 0.10917502954049961, "grad_norm": 119.649658203125, "learning_rate": 9.995735353365848e-06, "loss": 13.285, "step": 2460 }, { "epoch": 0.10961883047359107, "grad_norm": 165.08554077148438, "learning_rate": 9.995718017403919e-06, "loss": 13.2161, "step": 2470 }, { "epoch": 0.11006263140668253, "grad_norm": 201.1644287109375, "learning_rate": 9.995700681441992e-06, "loss": 13.8907, "step": 2480 }, { "epoch": 0.11050643233977399, "grad_norm": 127.61051940917969, "learning_rate": 9.995683345480065e-06, "loss": 13.1687, "step": 2490 }, { "epoch": 0.11095023327286546, "grad_norm": 152.43359375, "learning_rate": 9.995666009518137e-06, "loss": 13.7492, "step": 2500 }, { "epoch": 0.11139403420595692, "grad_norm": 142.9310760498047, "learning_rate": 9.99564867355621e-06, "loss": 13.2862, "step": 2510 }, { "epoch": 0.11183783513904838, "grad_norm": 136.95596313476562, "learning_rate": 9.995631337594283e-06, "loss": 13.2403, "step": 2520 }, { "epoch": 0.11228163607213984, "grad_norm": 157.4971160888672, "learning_rate": 9.995614001632356e-06, "loss": 12.9846, "step": 2530 }, { "epoch": 0.1127254370052313, "grad_norm": 157.4886474609375, "learning_rate": 9.995596665670427e-06, "loss": 13.6239, "step": 2540 }, { "epoch": 0.11316923793832276, "grad_norm": 179.8428192138672, "learning_rate": 9.9955793297085e-06, "loss": 13.562, "step": 2550 }, { "epoch": 0.11361303887141423, "grad_norm": 115.2100601196289, "learning_rate": 9.995561993746573e-06, "loss": 13.7602, "step": 2560 }, { "epoch": 0.11405683980450569, "grad_norm": 152.49363708496094, "learning_rate": 9.995544657784645e-06, "loss": 12.7727, "step": 2570 }, { "epoch": 0.11450064073759715, "grad_norm": 111.91357421875, "learning_rate": 9.995527321822718e-06, "loss": 13.2489, "step": 2580 }, { "epoch": 0.11494444167068861, "grad_norm": 149.418701171875, "learning_rate": 9.99550998586079e-06, "loss": 13.5211, "step": 2590 }, { "epoch": 0.11538824260378007, "grad_norm": 158.1101531982422, "learning_rate": 9.995492649898862e-06, "loss": 13.5069, "step": 2600 }, { "epoch": 0.11583204353687154, "grad_norm": 152.66778564453125, "learning_rate": 9.995475313936935e-06, "loss": 13.44, "step": 2610 }, { "epoch": 0.116275844469963, "grad_norm": 133.16883850097656, "learning_rate": 9.995457977975008e-06, "loss": 13.4715, "step": 2620 }, { "epoch": 0.11671964540305446, "grad_norm": 145.34190368652344, "learning_rate": 9.99544064201308e-06, "loss": 13.2012, "step": 2630 }, { "epoch": 0.11716344633614592, "grad_norm": 141.71560668945312, "learning_rate": 9.995423306051153e-06, "loss": 13.389, "step": 2640 }, { "epoch": 0.11760724726923738, "grad_norm": 158.38853454589844, "learning_rate": 9.995405970089226e-06, "loss": 13.6967, "step": 2650 }, { "epoch": 0.11805104820232884, "grad_norm": 153.3102264404297, "learning_rate": 9.995388634127297e-06, "loss": 12.9532, "step": 2660 }, { "epoch": 0.11849484913542031, "grad_norm": 168.13534545898438, "learning_rate": 9.99537129816537e-06, "loss": 12.9198, "step": 2670 }, { "epoch": 0.11893865006851177, "grad_norm": 152.37652587890625, "learning_rate": 9.995353962203443e-06, "loss": 12.762, "step": 2680 }, { "epoch": 0.11938245100160323, "grad_norm": 175.31565856933594, "learning_rate": 9.995336626241515e-06, "loss": 13.3768, "step": 2690 }, { "epoch": 0.11982625193469469, "grad_norm": 142.70001220703125, "learning_rate": 9.995319290279588e-06, "loss": 14.0593, "step": 2700 }, { "epoch": 0.12027005286778615, "grad_norm": 138.56088256835938, "learning_rate": 9.995301954317661e-06, "loss": 12.9747, "step": 2710 }, { "epoch": 0.12071385380087761, "grad_norm": 159.65451049804688, "learning_rate": 9.995284618355732e-06, "loss": 13.163, "step": 2720 }, { "epoch": 0.12115765473396908, "grad_norm": 119.24352264404297, "learning_rate": 9.995267282393805e-06, "loss": 12.5898, "step": 2730 }, { "epoch": 0.12160145566706054, "grad_norm": 107.53023529052734, "learning_rate": 9.995249946431879e-06, "loss": 13.4115, "step": 2740 }, { "epoch": 0.122045256600152, "grad_norm": 141.00604248046875, "learning_rate": 9.995232610469952e-06, "loss": 13.4099, "step": 2750 }, { "epoch": 0.12248905753324346, "grad_norm": 121.9310302734375, "learning_rate": 9.995215274508023e-06, "loss": 13.0975, "step": 2760 }, { "epoch": 0.12293285846633492, "grad_norm": 170.2215576171875, "learning_rate": 9.995197938546096e-06, "loss": 13.1425, "step": 2770 }, { "epoch": 0.12337665939942638, "grad_norm": 134.97998046875, "learning_rate": 9.99518060258417e-06, "loss": 13.0698, "step": 2780 }, { "epoch": 0.12382046033251785, "grad_norm": 141.1931915283203, "learning_rate": 9.99516326662224e-06, "loss": 12.9536, "step": 2790 }, { "epoch": 0.12426426126560931, "grad_norm": 123.69408416748047, "learning_rate": 9.995145930660314e-06, "loss": 13.1385, "step": 2800 }, { "epoch": 0.12470806219870077, "grad_norm": 132.7433319091797, "learning_rate": 9.995128594698387e-06, "loss": 12.7706, "step": 2810 }, { "epoch": 0.12515186313179225, "grad_norm": 146.10305786132812, "learning_rate": 9.995111258736458e-06, "loss": 13.1492, "step": 2820 }, { "epoch": 0.1255956640648837, "grad_norm": 217.5339813232422, "learning_rate": 9.995093922774531e-06, "loss": 13.2251, "step": 2830 }, { "epoch": 0.12603946499797516, "grad_norm": 178.1737518310547, "learning_rate": 9.995076586812604e-06, "loss": 12.8123, "step": 2840 }, { "epoch": 0.12648326593106662, "grad_norm": 123.01746368408203, "learning_rate": 9.995059250850676e-06, "loss": 12.8128, "step": 2850 }, { "epoch": 0.12692706686415808, "grad_norm": 173.1932830810547, "learning_rate": 9.995041914888749e-06, "loss": 13.3028, "step": 2860 }, { "epoch": 0.12737086779724954, "grad_norm": 129.58663940429688, "learning_rate": 9.995024578926822e-06, "loss": 12.685, "step": 2870 }, { "epoch": 0.127814668730341, "grad_norm": 163.02256774902344, "learning_rate": 9.995007242964893e-06, "loss": 12.8868, "step": 2880 }, { "epoch": 0.12825846966343246, "grad_norm": 129.8294677734375, "learning_rate": 9.994989907002966e-06, "loss": 12.6222, "step": 2890 }, { "epoch": 0.12870227059652392, "grad_norm": 133.46365356445312, "learning_rate": 9.99497257104104e-06, "loss": 12.5605, "step": 2900 }, { "epoch": 0.12914607152961538, "grad_norm": 141.15603637695312, "learning_rate": 9.99495523507911e-06, "loss": 12.972, "step": 2910 }, { "epoch": 0.12958987246270684, "grad_norm": 128.41879272460938, "learning_rate": 9.994937899117184e-06, "loss": 13.1774, "step": 2920 }, { "epoch": 0.13003367339579833, "grad_norm": 136.0733184814453, "learning_rate": 9.994920563155257e-06, "loss": 12.7431, "step": 2930 }, { "epoch": 0.1304774743288898, "grad_norm": 135.34146118164062, "learning_rate": 9.994903227193328e-06, "loss": 13.4061, "step": 2940 }, { "epoch": 0.13092127526198125, "grad_norm": 143.00442504882812, "learning_rate": 9.994885891231401e-06, "loss": 12.9928, "step": 2950 }, { "epoch": 0.1313650761950727, "grad_norm": 150.8383026123047, "learning_rate": 9.994868555269474e-06, "loss": 12.8895, "step": 2960 }, { "epoch": 0.13180887712816416, "grad_norm": 144.29466247558594, "learning_rate": 9.994851219307547e-06, "loss": 13.2952, "step": 2970 }, { "epoch": 0.13225267806125562, "grad_norm": 130.09901428222656, "learning_rate": 9.994833883345619e-06, "loss": 13.1275, "step": 2980 }, { "epoch": 0.13269647899434708, "grad_norm": 161.23716735839844, "learning_rate": 9.994816547383692e-06, "loss": 12.7923, "step": 2990 }, { "epoch": 0.13314027992743854, "grad_norm": 126.25406646728516, "learning_rate": 9.994799211421765e-06, "loss": 12.9331, "step": 3000 }, { "epoch": 0.13314027992743854, "eval_loss": 0.40445423126220703, "eval_runtime": 673.2876, "eval_samples_per_second": 1803.673, "eval_steps_per_second": 56.365, "step": 3000 }, { "epoch": 0.13358408086053, "grad_norm": 121.59950256347656, "learning_rate": 9.994781875459836e-06, "loss": 12.1823, "step": 3010 }, { "epoch": 0.13402788179362146, "grad_norm": 112.05023193359375, "learning_rate": 9.99476453949791e-06, "loss": 12.886, "step": 3020 }, { "epoch": 0.13447168272671292, "grad_norm": 129.76803588867188, "learning_rate": 9.994747203535983e-06, "loss": 12.7748, "step": 3030 }, { "epoch": 0.1349154836598044, "grad_norm": 124.96532440185547, "learning_rate": 9.994729867574054e-06, "loss": 13.1191, "step": 3040 }, { "epoch": 0.13535928459289587, "grad_norm": 141.1378631591797, "learning_rate": 9.994712531612127e-06, "loss": 13.4907, "step": 3050 }, { "epoch": 0.13580308552598733, "grad_norm": 160.61422729492188, "learning_rate": 9.9946951956502e-06, "loss": 13.6199, "step": 3060 }, { "epoch": 0.1362468864590788, "grad_norm": 136.35914611816406, "learning_rate": 9.994677859688271e-06, "loss": 13.2168, "step": 3070 }, { "epoch": 0.13669068739217025, "grad_norm": 127.0990982055664, "learning_rate": 9.994660523726345e-06, "loss": 12.9859, "step": 3080 }, { "epoch": 0.1371344883252617, "grad_norm": 138.41912841796875, "learning_rate": 9.994643187764418e-06, "loss": 12.4187, "step": 3090 }, { "epoch": 0.13757828925835316, "grad_norm": 140.53573608398438, "learning_rate": 9.994625851802489e-06, "loss": 13.0548, "step": 3100 }, { "epoch": 0.13802209019144462, "grad_norm": 140.9633331298828, "learning_rate": 9.994608515840562e-06, "loss": 12.8993, "step": 3110 }, { "epoch": 0.13846589112453608, "grad_norm": 161.15667724609375, "learning_rate": 9.994591179878635e-06, "loss": 12.7675, "step": 3120 }, { "epoch": 0.13890969205762754, "grad_norm": 131.9075469970703, "learning_rate": 9.994573843916707e-06, "loss": 13.4033, "step": 3130 }, { "epoch": 0.139353492990719, "grad_norm": 144.76168823242188, "learning_rate": 9.99455650795478e-06, "loss": 12.9296, "step": 3140 }, { "epoch": 0.13979729392381046, "grad_norm": 128.33094787597656, "learning_rate": 9.994539171992853e-06, "loss": 13.0926, "step": 3150 }, { "epoch": 0.14024109485690195, "grad_norm": 120.40109252929688, "learning_rate": 9.994521836030924e-06, "loss": 12.6591, "step": 3160 }, { "epoch": 0.1406848957899934, "grad_norm": 177.44715881347656, "learning_rate": 9.994504500068997e-06, "loss": 13.0056, "step": 3170 }, { "epoch": 0.14112869672308487, "grad_norm": 133.28228759765625, "learning_rate": 9.99448716410707e-06, "loss": 12.9662, "step": 3180 }, { "epoch": 0.14157249765617633, "grad_norm": 139.16880798339844, "learning_rate": 9.994469828145143e-06, "loss": 12.55, "step": 3190 }, { "epoch": 0.1420162985892678, "grad_norm": 125.42192077636719, "learning_rate": 9.994452492183215e-06, "loss": 13.1687, "step": 3200 }, { "epoch": 0.14246009952235925, "grad_norm": 167.23255920410156, "learning_rate": 9.994435156221288e-06, "loss": 13.1131, "step": 3210 }, { "epoch": 0.1429039004554507, "grad_norm": 160.59434509277344, "learning_rate": 9.994417820259361e-06, "loss": 12.6661, "step": 3220 }, { "epoch": 0.14334770138854216, "grad_norm": 157.50003051757812, "learning_rate": 9.994400484297432e-06, "loss": 13.0758, "step": 3230 }, { "epoch": 0.14379150232163362, "grad_norm": 152.7613067626953, "learning_rate": 9.994383148335505e-06, "loss": 12.6521, "step": 3240 }, { "epoch": 0.14423530325472508, "grad_norm": 120.58446502685547, "learning_rate": 9.994365812373578e-06, "loss": 12.9558, "step": 3250 }, { "epoch": 0.14467910418781654, "grad_norm": 130.59677124023438, "learning_rate": 9.99434847641165e-06, "loss": 12.919, "step": 3260 }, { "epoch": 0.14512290512090803, "grad_norm": 118.21358489990234, "learning_rate": 9.994331140449723e-06, "loss": 12.8244, "step": 3270 }, { "epoch": 0.1455667060539995, "grad_norm": 139.61123657226562, "learning_rate": 9.994313804487796e-06, "loss": 12.6272, "step": 3280 }, { "epoch": 0.14601050698709095, "grad_norm": 118.49354553222656, "learning_rate": 9.994296468525867e-06, "loss": 12.9009, "step": 3290 }, { "epoch": 0.1464543079201824, "grad_norm": 189.260498046875, "learning_rate": 9.99427913256394e-06, "loss": 12.7051, "step": 3300 }, { "epoch": 0.14689810885327387, "grad_norm": 146.55145263671875, "learning_rate": 9.994261796602014e-06, "loss": 12.7917, "step": 3310 }, { "epoch": 0.14734190978636533, "grad_norm": 140.71144104003906, "learning_rate": 9.994244460640085e-06, "loss": 12.2915, "step": 3320 }, { "epoch": 0.1477857107194568, "grad_norm": 153.54945373535156, "learning_rate": 9.994227124678158e-06, "loss": 12.4179, "step": 3330 }, { "epoch": 0.14822951165254825, "grad_norm": 113.84798431396484, "learning_rate": 9.994209788716231e-06, "loss": 12.8944, "step": 3340 }, { "epoch": 0.1486733125856397, "grad_norm": 140.952392578125, "learning_rate": 9.994192452754304e-06, "loss": 13.0441, "step": 3350 }, { "epoch": 0.14911711351873116, "grad_norm": 118.15951538085938, "learning_rate": 9.994175116792376e-06, "loss": 12.8967, "step": 3360 }, { "epoch": 0.14956091445182262, "grad_norm": 153.52896118164062, "learning_rate": 9.994157780830449e-06, "loss": 12.8702, "step": 3370 }, { "epoch": 0.15000471538491408, "grad_norm": 127.21448516845703, "learning_rate": 9.994140444868522e-06, "loss": 12.4004, "step": 3380 }, { "epoch": 0.15044851631800557, "grad_norm": 131.0957489013672, "learning_rate": 9.994123108906593e-06, "loss": 13.3359, "step": 3390 }, { "epoch": 0.15089231725109703, "grad_norm": 128.41253662109375, "learning_rate": 9.994105772944666e-06, "loss": 12.3271, "step": 3400 }, { "epoch": 0.1513361181841885, "grad_norm": 134.6806640625, "learning_rate": 9.99408843698274e-06, "loss": 12.5313, "step": 3410 }, { "epoch": 0.15177991911727995, "grad_norm": 119.5546646118164, "learning_rate": 9.99407110102081e-06, "loss": 12.3448, "step": 3420 }, { "epoch": 0.1522237200503714, "grad_norm": 138.1034393310547, "learning_rate": 9.994053765058884e-06, "loss": 13.0286, "step": 3430 }, { "epoch": 0.15266752098346287, "grad_norm": 110.58820343017578, "learning_rate": 9.994036429096957e-06, "loss": 12.455, "step": 3440 }, { "epoch": 0.15311132191655433, "grad_norm": 168.3693084716797, "learning_rate": 9.994019093135028e-06, "loss": 12.7619, "step": 3450 }, { "epoch": 0.1535551228496458, "grad_norm": 140.2900390625, "learning_rate": 9.994001757173101e-06, "loss": 12.8144, "step": 3460 }, { "epoch": 0.15399892378273725, "grad_norm": 108.32037353515625, "learning_rate": 9.993984421211174e-06, "loss": 12.6378, "step": 3470 }, { "epoch": 0.1544427247158287, "grad_norm": 137.5443115234375, "learning_rate": 9.993967085249247e-06, "loss": 12.2405, "step": 3480 }, { "epoch": 0.15488652564892016, "grad_norm": 157.03700256347656, "learning_rate": 9.993949749287319e-06, "loss": 12.654, "step": 3490 }, { "epoch": 0.15533032658201165, "grad_norm": 145.71405029296875, "learning_rate": 9.993932413325392e-06, "loss": 12.7073, "step": 3500 }, { "epoch": 0.1557741275151031, "grad_norm": 130.26255798339844, "learning_rate": 9.993915077363465e-06, "loss": 12.0506, "step": 3510 }, { "epoch": 0.15621792844819457, "grad_norm": 131.19674682617188, "learning_rate": 9.993897741401536e-06, "loss": 12.4799, "step": 3520 }, { "epoch": 0.15666172938128603, "grad_norm": 116.03231811523438, "learning_rate": 9.99388040543961e-06, "loss": 12.8077, "step": 3530 }, { "epoch": 0.1571055303143775, "grad_norm": 117.18372344970703, "learning_rate": 9.993863069477682e-06, "loss": 12.754, "step": 3540 }, { "epoch": 0.15754933124746895, "grad_norm": 141.51046752929688, "learning_rate": 9.993845733515754e-06, "loss": 12.9482, "step": 3550 }, { "epoch": 0.1579931321805604, "grad_norm": 145.37522888183594, "learning_rate": 9.993828397553827e-06, "loss": 12.9102, "step": 3560 }, { "epoch": 0.15843693311365187, "grad_norm": 121.03300476074219, "learning_rate": 9.9938110615919e-06, "loss": 12.5428, "step": 3570 }, { "epoch": 0.15888073404674333, "grad_norm": 97.84991455078125, "learning_rate": 9.993793725629971e-06, "loss": 12.2027, "step": 3580 }, { "epoch": 0.1593245349798348, "grad_norm": 139.6411590576172, "learning_rate": 9.993776389668044e-06, "loss": 12.6947, "step": 3590 }, { "epoch": 0.15976833591292625, "grad_norm": 128.70204162597656, "learning_rate": 9.993759053706118e-06, "loss": 12.7905, "step": 3600 }, { "epoch": 0.1602121368460177, "grad_norm": 118.03040313720703, "learning_rate": 9.99374171774419e-06, "loss": 12.8123, "step": 3610 }, { "epoch": 0.1606559377791092, "grad_norm": 111.89470672607422, "learning_rate": 9.993724381782262e-06, "loss": 12.5624, "step": 3620 }, { "epoch": 0.16109973871220065, "grad_norm": 129.9048614501953, "learning_rate": 9.993707045820335e-06, "loss": 12.6917, "step": 3630 }, { "epoch": 0.1615435396452921, "grad_norm": 127.17546844482422, "learning_rate": 9.993689709858408e-06, "loss": 13.1546, "step": 3640 }, { "epoch": 0.16198734057838357, "grad_norm": 117.04710388183594, "learning_rate": 9.99367237389648e-06, "loss": 12.452, "step": 3650 }, { "epoch": 0.16243114151147503, "grad_norm": 133.599853515625, "learning_rate": 9.993655037934553e-06, "loss": 12.5182, "step": 3660 }, { "epoch": 0.1628749424445665, "grad_norm": 158.9300994873047, "learning_rate": 9.993637701972626e-06, "loss": 12.0698, "step": 3670 }, { "epoch": 0.16331874337765795, "grad_norm": 125.10253143310547, "learning_rate": 9.993620366010697e-06, "loss": 12.5825, "step": 3680 }, { "epoch": 0.1637625443107494, "grad_norm": 129.07908630371094, "learning_rate": 9.99360303004877e-06, "loss": 12.5439, "step": 3690 }, { "epoch": 0.16420634524384087, "grad_norm": 119.1418685913086, "learning_rate": 9.993585694086843e-06, "loss": 12.564, "step": 3700 }, { "epoch": 0.16465014617693233, "grad_norm": 128.37750244140625, "learning_rate": 9.993568358124915e-06, "loss": 12.9106, "step": 3710 }, { "epoch": 0.1650939471100238, "grad_norm": 142.9105987548828, "learning_rate": 9.993551022162988e-06, "loss": 12.9379, "step": 3720 }, { "epoch": 0.16553774804311527, "grad_norm": 118.88358306884766, "learning_rate": 9.99353368620106e-06, "loss": 12.3977, "step": 3730 }, { "epoch": 0.16598154897620673, "grad_norm": 118.91588592529297, "learning_rate": 9.993516350239134e-06, "loss": 13.2059, "step": 3740 }, { "epoch": 0.1664253499092982, "grad_norm": 139.88050842285156, "learning_rate": 9.993499014277205e-06, "loss": 13.0386, "step": 3750 }, { "epoch": 0.16686915084238965, "grad_norm": 162.86024475097656, "learning_rate": 9.993481678315278e-06, "loss": 12.238, "step": 3760 }, { "epoch": 0.1673129517754811, "grad_norm": 127.96647644042969, "learning_rate": 9.993464342353351e-06, "loss": 13.2272, "step": 3770 }, { "epoch": 0.16775675270857257, "grad_norm": 112.12999725341797, "learning_rate": 9.993447006391423e-06, "loss": 12.5457, "step": 3780 }, { "epoch": 0.16820055364166403, "grad_norm": 155.6081085205078, "learning_rate": 9.993429670429496e-06, "loss": 12.3183, "step": 3790 }, { "epoch": 0.1686443545747555, "grad_norm": 138.3584442138672, "learning_rate": 9.993412334467569e-06, "loss": 12.4889, "step": 3800 }, { "epoch": 0.16908815550784695, "grad_norm": 115.07450866699219, "learning_rate": 9.99339499850564e-06, "loss": 12.9653, "step": 3810 }, { "epoch": 0.1695319564409384, "grad_norm": 126.59088897705078, "learning_rate": 9.993377662543713e-06, "loss": 12.9003, "step": 3820 }, { "epoch": 0.16997575737402987, "grad_norm": 108.5206069946289, "learning_rate": 9.993360326581786e-06, "loss": 13.1251, "step": 3830 }, { "epoch": 0.17041955830712133, "grad_norm": 124.42080688476562, "learning_rate": 9.993342990619858e-06, "loss": 12.9417, "step": 3840 }, { "epoch": 0.17086335924021281, "grad_norm": 106.24859619140625, "learning_rate": 9.993325654657931e-06, "loss": 12.3298, "step": 3850 }, { "epoch": 0.17130716017330427, "grad_norm": 112.69037628173828, "learning_rate": 9.993308318696004e-06, "loss": 12.3989, "step": 3860 }, { "epoch": 0.17175096110639573, "grad_norm": 154.77882385253906, "learning_rate": 9.993290982734077e-06, "loss": 13.245, "step": 3870 }, { "epoch": 0.1721947620394872, "grad_norm": 123.77738952636719, "learning_rate": 9.993273646772148e-06, "loss": 12.6657, "step": 3880 }, { "epoch": 0.17263856297257865, "grad_norm": 126.2992172241211, "learning_rate": 9.993256310810222e-06, "loss": 12.3853, "step": 3890 }, { "epoch": 0.1730823639056701, "grad_norm": 112.3527603149414, "learning_rate": 9.993238974848295e-06, "loss": 13.0207, "step": 3900 }, { "epoch": 0.17352616483876157, "grad_norm": 106.91303253173828, "learning_rate": 9.993221638886366e-06, "loss": 11.9467, "step": 3910 }, { "epoch": 0.17396996577185303, "grad_norm": 122.75757598876953, "learning_rate": 9.993204302924439e-06, "loss": 12.1821, "step": 3920 }, { "epoch": 0.1744137667049445, "grad_norm": 116.6715316772461, "learning_rate": 9.993186966962512e-06, "loss": 12.1554, "step": 3930 }, { "epoch": 0.17485756763803595, "grad_norm": 100.55429077148438, "learning_rate": 9.993169631000584e-06, "loss": 12.6786, "step": 3940 }, { "epoch": 0.1753013685711274, "grad_norm": 116.2847671508789, "learning_rate": 9.993152295038657e-06, "loss": 11.9186, "step": 3950 }, { "epoch": 0.1757451695042189, "grad_norm": 107.30577087402344, "learning_rate": 9.99313495907673e-06, "loss": 12.8207, "step": 3960 }, { "epoch": 0.17618897043731035, "grad_norm": 103.84146118164062, "learning_rate": 9.993117623114803e-06, "loss": 12.4032, "step": 3970 }, { "epoch": 0.17663277137040181, "grad_norm": 108.5525131225586, "learning_rate": 9.993100287152874e-06, "loss": 12.3449, "step": 3980 }, { "epoch": 0.17707657230349327, "grad_norm": 132.21353149414062, "learning_rate": 9.993082951190947e-06, "loss": 12.8847, "step": 3990 }, { "epoch": 0.17752037323658473, "grad_norm": 114.80290985107422, "learning_rate": 9.99306561522902e-06, "loss": 12.5004, "step": 4000 }, { "epoch": 0.17752037323658473, "eval_loss": 0.38956519961357117, "eval_runtime": 673.1566, "eval_samples_per_second": 1804.024, "eval_steps_per_second": 56.376, "step": 4000 }, { "epoch": 0.1779641741696762, "grad_norm": 123.59729766845703, "learning_rate": 9.993048279267092e-06, "loss": 11.8187, "step": 4010 }, { "epoch": 0.17840797510276765, "grad_norm": 121.13542175292969, "learning_rate": 9.993030943305165e-06, "loss": 12.569, "step": 4020 }, { "epoch": 0.1788517760358591, "grad_norm": 112.8544692993164, "learning_rate": 9.993013607343238e-06, "loss": 12.4908, "step": 4030 }, { "epoch": 0.17929557696895057, "grad_norm": 108.49791717529297, "learning_rate": 9.99299627138131e-06, "loss": 12.1717, "step": 4040 }, { "epoch": 0.17973937790204203, "grad_norm": 112.78793334960938, "learning_rate": 9.992978935419382e-06, "loss": 12.4246, "step": 4050 }, { "epoch": 0.1801831788351335, "grad_norm": 111.55317687988281, "learning_rate": 9.992961599457455e-06, "loss": 12.5279, "step": 4060 }, { "epoch": 0.18062697976822495, "grad_norm": 129.81776428222656, "learning_rate": 9.992944263495527e-06, "loss": 12.0691, "step": 4070 }, { "epoch": 0.18107078070131644, "grad_norm": 159.69786071777344, "learning_rate": 9.9929269275336e-06, "loss": 12.4083, "step": 4080 }, { "epoch": 0.1815145816344079, "grad_norm": 119.66799926757812, "learning_rate": 9.992909591571673e-06, "loss": 12.1316, "step": 4090 }, { "epoch": 0.18195838256749935, "grad_norm": 143.12181091308594, "learning_rate": 9.992892255609746e-06, "loss": 12.5121, "step": 4100 }, { "epoch": 0.18240218350059081, "grad_norm": 153.3807373046875, "learning_rate": 9.992874919647817e-06, "loss": 12.0202, "step": 4110 }, { "epoch": 0.18284598443368227, "grad_norm": 127.23503875732422, "learning_rate": 9.99285758368589e-06, "loss": 12.3906, "step": 4120 }, { "epoch": 0.18328978536677373, "grad_norm": 130.62158203125, "learning_rate": 9.992840247723964e-06, "loss": 12.0698, "step": 4130 }, { "epoch": 0.1837335862998652, "grad_norm": 126.1018295288086, "learning_rate": 9.992822911762035e-06, "loss": 12.4342, "step": 4140 }, { "epoch": 0.18417738723295665, "grad_norm": 131.96340942382812, "learning_rate": 9.992805575800108e-06, "loss": 12.6071, "step": 4150 }, { "epoch": 0.1846211881660481, "grad_norm": 130.8789520263672, "learning_rate": 9.992788239838181e-06, "loss": 12.323, "step": 4160 }, { "epoch": 0.18506498909913957, "grad_norm": 125.24624633789062, "learning_rate": 9.992770903876252e-06, "loss": 11.9601, "step": 4170 }, { "epoch": 0.18550879003223103, "grad_norm": 107.27322387695312, "learning_rate": 9.992753567914326e-06, "loss": 12.4659, "step": 4180 }, { "epoch": 0.18595259096532252, "grad_norm": 118.50186920166016, "learning_rate": 9.992736231952399e-06, "loss": 12.4699, "step": 4190 }, { "epoch": 0.18639639189841398, "grad_norm": 132.64605712890625, "learning_rate": 9.99271889599047e-06, "loss": 12.551, "step": 4200 }, { "epoch": 0.18684019283150544, "grad_norm": 129.10557556152344, "learning_rate": 9.992701560028543e-06, "loss": 12.5445, "step": 4210 }, { "epoch": 0.1872839937645969, "grad_norm": 118.41376495361328, "learning_rate": 9.992684224066616e-06, "loss": 12.4178, "step": 4220 }, { "epoch": 0.18772779469768835, "grad_norm": 115.65239715576172, "learning_rate": 9.992666888104688e-06, "loss": 12.4483, "step": 4230 }, { "epoch": 0.18817159563077981, "grad_norm": 125.76007080078125, "learning_rate": 9.99264955214276e-06, "loss": 13.088, "step": 4240 }, { "epoch": 0.18861539656387127, "grad_norm": 88.38092041015625, "learning_rate": 9.992632216180834e-06, "loss": 12.9315, "step": 4250 }, { "epoch": 0.18905919749696273, "grad_norm": 104.6368179321289, "learning_rate": 9.992614880218907e-06, "loss": 12.2731, "step": 4260 }, { "epoch": 0.1895029984300542, "grad_norm": 127.25080871582031, "learning_rate": 9.992597544256978e-06, "loss": 12.2454, "step": 4270 }, { "epoch": 0.18994679936314565, "grad_norm": 101.77681732177734, "learning_rate": 9.992580208295051e-06, "loss": 12.4424, "step": 4280 }, { "epoch": 0.1903906002962371, "grad_norm": 136.33221435546875, "learning_rate": 9.992562872333124e-06, "loss": 11.9918, "step": 4290 }, { "epoch": 0.1908344012293286, "grad_norm": 112.94047546386719, "learning_rate": 9.992545536371196e-06, "loss": 12.4194, "step": 4300 }, { "epoch": 0.19127820216242006, "grad_norm": 132.7883758544922, "learning_rate": 9.992528200409269e-06, "loss": 12.3539, "step": 4310 }, { "epoch": 0.19172200309551152, "grad_norm": 115.22777557373047, "learning_rate": 9.992510864447342e-06, "loss": 12.176, "step": 4320 }, { "epoch": 0.19216580402860298, "grad_norm": 133.0673828125, "learning_rate": 9.992493528485413e-06, "loss": 12.1488, "step": 4330 }, { "epoch": 0.19260960496169444, "grad_norm": 154.4739227294922, "learning_rate": 9.992476192523486e-06, "loss": 12.4385, "step": 4340 }, { "epoch": 0.1930534058947859, "grad_norm": 169.39413452148438, "learning_rate": 9.99245885656156e-06, "loss": 12.2436, "step": 4350 }, { "epoch": 0.19349720682787735, "grad_norm": 127.55028533935547, "learning_rate": 9.99244152059963e-06, "loss": 12.1869, "step": 4360 }, { "epoch": 0.19394100776096881, "grad_norm": 124.5598373413086, "learning_rate": 9.992424184637704e-06, "loss": 12.6008, "step": 4370 }, { "epoch": 0.19438480869406027, "grad_norm": 140.38067626953125, "learning_rate": 9.992406848675777e-06, "loss": 12.8844, "step": 4380 }, { "epoch": 0.19482860962715173, "grad_norm": 111.26557922363281, "learning_rate": 9.992389512713848e-06, "loss": 12.7417, "step": 4390 }, { "epoch": 0.1952724105602432, "grad_norm": 109.17289733886719, "learning_rate": 9.992372176751921e-06, "loss": 12.5357, "step": 4400 }, { "epoch": 0.19571621149333465, "grad_norm": 93.85588073730469, "learning_rate": 9.992354840789995e-06, "loss": 13.057, "step": 4410 }, { "epoch": 0.19616001242642614, "grad_norm": 132.5755157470703, "learning_rate": 9.992337504828066e-06, "loss": 11.8174, "step": 4420 }, { "epoch": 0.1966038133595176, "grad_norm": 137.10018920898438, "learning_rate": 9.992320168866139e-06, "loss": 12.8481, "step": 4430 }, { "epoch": 0.19704761429260906, "grad_norm": 114.44795989990234, "learning_rate": 9.992302832904212e-06, "loss": 12.6655, "step": 4440 }, { "epoch": 0.19749141522570052, "grad_norm": 101.33007049560547, "learning_rate": 9.992285496942283e-06, "loss": 11.2619, "step": 4450 }, { "epoch": 0.19793521615879198, "grad_norm": 121.8523941040039, "learning_rate": 9.992268160980357e-06, "loss": 11.8743, "step": 4460 }, { "epoch": 0.19837901709188344, "grad_norm": 133.41293334960938, "learning_rate": 9.99225082501843e-06, "loss": 12.4099, "step": 4470 }, { "epoch": 0.1988228180249749, "grad_norm": 121.90374755859375, "learning_rate": 9.992233489056503e-06, "loss": 12.8135, "step": 4480 }, { "epoch": 0.19926661895806635, "grad_norm": 108.74071502685547, "learning_rate": 9.992216153094574e-06, "loss": 11.9765, "step": 4490 }, { "epoch": 0.19971041989115781, "grad_norm": 112.89215850830078, "learning_rate": 9.992198817132647e-06, "loss": 12.9006, "step": 4500 }, { "epoch": 0.20015422082424927, "grad_norm": 132.44119262695312, "learning_rate": 9.99218148117072e-06, "loss": 12.6348, "step": 4510 }, { "epoch": 0.20059802175734073, "grad_norm": 132.33868408203125, "learning_rate": 9.992164145208792e-06, "loss": 12.3398, "step": 4520 }, { "epoch": 0.20104182269043222, "grad_norm": 125.20413970947266, "learning_rate": 9.992146809246865e-06, "loss": 12.8135, "step": 4530 }, { "epoch": 0.20148562362352368, "grad_norm": 104.17672729492188, "learning_rate": 9.992129473284938e-06, "loss": 12.1691, "step": 4540 }, { "epoch": 0.20192942455661514, "grad_norm": 145.2282257080078, "learning_rate": 9.992112137323009e-06, "loss": 12.2538, "step": 4550 }, { "epoch": 0.2023732254897066, "grad_norm": 130.9412078857422, "learning_rate": 9.992094801361082e-06, "loss": 12.1611, "step": 4560 }, { "epoch": 0.20281702642279806, "grad_norm": 104.62641143798828, "learning_rate": 9.992077465399155e-06, "loss": 12.3785, "step": 4570 }, { "epoch": 0.20326082735588952, "grad_norm": 106.15484619140625, "learning_rate": 9.992060129437227e-06, "loss": 12.139, "step": 4580 }, { "epoch": 0.20370462828898098, "grad_norm": 135.23989868164062, "learning_rate": 9.9920427934753e-06, "loss": 11.6559, "step": 4590 }, { "epoch": 0.20414842922207244, "grad_norm": 129.04571533203125, "learning_rate": 9.992025457513373e-06, "loss": 12.2296, "step": 4600 }, { "epoch": 0.2045922301551639, "grad_norm": 100.73985290527344, "learning_rate": 9.992008121551444e-06, "loss": 12.429, "step": 4610 }, { "epoch": 0.20503603108825535, "grad_norm": 120.50385284423828, "learning_rate": 9.991990785589517e-06, "loss": 12.6906, "step": 4620 }, { "epoch": 0.20547983202134681, "grad_norm": 119.28755187988281, "learning_rate": 9.99197344962759e-06, "loss": 12.0725, "step": 4630 }, { "epoch": 0.20592363295443827, "grad_norm": 122.34222412109375, "learning_rate": 9.991956113665662e-06, "loss": 12.5552, "step": 4640 }, { "epoch": 0.20636743388752976, "grad_norm": 123.09967041015625, "learning_rate": 9.991938777703735e-06, "loss": 11.6526, "step": 4650 }, { "epoch": 0.20681123482062122, "grad_norm": 97.26423645019531, "learning_rate": 9.991921441741808e-06, "loss": 12.1968, "step": 4660 }, { "epoch": 0.20725503575371268, "grad_norm": 114.7996826171875, "learning_rate": 9.99190410577988e-06, "loss": 12.6007, "step": 4670 }, { "epoch": 0.20769883668680414, "grad_norm": 140.515869140625, "learning_rate": 9.991886769817952e-06, "loss": 12.2736, "step": 4680 }, { "epoch": 0.2081426376198956, "grad_norm": 125.37648010253906, "learning_rate": 9.991869433856025e-06, "loss": 12.1014, "step": 4690 }, { "epoch": 0.20858643855298706, "grad_norm": 128.2080078125, "learning_rate": 9.991852097894099e-06, "loss": 11.7794, "step": 4700 }, { "epoch": 0.20903023948607852, "grad_norm": 115.89141082763672, "learning_rate": 9.99183476193217e-06, "loss": 12.2364, "step": 4710 }, { "epoch": 0.20947404041916998, "grad_norm": 118.23058319091797, "learning_rate": 9.991817425970243e-06, "loss": 11.7807, "step": 4720 }, { "epoch": 0.20991784135226144, "grad_norm": 120.32998657226562, "learning_rate": 9.991800090008316e-06, "loss": 12.3347, "step": 4730 }, { "epoch": 0.2103616422853529, "grad_norm": 131.60520935058594, "learning_rate": 9.991782754046387e-06, "loss": 12.5857, "step": 4740 }, { "epoch": 0.21080544321844435, "grad_norm": 123.4472885131836, "learning_rate": 9.99176541808446e-06, "loss": 12.5721, "step": 4750 }, { "epoch": 0.21124924415153584, "grad_norm": 115.33013153076172, "learning_rate": 9.991748082122534e-06, "loss": 12.2645, "step": 4760 }, { "epoch": 0.2116930450846273, "grad_norm": 128.06332397460938, "learning_rate": 9.991730746160605e-06, "loss": 11.9115, "step": 4770 }, { "epoch": 0.21213684601771876, "grad_norm": 133.6379852294922, "learning_rate": 9.991713410198678e-06, "loss": 12.2299, "step": 4780 }, { "epoch": 0.21258064695081022, "grad_norm": 99.40995025634766, "learning_rate": 9.991696074236751e-06, "loss": 12.232, "step": 4790 }, { "epoch": 0.21302444788390168, "grad_norm": 128.07489013671875, "learning_rate": 9.991678738274823e-06, "loss": 12.5043, "step": 4800 }, { "epoch": 0.21346824881699314, "grad_norm": 106.18391418457031, "learning_rate": 9.991661402312896e-06, "loss": 12.515, "step": 4810 }, { "epoch": 0.2139120497500846, "grad_norm": 103.52220153808594, "learning_rate": 9.991644066350969e-06, "loss": 12.5614, "step": 4820 }, { "epoch": 0.21435585068317606, "grad_norm": 144.36888122558594, "learning_rate": 9.99162673038904e-06, "loss": 12.0411, "step": 4830 }, { "epoch": 0.21479965161626752, "grad_norm": 119.41954803466797, "learning_rate": 9.991609394427113e-06, "loss": 12.0378, "step": 4840 }, { "epoch": 0.21524345254935898, "grad_norm": 97.38760375976562, "learning_rate": 9.991592058465186e-06, "loss": 12.0504, "step": 4850 }, { "epoch": 0.21568725348245044, "grad_norm": 123.15618896484375, "learning_rate": 9.991574722503258e-06, "loss": 12.4191, "step": 4860 }, { "epoch": 0.2161310544155419, "grad_norm": 121.77255249023438, "learning_rate": 9.99155738654133e-06, "loss": 11.9067, "step": 4870 }, { "epoch": 0.21657485534863338, "grad_norm": 128.0386962890625, "learning_rate": 9.991540050579404e-06, "loss": 12.9843, "step": 4880 }, { "epoch": 0.21701865628172484, "grad_norm": 102.69422149658203, "learning_rate": 9.991522714617475e-06, "loss": 12.2082, "step": 4890 }, { "epoch": 0.2174624572148163, "grad_norm": 125.6636734008789, "learning_rate": 9.991505378655548e-06, "loss": 12.5348, "step": 4900 }, { "epoch": 0.21790625814790776, "grad_norm": 135.658203125, "learning_rate": 9.991488042693621e-06, "loss": 11.9152, "step": 4910 }, { "epoch": 0.21835005908099922, "grad_norm": 99.29556274414062, "learning_rate": 9.991470706731694e-06, "loss": 12.5844, "step": 4920 }, { "epoch": 0.21879386001409068, "grad_norm": 95.45423126220703, "learning_rate": 9.991453370769766e-06, "loss": 12.926, "step": 4930 }, { "epoch": 0.21923766094718214, "grad_norm": 118.17921447753906, "learning_rate": 9.991436034807839e-06, "loss": 12.2757, "step": 4940 }, { "epoch": 0.2196814618802736, "grad_norm": 120.35387420654297, "learning_rate": 9.991418698845912e-06, "loss": 12.4541, "step": 4950 }, { "epoch": 0.22012526281336506, "grad_norm": 109.4905776977539, "learning_rate": 9.991401362883983e-06, "loss": 11.7813, "step": 4960 }, { "epoch": 0.22056906374645652, "grad_norm": 102.00797271728516, "learning_rate": 9.991384026922056e-06, "loss": 11.9456, "step": 4970 }, { "epoch": 0.22101286467954798, "grad_norm": 104.43452453613281, "learning_rate": 9.99136669096013e-06, "loss": 11.8744, "step": 4980 }, { "epoch": 0.22145666561263946, "grad_norm": 91.9101791381836, "learning_rate": 9.991349354998201e-06, "loss": 12.1663, "step": 4990 }, { "epoch": 0.22190046654573092, "grad_norm": 139.61109924316406, "learning_rate": 9.991332019036274e-06, "loss": 12.5729, "step": 5000 }, { "epoch": 0.22190046654573092, "eval_loss": 0.37864938378334045, "eval_runtime": 675.391, "eval_samples_per_second": 1798.056, "eval_steps_per_second": 56.19, "step": 5000 }, { "epoch": 0.22234426747882238, "grad_norm": 100.71528625488281, "learning_rate": 9.991314683074347e-06, "loss": 11.8638, "step": 5010 }, { "epoch": 0.22278806841191384, "grad_norm": 114.36324310302734, "learning_rate": 9.991297347112418e-06, "loss": 12.7659, "step": 5020 }, { "epoch": 0.2232318693450053, "grad_norm": 123.63908386230469, "learning_rate": 9.991280011150491e-06, "loss": 12.2795, "step": 5030 }, { "epoch": 0.22367567027809676, "grad_norm": 120.35507202148438, "learning_rate": 9.991262675188565e-06, "loss": 12.277, "step": 5040 }, { "epoch": 0.22411947121118822, "grad_norm": 109.37151336669922, "learning_rate": 9.991245339226636e-06, "loss": 11.6699, "step": 5050 }, { "epoch": 0.22456327214427968, "grad_norm": 105.27191925048828, "learning_rate": 9.991228003264709e-06, "loss": 12.0232, "step": 5060 }, { "epoch": 0.22500707307737114, "grad_norm": 129.59083557128906, "learning_rate": 9.991210667302782e-06, "loss": 11.7139, "step": 5070 }, { "epoch": 0.2254508740104626, "grad_norm": 102.64170837402344, "learning_rate": 9.991193331340853e-06, "loss": 12.3, "step": 5080 }, { "epoch": 0.22589467494355406, "grad_norm": 97.96053314208984, "learning_rate": 9.991175995378927e-06, "loss": 12.2807, "step": 5090 }, { "epoch": 0.22633847587664552, "grad_norm": 111.13671112060547, "learning_rate": 9.991158659417e-06, "loss": 12.2035, "step": 5100 }, { "epoch": 0.226782276809737, "grad_norm": 127.20590209960938, "learning_rate": 9.991141323455071e-06, "loss": 11.7232, "step": 5110 }, { "epoch": 0.22722607774282846, "grad_norm": 128.55152893066406, "learning_rate": 9.991123987493144e-06, "loss": 12.2339, "step": 5120 }, { "epoch": 0.22766987867591992, "grad_norm": 152.3295135498047, "learning_rate": 9.991106651531217e-06, "loss": 12.0261, "step": 5130 }, { "epoch": 0.22811367960901138, "grad_norm": 125.02354431152344, "learning_rate": 9.99108931556929e-06, "loss": 11.9501, "step": 5140 }, { "epoch": 0.22855748054210284, "grad_norm": 126.0276107788086, "learning_rate": 9.991071979607362e-06, "loss": 11.9311, "step": 5150 }, { "epoch": 0.2290012814751943, "grad_norm": 101.98572540283203, "learning_rate": 9.991054643645435e-06, "loss": 11.7741, "step": 5160 }, { "epoch": 0.22944508240828576, "grad_norm": 93.39753723144531, "learning_rate": 9.991037307683508e-06, "loss": 12.3775, "step": 5170 }, { "epoch": 0.22988888334137722, "grad_norm": 119.41414642333984, "learning_rate": 9.99101997172158e-06, "loss": 12.3944, "step": 5180 }, { "epoch": 0.23033268427446868, "grad_norm": 102.53070068359375, "learning_rate": 9.991002635759652e-06, "loss": 11.7407, "step": 5190 }, { "epoch": 0.23077648520756014, "grad_norm": 114.39293670654297, "learning_rate": 9.990985299797725e-06, "loss": 12.0433, "step": 5200 }, { "epoch": 0.2312202861406516, "grad_norm": 95.0862808227539, "learning_rate": 9.990967963835797e-06, "loss": 11.8278, "step": 5210 }, { "epoch": 0.23166408707374309, "grad_norm": 95.9500732421875, "learning_rate": 9.99095062787387e-06, "loss": 12.0699, "step": 5220 }, { "epoch": 0.23210788800683455, "grad_norm": 108.53816223144531, "learning_rate": 9.990933291911943e-06, "loss": 12.0234, "step": 5230 }, { "epoch": 0.232551688939926, "grad_norm": 108.30989837646484, "learning_rate": 9.990915955950014e-06, "loss": 11.899, "step": 5240 }, { "epoch": 0.23299548987301746, "grad_norm": 103.17509460449219, "learning_rate": 9.990898619988087e-06, "loss": 12.0308, "step": 5250 }, { "epoch": 0.23343929080610892, "grad_norm": 115.72237396240234, "learning_rate": 9.99088128402616e-06, "loss": 11.7059, "step": 5260 }, { "epoch": 0.23388309173920038, "grad_norm": 120.50923919677734, "learning_rate": 9.990863948064232e-06, "loss": 12.0906, "step": 5270 }, { "epoch": 0.23432689267229184, "grad_norm": 103.17623138427734, "learning_rate": 9.990846612102305e-06, "loss": 11.5532, "step": 5280 }, { "epoch": 0.2347706936053833, "grad_norm": 116.489013671875, "learning_rate": 9.990829276140378e-06, "loss": 11.8999, "step": 5290 }, { "epoch": 0.23521449453847476, "grad_norm": 102.14614868164062, "learning_rate": 9.99081194017845e-06, "loss": 12.2294, "step": 5300 }, { "epoch": 0.23565829547156622, "grad_norm": 127.21760559082031, "learning_rate": 9.990794604216522e-06, "loss": 12.0739, "step": 5310 }, { "epoch": 0.23610209640465768, "grad_norm": 129.3851318359375, "learning_rate": 9.990777268254595e-06, "loss": 12.2383, "step": 5320 }, { "epoch": 0.23654589733774914, "grad_norm": 107.56005096435547, "learning_rate": 9.990759932292667e-06, "loss": 11.7989, "step": 5330 }, { "epoch": 0.23698969827084063, "grad_norm": 96.8632583618164, "learning_rate": 9.99074259633074e-06, "loss": 12.1831, "step": 5340 }, { "epoch": 0.23743349920393209, "grad_norm": 106.97894287109375, "learning_rate": 9.990725260368813e-06, "loss": 12.0481, "step": 5350 }, { "epoch": 0.23787730013702355, "grad_norm": 91.43546295166016, "learning_rate": 9.990707924406886e-06, "loss": 11.3025, "step": 5360 }, { "epoch": 0.238321101070115, "grad_norm": 104.5177001953125, "learning_rate": 9.990690588444957e-06, "loss": 11.5902, "step": 5370 }, { "epoch": 0.23876490200320646, "grad_norm": 117.70657348632812, "learning_rate": 9.99067325248303e-06, "loss": 11.7063, "step": 5380 }, { "epoch": 0.23920870293629792, "grad_norm": 113.05216979980469, "learning_rate": 9.990655916521104e-06, "loss": 12.0053, "step": 5390 }, { "epoch": 0.23965250386938938, "grad_norm": 115.97754669189453, "learning_rate": 9.990638580559175e-06, "loss": 12.3194, "step": 5400 }, { "epoch": 0.24009630480248084, "grad_norm": 110.10850524902344, "learning_rate": 9.990621244597248e-06, "loss": 11.2671, "step": 5410 }, { "epoch": 0.2405401057355723, "grad_norm": 110.06661987304688, "learning_rate": 9.990603908635321e-06, "loss": 12.2819, "step": 5420 }, { "epoch": 0.24098390666866376, "grad_norm": 126.98220825195312, "learning_rate": 9.990586572673393e-06, "loss": 12.0827, "step": 5430 }, { "epoch": 0.24142770760175522, "grad_norm": 101.59259033203125, "learning_rate": 9.990569236711466e-06, "loss": 11.5167, "step": 5440 }, { "epoch": 0.2418715085348467, "grad_norm": 91.08474731445312, "learning_rate": 9.990551900749539e-06, "loss": 11.4829, "step": 5450 }, { "epoch": 0.24231530946793817, "grad_norm": 100.78240966796875, "learning_rate": 9.99053456478761e-06, "loss": 12.0629, "step": 5460 }, { "epoch": 0.24275911040102963, "grad_norm": 121.29551696777344, "learning_rate": 9.990517228825683e-06, "loss": 11.9303, "step": 5470 }, { "epoch": 0.24320291133412109, "grad_norm": 100.81685638427734, "learning_rate": 9.990499892863756e-06, "loss": 11.4778, "step": 5480 }, { "epoch": 0.24364671226721255, "grad_norm": 134.68348693847656, "learning_rate": 9.990482556901828e-06, "loss": 12.5336, "step": 5490 }, { "epoch": 0.244090513200304, "grad_norm": 103.34284973144531, "learning_rate": 9.9904652209399e-06, "loss": 11.9692, "step": 5500 }, { "epoch": 0.24453431413339546, "grad_norm": 99.77964782714844, "learning_rate": 9.990447884977974e-06, "loss": 11.5971, "step": 5510 }, { "epoch": 0.24497811506648692, "grad_norm": 93.10983276367188, "learning_rate": 9.990430549016045e-06, "loss": 12.3927, "step": 5520 }, { "epoch": 0.24542191599957838, "grad_norm": 109.86616516113281, "learning_rate": 9.990413213054118e-06, "loss": 11.945, "step": 5530 }, { "epoch": 0.24586571693266984, "grad_norm": 112.61761474609375, "learning_rate": 9.990395877092191e-06, "loss": 11.7543, "step": 5540 }, { "epoch": 0.2463095178657613, "grad_norm": 115.5595703125, "learning_rate": 9.990378541130263e-06, "loss": 12.0871, "step": 5550 }, { "epoch": 0.24675331879885276, "grad_norm": 108.1959228515625, "learning_rate": 9.990361205168336e-06, "loss": 12.3206, "step": 5560 }, { "epoch": 0.24719711973194425, "grad_norm": 102.17387390136719, "learning_rate": 9.990343869206409e-06, "loss": 11.5629, "step": 5570 }, { "epoch": 0.2476409206650357, "grad_norm": 107.28736114501953, "learning_rate": 9.990326533244482e-06, "loss": 11.5035, "step": 5580 }, { "epoch": 0.24808472159812717, "grad_norm": 101.89647674560547, "learning_rate": 9.990309197282553e-06, "loss": 12.1624, "step": 5590 }, { "epoch": 0.24852852253121863, "grad_norm": 86.52133178710938, "learning_rate": 9.990291861320626e-06, "loss": 12.3599, "step": 5600 }, { "epoch": 0.24897232346431009, "grad_norm": 118.91706848144531, "learning_rate": 9.9902745253587e-06, "loss": 12.4713, "step": 5610 }, { "epoch": 0.24941612439740155, "grad_norm": 92.26959991455078, "learning_rate": 9.990257189396771e-06, "loss": 11.8632, "step": 5620 }, { "epoch": 0.249859925330493, "grad_norm": 96.55217742919922, "learning_rate": 9.990239853434844e-06, "loss": 11.7402, "step": 5630 }, { "epoch": 0.2503037262635845, "grad_norm": 108.4555892944336, "learning_rate": 9.990222517472917e-06, "loss": 11.7055, "step": 5640 }, { "epoch": 0.2507475271966759, "grad_norm": 103.9815902709961, "learning_rate": 9.990205181510988e-06, "loss": 12.0786, "step": 5650 }, { "epoch": 0.2511913281297674, "grad_norm": 112.2682876586914, "learning_rate": 9.990187845549062e-06, "loss": 12.11, "step": 5660 }, { "epoch": 0.25163512906285884, "grad_norm": 90.49382019042969, "learning_rate": 9.990170509587135e-06, "loss": 12.3861, "step": 5670 }, { "epoch": 0.25207892999595033, "grad_norm": 115.12799072265625, "learning_rate": 9.990153173625206e-06, "loss": 12.0726, "step": 5680 }, { "epoch": 0.25252273092904176, "grad_norm": 102.43321990966797, "learning_rate": 9.990135837663279e-06, "loss": 11.9243, "step": 5690 }, { "epoch": 0.25296653186213325, "grad_norm": 133.68328857421875, "learning_rate": 9.990118501701352e-06, "loss": 12.2475, "step": 5700 }, { "epoch": 0.2534103327952247, "grad_norm": 103.25040435791016, "learning_rate": 9.990101165739424e-06, "loss": 11.596, "step": 5710 }, { "epoch": 0.25385413372831617, "grad_norm": 106.11602783203125, "learning_rate": 9.990083829777497e-06, "loss": 12.3109, "step": 5720 }, { "epoch": 0.2542979346614076, "grad_norm": 99.80020904541016, "learning_rate": 9.99006649381557e-06, "loss": 11.2771, "step": 5730 }, { "epoch": 0.2547417355944991, "grad_norm": 115.24018859863281, "learning_rate": 9.990049157853641e-06, "loss": 11.7818, "step": 5740 }, { "epoch": 0.2551855365275906, "grad_norm": 93.93407440185547, "learning_rate": 9.990031821891714e-06, "loss": 12.2078, "step": 5750 }, { "epoch": 0.255629337460682, "grad_norm": 97.67652130126953, "learning_rate": 9.990014485929787e-06, "loss": 11.739, "step": 5760 }, { "epoch": 0.2560731383937735, "grad_norm": 117.71260070800781, "learning_rate": 9.989997149967859e-06, "loss": 11.7933, "step": 5770 }, { "epoch": 0.2565169393268649, "grad_norm": 85.63137817382812, "learning_rate": 9.989979814005932e-06, "loss": 11.5123, "step": 5780 }, { "epoch": 0.2569607402599564, "grad_norm": 91.07071685791016, "learning_rate": 9.989962478044005e-06, "loss": 11.7954, "step": 5790 }, { "epoch": 0.25740454119304784, "grad_norm": 94.04301452636719, "learning_rate": 9.989945142082078e-06, "loss": 11.4023, "step": 5800 }, { "epoch": 0.25784834212613933, "grad_norm": 98.91206359863281, "learning_rate": 9.98992780612015e-06, "loss": 12.2875, "step": 5810 }, { "epoch": 0.25829214305923076, "grad_norm": 94.439208984375, "learning_rate": 9.989910470158222e-06, "loss": 11.4681, "step": 5820 }, { "epoch": 0.25873594399232225, "grad_norm": 96.2354507446289, "learning_rate": 9.989893134196295e-06, "loss": 11.2651, "step": 5830 }, { "epoch": 0.2591797449254137, "grad_norm": 99.5677490234375, "learning_rate": 9.989875798234367e-06, "loss": 11.7862, "step": 5840 }, { "epoch": 0.25962354585850517, "grad_norm": 117.41152954101562, "learning_rate": 9.98985846227244e-06, "loss": 12.0639, "step": 5850 }, { "epoch": 0.26006734679159665, "grad_norm": 92.46183013916016, "learning_rate": 9.989841126310513e-06, "loss": 11.5852, "step": 5860 }, { "epoch": 0.2605111477246881, "grad_norm": 116.44026947021484, "learning_rate": 9.989823790348584e-06, "loss": 11.847, "step": 5870 }, { "epoch": 0.2609549486577796, "grad_norm": 109.089111328125, "learning_rate": 9.989806454386657e-06, "loss": 11.9711, "step": 5880 }, { "epoch": 0.261398749590871, "grad_norm": 126.8874740600586, "learning_rate": 9.98978911842473e-06, "loss": 11.7392, "step": 5890 }, { "epoch": 0.2618425505239625, "grad_norm": 121.22462463378906, "learning_rate": 9.989771782462802e-06, "loss": 11.8794, "step": 5900 }, { "epoch": 0.2622863514570539, "grad_norm": 87.884521484375, "learning_rate": 9.989754446500875e-06, "loss": 11.6205, "step": 5910 }, { "epoch": 0.2627301523901454, "grad_norm": 102.1272201538086, "learning_rate": 9.989737110538948e-06, "loss": 11.8747, "step": 5920 }, { "epoch": 0.26317395332323684, "grad_norm": 100.64566802978516, "learning_rate": 9.989719774577021e-06, "loss": 12.3826, "step": 5930 }, { "epoch": 0.26361775425632833, "grad_norm": 94.28556823730469, "learning_rate": 9.989702438615092e-06, "loss": 11.7302, "step": 5940 }, { "epoch": 0.26406155518941976, "grad_norm": 109.41671752929688, "learning_rate": 9.989685102653166e-06, "loss": 11.0984, "step": 5950 }, { "epoch": 0.26450535612251125, "grad_norm": 92.14616394042969, "learning_rate": 9.989667766691239e-06, "loss": 11.6492, "step": 5960 }, { "epoch": 0.26494915705560274, "grad_norm": 95.84307861328125, "learning_rate": 9.98965043072931e-06, "loss": 12.0779, "step": 5970 }, { "epoch": 0.26539295798869417, "grad_norm": 102.63638305664062, "learning_rate": 9.989633094767383e-06, "loss": 12.4218, "step": 5980 }, { "epoch": 0.26583675892178565, "grad_norm": 110.38811492919922, "learning_rate": 9.989615758805456e-06, "loss": 12.0949, "step": 5990 }, { "epoch": 0.2662805598548771, "grad_norm": 88.99332427978516, "learning_rate": 9.989598422843528e-06, "loss": 12.2146, "step": 6000 }, { "epoch": 0.2662805598548771, "eval_loss": 0.3712849020957947, "eval_runtime": 674.6735, "eval_samples_per_second": 1799.968, "eval_steps_per_second": 56.249, "step": 6000 }, { "epoch": 0.2667243607879686, "grad_norm": 106.79357147216797, "learning_rate": 9.9895810868816e-06, "loss": 11.816, "step": 6010 }, { "epoch": 0.26716816172106, "grad_norm": 101.64129638671875, "learning_rate": 9.989563750919674e-06, "loss": 11.551, "step": 6020 }, { "epoch": 0.2676119626541515, "grad_norm": 124.5982437133789, "learning_rate": 9.989546414957745e-06, "loss": 11.2434, "step": 6030 }, { "epoch": 0.2680557635872429, "grad_norm": 86.55455780029297, "learning_rate": 9.989529078995818e-06, "loss": 11.6358, "step": 6040 }, { "epoch": 0.2684995645203344, "grad_norm": 104.90837860107422, "learning_rate": 9.989511743033891e-06, "loss": 11.7908, "step": 6050 }, { "epoch": 0.26894336545342584, "grad_norm": 91.85465240478516, "learning_rate": 9.989494407071964e-06, "loss": 11.675, "step": 6060 }, { "epoch": 0.26938716638651733, "grad_norm": 110.98849487304688, "learning_rate": 9.989477071110036e-06, "loss": 11.9682, "step": 6070 }, { "epoch": 0.2698309673196088, "grad_norm": 117.15697479248047, "learning_rate": 9.989459735148109e-06, "loss": 12.6112, "step": 6080 }, { "epoch": 0.27027476825270025, "grad_norm": 116.58313751220703, "learning_rate": 9.989442399186182e-06, "loss": 11.8428, "step": 6090 }, { "epoch": 0.27071856918579174, "grad_norm": 141.74298095703125, "learning_rate": 9.989425063224253e-06, "loss": 11.678, "step": 6100 }, { "epoch": 0.27116237011888317, "grad_norm": 117.08960723876953, "learning_rate": 9.989407727262326e-06, "loss": 11.6677, "step": 6110 }, { "epoch": 0.27160617105197465, "grad_norm": 102.23706817626953, "learning_rate": 9.9893903913004e-06, "loss": 11.5424, "step": 6120 }, { "epoch": 0.2720499719850661, "grad_norm": 100.09423065185547, "learning_rate": 9.98937305533847e-06, "loss": 12.0797, "step": 6130 }, { "epoch": 0.2724937729181576, "grad_norm": 113.59011840820312, "learning_rate": 9.989355719376544e-06, "loss": 12.0007, "step": 6140 }, { "epoch": 0.272937573851249, "grad_norm": 94.99832153320312, "learning_rate": 9.989338383414617e-06, "loss": 11.4787, "step": 6150 }, { "epoch": 0.2733813747843405, "grad_norm": 135.75721740722656, "learning_rate": 9.989321047452688e-06, "loss": 11.9264, "step": 6160 }, { "epoch": 0.2738251757174319, "grad_norm": 112.27375793457031, "learning_rate": 9.989303711490761e-06, "loss": 11.4877, "step": 6170 }, { "epoch": 0.2742689766505234, "grad_norm": 115.73841094970703, "learning_rate": 9.989286375528834e-06, "loss": 11.6287, "step": 6180 }, { "epoch": 0.27471277758361484, "grad_norm": 108.56095886230469, "learning_rate": 9.989269039566908e-06, "loss": 12.2162, "step": 6190 }, { "epoch": 0.27515657851670633, "grad_norm": 106.11347961425781, "learning_rate": 9.989251703604979e-06, "loss": 12.4926, "step": 6200 }, { "epoch": 0.2756003794497978, "grad_norm": 93.0553207397461, "learning_rate": 9.989234367643052e-06, "loss": 11.5661, "step": 6210 }, { "epoch": 0.27604418038288925, "grad_norm": 114.06632232666016, "learning_rate": 9.989217031681125e-06, "loss": 11.9102, "step": 6220 }, { "epoch": 0.27648798131598074, "grad_norm": 105.9919204711914, "learning_rate": 9.989199695719196e-06, "loss": 11.4711, "step": 6230 }, { "epoch": 0.27693178224907217, "grad_norm": 84.44627380371094, "learning_rate": 9.98918235975727e-06, "loss": 11.6304, "step": 6240 }, { "epoch": 0.27737558318216365, "grad_norm": 96.19264221191406, "learning_rate": 9.989165023795343e-06, "loss": 11.414, "step": 6250 }, { "epoch": 0.2778193841152551, "grad_norm": 127.19998931884766, "learning_rate": 9.989147687833414e-06, "loss": 11.9031, "step": 6260 }, { "epoch": 0.2782631850483466, "grad_norm": 108.23567199707031, "learning_rate": 9.989130351871487e-06, "loss": 11.9478, "step": 6270 }, { "epoch": 0.278706985981438, "grad_norm": 96.11833190917969, "learning_rate": 9.98911301590956e-06, "loss": 12.053, "step": 6280 }, { "epoch": 0.2791507869145295, "grad_norm": 89.6744613647461, "learning_rate": 9.989095679947633e-06, "loss": 11.7384, "step": 6290 }, { "epoch": 0.2795945878476209, "grad_norm": 88.66259002685547, "learning_rate": 9.989078343985705e-06, "loss": 11.6472, "step": 6300 }, { "epoch": 0.2800383887807124, "grad_norm": 112.74403381347656, "learning_rate": 9.989061008023778e-06, "loss": 11.8507, "step": 6310 }, { "epoch": 0.2804821897138039, "grad_norm": 100.31635284423828, "learning_rate": 9.98904367206185e-06, "loss": 11.3632, "step": 6320 }, { "epoch": 0.28092599064689533, "grad_norm": 92.06365966796875, "learning_rate": 9.989026336099922e-06, "loss": 11.9611, "step": 6330 }, { "epoch": 0.2813697915799868, "grad_norm": 106.90202331542969, "learning_rate": 9.989009000137995e-06, "loss": 11.9601, "step": 6340 }, { "epoch": 0.28181359251307825, "grad_norm": 93.49620056152344, "learning_rate": 9.988991664176068e-06, "loss": 12.0237, "step": 6350 }, { "epoch": 0.28225739344616974, "grad_norm": 101.77459716796875, "learning_rate": 9.98897432821414e-06, "loss": 11.6238, "step": 6360 }, { "epoch": 0.28270119437926117, "grad_norm": 94.77214813232422, "learning_rate": 9.988956992252213e-06, "loss": 11.6693, "step": 6370 }, { "epoch": 0.28314499531235265, "grad_norm": 110.98194885253906, "learning_rate": 9.988939656290286e-06, "loss": 11.8191, "step": 6380 }, { "epoch": 0.2835887962454441, "grad_norm": 96.43860626220703, "learning_rate": 9.988922320328357e-06, "loss": 11.2866, "step": 6390 }, { "epoch": 0.2840325971785356, "grad_norm": 125.56005096435547, "learning_rate": 9.98890498436643e-06, "loss": 12.1676, "step": 6400 }, { "epoch": 0.284476398111627, "grad_norm": 122.20761108398438, "learning_rate": 9.988887648404503e-06, "loss": 12.474, "step": 6410 }, { "epoch": 0.2849201990447185, "grad_norm": 101.35332489013672, "learning_rate": 9.988870312442576e-06, "loss": 11.733, "step": 6420 }, { "epoch": 0.28536399997781, "grad_norm": 97.87244415283203, "learning_rate": 9.988852976480648e-06, "loss": 11.9138, "step": 6430 }, { "epoch": 0.2858078009109014, "grad_norm": 106.80064392089844, "learning_rate": 9.988835640518721e-06, "loss": 12.4942, "step": 6440 }, { "epoch": 0.2862516018439929, "grad_norm": 106.28848266601562, "learning_rate": 9.988818304556794e-06, "loss": 11.9158, "step": 6450 }, { "epoch": 0.28669540277708433, "grad_norm": 100.0877914428711, "learning_rate": 9.988800968594865e-06, "loss": 11.2765, "step": 6460 }, { "epoch": 0.2871392037101758, "grad_norm": 100.36260223388672, "learning_rate": 9.988783632632938e-06, "loss": 11.7645, "step": 6470 }, { "epoch": 0.28758300464326725, "grad_norm": 102.455078125, "learning_rate": 9.988766296671012e-06, "loss": 11.5561, "step": 6480 }, { "epoch": 0.28802680557635874, "grad_norm": 91.07699584960938, "learning_rate": 9.988748960709083e-06, "loss": 11.8894, "step": 6490 }, { "epoch": 0.28847060650945017, "grad_norm": 96.68830108642578, "learning_rate": 9.988731624747156e-06, "loss": 11.7245, "step": 6500 }, { "epoch": 0.28891440744254165, "grad_norm": 86.85298919677734, "learning_rate": 9.988714288785229e-06, "loss": 11.685, "step": 6510 }, { "epoch": 0.2893582083756331, "grad_norm": 100.70494842529297, "learning_rate": 9.9886969528233e-06, "loss": 11.6402, "step": 6520 }, { "epoch": 0.2898020093087246, "grad_norm": 101.03349304199219, "learning_rate": 9.988679616861374e-06, "loss": 12.2626, "step": 6530 }, { "epoch": 0.29024581024181606, "grad_norm": 91.4577865600586, "learning_rate": 9.988662280899447e-06, "loss": 11.8911, "step": 6540 }, { "epoch": 0.2906896111749075, "grad_norm": 97.14340209960938, "learning_rate": 9.98864494493752e-06, "loss": 11.6866, "step": 6550 }, { "epoch": 0.291133412107999, "grad_norm": 94.9050064086914, "learning_rate": 9.988627608975591e-06, "loss": 11.713, "step": 6560 }, { "epoch": 0.2915772130410904, "grad_norm": 94.01126861572266, "learning_rate": 9.988610273013664e-06, "loss": 11.6159, "step": 6570 }, { "epoch": 0.2920210139741819, "grad_norm": 89.02586364746094, "learning_rate": 9.988592937051737e-06, "loss": 11.6739, "step": 6580 }, { "epoch": 0.29246481490727333, "grad_norm": 126.11679077148438, "learning_rate": 9.988575601089809e-06, "loss": 11.7169, "step": 6590 }, { "epoch": 0.2929086158403648, "grad_norm": 91.34590148925781, "learning_rate": 9.988558265127882e-06, "loss": 11.5195, "step": 6600 }, { "epoch": 0.29335241677345625, "grad_norm": 108.34649658203125, "learning_rate": 9.988540929165955e-06, "loss": 11.6583, "step": 6610 }, { "epoch": 0.29379621770654774, "grad_norm": 111.23458099365234, "learning_rate": 9.988523593204026e-06, "loss": 11.5678, "step": 6620 }, { "epoch": 0.29424001863963917, "grad_norm": 92.88959503173828, "learning_rate": 9.9885062572421e-06, "loss": 12.0539, "step": 6630 }, { "epoch": 0.29468381957273065, "grad_norm": 119.5103530883789, "learning_rate": 9.988488921280172e-06, "loss": 11.7499, "step": 6640 }, { "epoch": 0.2951276205058221, "grad_norm": 107.3758544921875, "learning_rate": 9.988471585318244e-06, "loss": 11.3844, "step": 6650 }, { "epoch": 0.2955714214389136, "grad_norm": 103.09275817871094, "learning_rate": 9.988454249356317e-06, "loss": 11.3347, "step": 6660 }, { "epoch": 0.29601522237200506, "grad_norm": 104.75767517089844, "learning_rate": 9.98843691339439e-06, "loss": 12.1328, "step": 6670 }, { "epoch": 0.2964590233050965, "grad_norm": 105.1933364868164, "learning_rate": 9.988419577432463e-06, "loss": 12.1966, "step": 6680 }, { "epoch": 0.296902824238188, "grad_norm": 95.23583221435547, "learning_rate": 9.988402241470534e-06, "loss": 11.925, "step": 6690 }, { "epoch": 0.2973466251712794, "grad_norm": 93.23466491699219, "learning_rate": 9.988384905508607e-06, "loss": 11.9893, "step": 6700 }, { "epoch": 0.2977904261043709, "grad_norm": 96.76582336425781, "learning_rate": 9.98836756954668e-06, "loss": 11.4291, "step": 6710 }, { "epoch": 0.29823422703746233, "grad_norm": 95.39541625976562, "learning_rate": 9.988350233584752e-06, "loss": 11.6534, "step": 6720 }, { "epoch": 0.2986780279705538, "grad_norm": 111.02050018310547, "learning_rate": 9.988332897622825e-06, "loss": 11.215, "step": 6730 }, { "epoch": 0.29912182890364525, "grad_norm": 93.41612243652344, "learning_rate": 9.988315561660898e-06, "loss": 11.8045, "step": 6740 }, { "epoch": 0.29956562983673674, "grad_norm": 103.25040435791016, "learning_rate": 9.98829822569897e-06, "loss": 11.7352, "step": 6750 }, { "epoch": 0.30000943076982817, "grad_norm": 110.95781707763672, "learning_rate": 9.988280889737043e-06, "loss": 11.9544, "step": 6760 }, { "epoch": 0.30045323170291965, "grad_norm": 90.73185729980469, "learning_rate": 9.988263553775116e-06, "loss": 11.5593, "step": 6770 }, { "epoch": 0.30089703263601114, "grad_norm": 90.01508331298828, "learning_rate": 9.988246217813187e-06, "loss": 11.1305, "step": 6780 }, { "epoch": 0.3013408335691026, "grad_norm": 99.30635070800781, "learning_rate": 9.98822888185126e-06, "loss": 11.1892, "step": 6790 }, { "epoch": 0.30178463450219406, "grad_norm": 112.32921600341797, "learning_rate": 9.988211545889333e-06, "loss": 11.9221, "step": 6800 }, { "epoch": 0.3022284354352855, "grad_norm": 97.16471099853516, "learning_rate": 9.988194209927405e-06, "loss": 11.8916, "step": 6810 }, { "epoch": 0.302672236368377, "grad_norm": 93.2406234741211, "learning_rate": 9.988176873965478e-06, "loss": 11.4169, "step": 6820 }, { "epoch": 0.3031160373014684, "grad_norm": 111.53374481201172, "learning_rate": 9.98815953800355e-06, "loss": 11.1438, "step": 6830 }, { "epoch": 0.3035598382345599, "grad_norm": 94.99114990234375, "learning_rate": 9.988142202041622e-06, "loss": 12.0838, "step": 6840 }, { "epoch": 0.30400363916765133, "grad_norm": 117.2374496459961, "learning_rate": 9.988124866079695e-06, "loss": 11.4959, "step": 6850 }, { "epoch": 0.3044474401007428, "grad_norm": 97.72772216796875, "learning_rate": 9.988107530117768e-06, "loss": 11.8559, "step": 6860 }, { "epoch": 0.30489124103383425, "grad_norm": 105.31409454345703, "learning_rate": 9.98809019415584e-06, "loss": 11.537, "step": 6870 }, { "epoch": 0.30533504196692574, "grad_norm": 125.29707336425781, "learning_rate": 9.988072858193913e-06, "loss": 11.4942, "step": 6880 }, { "epoch": 0.3057788429000172, "grad_norm": 101.15106201171875, "learning_rate": 9.988055522231986e-06, "loss": 11.3884, "step": 6890 }, { "epoch": 0.30622264383310865, "grad_norm": 97.56317138671875, "learning_rate": 9.988038186270059e-06, "loss": 11.5489, "step": 6900 }, { "epoch": 0.30666644476620014, "grad_norm": 112.57002258300781, "learning_rate": 9.98802085030813e-06, "loss": 11.3843, "step": 6910 }, { "epoch": 0.3071102456992916, "grad_norm": 100.37068176269531, "learning_rate": 9.988003514346203e-06, "loss": 11.4439, "step": 6920 }, { "epoch": 0.30755404663238306, "grad_norm": 119.5300521850586, "learning_rate": 9.987986178384276e-06, "loss": 11.4964, "step": 6930 }, { "epoch": 0.3079978475654745, "grad_norm": 88.61663055419922, "learning_rate": 9.987968842422348e-06, "loss": 11.6089, "step": 6940 }, { "epoch": 0.308441648498566, "grad_norm": 101.71082305908203, "learning_rate": 9.98795150646042e-06, "loss": 11.5646, "step": 6950 }, { "epoch": 0.3088854494316574, "grad_norm": 111.21810913085938, "learning_rate": 9.987934170498494e-06, "loss": 11.2611, "step": 6960 }, { "epoch": 0.3093292503647489, "grad_norm": 110.25248718261719, "learning_rate": 9.987916834536565e-06, "loss": 11.5723, "step": 6970 }, { "epoch": 0.30977305129784033, "grad_norm": 104.40763092041016, "learning_rate": 9.987899498574638e-06, "loss": 12.0642, "step": 6980 }, { "epoch": 0.3102168522309318, "grad_norm": 89.2273941040039, "learning_rate": 9.987882162612711e-06, "loss": 11.4713, "step": 6990 }, { "epoch": 0.3106606531640233, "grad_norm": 85.18675994873047, "learning_rate": 9.987864826650783e-06, "loss": 11.8243, "step": 7000 }, { "epoch": 0.3106606531640233, "eval_loss": 0.36315852403640747, "eval_runtime": 674.7882, "eval_samples_per_second": 1799.663, "eval_steps_per_second": 56.24, "step": 7000 }, { "epoch": 0.31110445409711474, "grad_norm": 123.10943603515625, "learning_rate": 9.987847490688856e-06, "loss": 11.6931, "step": 7010 }, { "epoch": 0.3115482550302062, "grad_norm": 130.8447723388672, "learning_rate": 9.987830154726929e-06, "loss": 11.5164, "step": 7020 }, { "epoch": 0.31199205596329765, "grad_norm": 101.97557830810547, "learning_rate": 9.987812818765e-06, "loss": 11.8152, "step": 7030 }, { "epoch": 0.31243585689638914, "grad_norm": 93.44794464111328, "learning_rate": 9.987795482803073e-06, "loss": 11.6298, "step": 7040 }, { "epoch": 0.3128796578294806, "grad_norm": 97.57392120361328, "learning_rate": 9.987778146841147e-06, "loss": 11.3152, "step": 7050 }, { "epoch": 0.31332345876257206, "grad_norm": 102.27783203125, "learning_rate": 9.987760810879218e-06, "loss": 10.9668, "step": 7060 }, { "epoch": 0.3137672596956635, "grad_norm": 108.09846496582031, "learning_rate": 9.987743474917291e-06, "loss": 11.4038, "step": 7070 }, { "epoch": 0.314211060628755, "grad_norm": 90.89788055419922, "learning_rate": 9.987726138955364e-06, "loss": 11.3717, "step": 7080 }, { "epoch": 0.3146548615618464, "grad_norm": 115.7634506225586, "learning_rate": 9.987708802993435e-06, "loss": 11.682, "step": 7090 }, { "epoch": 0.3150986624949379, "grad_norm": 109.29789733886719, "learning_rate": 9.987691467031509e-06, "loss": 11.7889, "step": 7100 }, { "epoch": 0.3155424634280294, "grad_norm": 95.27775573730469, "learning_rate": 9.987674131069582e-06, "loss": 11.2461, "step": 7110 }, { "epoch": 0.3159862643611208, "grad_norm": 110.1352310180664, "learning_rate": 9.987656795107655e-06, "loss": 11.3557, "step": 7120 }, { "epoch": 0.3164300652942123, "grad_norm": 82.80999755859375, "learning_rate": 9.987639459145726e-06, "loss": 11.4224, "step": 7130 }, { "epoch": 0.31687386622730374, "grad_norm": 94.51629638671875, "learning_rate": 9.987622123183799e-06, "loss": 11.6793, "step": 7140 }, { "epoch": 0.3173176671603952, "grad_norm": 106.12570190429688, "learning_rate": 9.987604787221872e-06, "loss": 11.3187, "step": 7150 }, { "epoch": 0.31776146809348665, "grad_norm": 110.01351928710938, "learning_rate": 9.987587451259944e-06, "loss": 11.3371, "step": 7160 }, { "epoch": 0.31820526902657814, "grad_norm": 97.4388427734375, "learning_rate": 9.987570115298017e-06, "loss": 11.5326, "step": 7170 }, { "epoch": 0.3186490699596696, "grad_norm": 80.90787506103516, "learning_rate": 9.98755277933609e-06, "loss": 11.7102, "step": 7180 }, { "epoch": 0.31909287089276106, "grad_norm": 102.96366882324219, "learning_rate": 9.987535443374161e-06, "loss": 11.9188, "step": 7190 }, { "epoch": 0.3195366718258525, "grad_norm": 101.80105590820312, "learning_rate": 9.987518107412234e-06, "loss": 12.0429, "step": 7200 }, { "epoch": 0.319980472758944, "grad_norm": 90.94136047363281, "learning_rate": 9.987500771450307e-06, "loss": 10.825, "step": 7210 }, { "epoch": 0.3204242736920354, "grad_norm": 109.56967163085938, "learning_rate": 9.987483435488379e-06, "loss": 11.5279, "step": 7220 }, { "epoch": 0.3208680746251269, "grad_norm": 95.71635437011719, "learning_rate": 9.987466099526452e-06, "loss": 11.6178, "step": 7230 }, { "epoch": 0.3213118755582184, "grad_norm": 93.75989532470703, "learning_rate": 9.987448763564525e-06, "loss": 11.4102, "step": 7240 }, { "epoch": 0.3217556764913098, "grad_norm": 100.95425415039062, "learning_rate": 9.987431427602596e-06, "loss": 11.7016, "step": 7250 }, { "epoch": 0.3221994774244013, "grad_norm": 97.52702331542969, "learning_rate": 9.98741409164067e-06, "loss": 11.8841, "step": 7260 }, { "epoch": 0.32264327835749274, "grad_norm": 91.30542755126953, "learning_rate": 9.987396755678742e-06, "loss": 11.4776, "step": 7270 }, { "epoch": 0.3230870792905842, "grad_norm": 106.42786407470703, "learning_rate": 9.987379419716814e-06, "loss": 11.5718, "step": 7280 }, { "epoch": 0.32353088022367565, "grad_norm": 93.21934509277344, "learning_rate": 9.987362083754887e-06, "loss": 11.1913, "step": 7290 }, { "epoch": 0.32397468115676714, "grad_norm": 102.5538101196289, "learning_rate": 9.98734474779296e-06, "loss": 11.5974, "step": 7300 }, { "epoch": 0.3244184820898586, "grad_norm": 96.53839874267578, "learning_rate": 9.987327411831033e-06, "loss": 11.6911, "step": 7310 }, { "epoch": 0.32486228302295006, "grad_norm": 94.2166519165039, "learning_rate": 9.987310075869104e-06, "loss": 11.4638, "step": 7320 }, { "epoch": 0.3253060839560415, "grad_norm": 100.33325958251953, "learning_rate": 9.987292739907177e-06, "loss": 11.3276, "step": 7330 }, { "epoch": 0.325749884889133, "grad_norm": 94.38825988769531, "learning_rate": 9.98727540394525e-06, "loss": 11.408, "step": 7340 }, { "epoch": 0.32619368582222447, "grad_norm": 106.76789855957031, "learning_rate": 9.987258067983322e-06, "loss": 12.0534, "step": 7350 }, { "epoch": 0.3266374867553159, "grad_norm": 92.51216888427734, "learning_rate": 9.987240732021395e-06, "loss": 11.0724, "step": 7360 }, { "epoch": 0.3270812876884074, "grad_norm": 90.04621124267578, "learning_rate": 9.987223396059468e-06, "loss": 11.682, "step": 7370 }, { "epoch": 0.3275250886214988, "grad_norm": 112.20062255859375, "learning_rate": 9.98720606009754e-06, "loss": 11.4545, "step": 7380 }, { "epoch": 0.3279688895545903, "grad_norm": 98.15010833740234, "learning_rate": 9.987188724135613e-06, "loss": 11.6855, "step": 7390 }, { "epoch": 0.32841269048768174, "grad_norm": 79.66431427001953, "learning_rate": 9.987171388173686e-06, "loss": 11.2667, "step": 7400 }, { "epoch": 0.3288564914207732, "grad_norm": 103.56822204589844, "learning_rate": 9.987154052211757e-06, "loss": 11.5097, "step": 7410 }, { "epoch": 0.32930029235386465, "grad_norm": 101.66146850585938, "learning_rate": 9.98713671624983e-06, "loss": 11.2951, "step": 7420 }, { "epoch": 0.32974409328695614, "grad_norm": 95.05819702148438, "learning_rate": 9.987119380287903e-06, "loss": 11.6909, "step": 7430 }, { "epoch": 0.3301878942200476, "grad_norm": 97.97879028320312, "learning_rate": 9.987102044325975e-06, "loss": 10.9043, "step": 7440 }, { "epoch": 0.33063169515313906, "grad_norm": 139.6067352294922, "learning_rate": 9.987084708364048e-06, "loss": 11.3615, "step": 7450 }, { "epoch": 0.33107549608623055, "grad_norm": 106.51782989501953, "learning_rate": 9.98706737240212e-06, "loss": 11.3788, "step": 7460 }, { "epoch": 0.331519297019322, "grad_norm": 89.64512634277344, "learning_rate": 9.987050036440192e-06, "loss": 11.7731, "step": 7470 }, { "epoch": 0.33196309795241347, "grad_norm": 90.49308776855469, "learning_rate": 9.987032700478265e-06, "loss": 11.8977, "step": 7480 }, { "epoch": 0.3324068988855049, "grad_norm": 103.32622528076172, "learning_rate": 9.987015364516338e-06, "loss": 11.5663, "step": 7490 }, { "epoch": 0.3328506998185964, "grad_norm": 88.08988189697266, "learning_rate": 9.98699802855441e-06, "loss": 11.8236, "step": 7500 }, { "epoch": 0.3332945007516878, "grad_norm": 76.12377166748047, "learning_rate": 9.986980692592483e-06, "loss": 11.2932, "step": 7510 }, { "epoch": 0.3337383016847793, "grad_norm": 94.77151489257812, "learning_rate": 9.986963356630556e-06, "loss": 12.1823, "step": 7520 }, { "epoch": 0.33418210261787074, "grad_norm": 100.01810455322266, "learning_rate": 9.986946020668629e-06, "loss": 12.6788, "step": 7530 }, { "epoch": 0.3346259035509622, "grad_norm": 132.56448364257812, "learning_rate": 9.9869286847067e-06, "loss": 12.149, "step": 7540 }, { "epoch": 0.33506970448405365, "grad_norm": 113.610107421875, "learning_rate": 9.986911348744773e-06, "loss": 11.8811, "step": 7550 }, { "epoch": 0.33551350541714514, "grad_norm": 94.93478393554688, "learning_rate": 9.986894012782846e-06, "loss": 11.5749, "step": 7560 }, { "epoch": 0.33595730635023663, "grad_norm": 92.85311126708984, "learning_rate": 9.986876676820918e-06, "loss": 11.7571, "step": 7570 }, { "epoch": 0.33640110728332806, "grad_norm": 79.2991943359375, "learning_rate": 9.986859340858991e-06, "loss": 11.3534, "step": 7580 }, { "epoch": 0.33684490821641955, "grad_norm": 103.73065185546875, "learning_rate": 9.986842004897064e-06, "loss": 11.6249, "step": 7590 }, { "epoch": 0.337288709149511, "grad_norm": 79.18226623535156, "learning_rate": 9.986824668935135e-06, "loss": 11.634, "step": 7600 }, { "epoch": 0.33773251008260247, "grad_norm": 100.51226806640625, "learning_rate": 9.986807332973208e-06, "loss": 11.2835, "step": 7610 }, { "epoch": 0.3381763110156939, "grad_norm": 92.74190521240234, "learning_rate": 9.986789997011281e-06, "loss": 11.3273, "step": 7620 }, { "epoch": 0.3386201119487854, "grad_norm": 89.3246841430664, "learning_rate": 9.986772661049353e-06, "loss": 12.6279, "step": 7630 }, { "epoch": 0.3390639128818768, "grad_norm": 117.41140747070312, "learning_rate": 9.986755325087426e-06, "loss": 11.9215, "step": 7640 }, { "epoch": 0.3395077138149683, "grad_norm": 105.69800567626953, "learning_rate": 9.986737989125499e-06, "loss": 11.4488, "step": 7650 }, { "epoch": 0.33995151474805974, "grad_norm": 99.95671844482422, "learning_rate": 9.98672065316357e-06, "loss": 11.5145, "step": 7660 }, { "epoch": 0.3403953156811512, "grad_norm": 85.2020263671875, "learning_rate": 9.986703317201643e-06, "loss": 11.0807, "step": 7670 }, { "epoch": 0.34083911661424265, "grad_norm": 92.34159088134766, "learning_rate": 9.986685981239717e-06, "loss": 11.5631, "step": 7680 }, { "epoch": 0.34128291754733414, "grad_norm": 89.49488067626953, "learning_rate": 9.986668645277788e-06, "loss": 11.5445, "step": 7690 }, { "epoch": 0.34172671848042563, "grad_norm": 96.70406341552734, "learning_rate": 9.986651309315861e-06, "loss": 11.5824, "step": 7700 }, { "epoch": 0.34217051941351706, "grad_norm": 87.1051254272461, "learning_rate": 9.986633973353934e-06, "loss": 11.9654, "step": 7710 }, { "epoch": 0.34261432034660855, "grad_norm": 94.70158386230469, "learning_rate": 9.986616637392005e-06, "loss": 10.8494, "step": 7720 }, { "epoch": 0.3430581212797, "grad_norm": 110.57974243164062, "learning_rate": 9.986599301430079e-06, "loss": 11.244, "step": 7730 }, { "epoch": 0.34350192221279147, "grad_norm": 87.20235443115234, "learning_rate": 9.986581965468152e-06, "loss": 11.3683, "step": 7740 }, { "epoch": 0.3439457231458829, "grad_norm": 89.35726165771484, "learning_rate": 9.986564629506225e-06, "loss": 11.1392, "step": 7750 }, { "epoch": 0.3443895240789744, "grad_norm": 97.75891876220703, "learning_rate": 9.986547293544296e-06, "loss": 11.761, "step": 7760 }, { "epoch": 0.3448333250120658, "grad_norm": 84.02690887451172, "learning_rate": 9.98652995758237e-06, "loss": 11.2544, "step": 7770 }, { "epoch": 0.3452771259451573, "grad_norm": 89.77345275878906, "learning_rate": 9.986512621620442e-06, "loss": 12.2648, "step": 7780 }, { "epoch": 0.34572092687824874, "grad_norm": 96.23056030273438, "learning_rate": 9.986495285658514e-06, "loss": 11.7363, "step": 7790 }, { "epoch": 0.3461647278113402, "grad_norm": 83.2893295288086, "learning_rate": 9.986477949696587e-06, "loss": 11.6393, "step": 7800 }, { "epoch": 0.3466085287444317, "grad_norm": 88.02169036865234, "learning_rate": 9.98646061373466e-06, "loss": 11.9822, "step": 7810 }, { "epoch": 0.34705232967752314, "grad_norm": 90.59603881835938, "learning_rate": 9.986443277772731e-06, "loss": 11.1564, "step": 7820 }, { "epoch": 0.34749613061061463, "grad_norm": 80.67443084716797, "learning_rate": 9.986425941810804e-06, "loss": 11.7602, "step": 7830 }, { "epoch": 0.34793993154370606, "grad_norm": 110.11127471923828, "learning_rate": 9.986408605848877e-06, "loss": 11.5633, "step": 7840 }, { "epoch": 0.34838373247679755, "grad_norm": 93.8271255493164, "learning_rate": 9.986391269886949e-06, "loss": 11.7892, "step": 7850 }, { "epoch": 0.348827533409889, "grad_norm": 108.33939361572266, "learning_rate": 9.986373933925022e-06, "loss": 11.5551, "step": 7860 }, { "epoch": 0.34927133434298047, "grad_norm": 92.38509368896484, "learning_rate": 9.986356597963095e-06, "loss": 11.3955, "step": 7870 }, { "epoch": 0.3497151352760719, "grad_norm": 83.159423828125, "learning_rate": 9.986339262001166e-06, "loss": 11.7543, "step": 7880 }, { "epoch": 0.3501589362091634, "grad_norm": 90.84649658203125, "learning_rate": 9.98632192603924e-06, "loss": 11.3032, "step": 7890 }, { "epoch": 0.3506027371422548, "grad_norm": 98.68833923339844, "learning_rate": 9.986304590077312e-06, "loss": 11.6455, "step": 7900 }, { "epoch": 0.3510465380753463, "grad_norm": 96.10926055908203, "learning_rate": 9.986287254115384e-06, "loss": 11.6115, "step": 7910 }, { "epoch": 0.3514903390084378, "grad_norm": 108.88908386230469, "learning_rate": 9.986269918153457e-06, "loss": 11.0329, "step": 7920 }, { "epoch": 0.3519341399415292, "grad_norm": 85.24256896972656, "learning_rate": 9.98625258219153e-06, "loss": 11.3518, "step": 7930 }, { "epoch": 0.3523779408746207, "grad_norm": 100.0363540649414, "learning_rate": 9.986235246229601e-06, "loss": 11.4287, "step": 7940 }, { "epoch": 0.35282174180771214, "grad_norm": 101.91362762451172, "learning_rate": 9.986217910267674e-06, "loss": 10.8322, "step": 7950 }, { "epoch": 0.35326554274080363, "grad_norm": 78.89401245117188, "learning_rate": 9.986200574305748e-06, "loss": 11.2172, "step": 7960 }, { "epoch": 0.35370934367389506, "grad_norm": 99.1776123046875, "learning_rate": 9.98618323834382e-06, "loss": 11.6107, "step": 7970 }, { "epoch": 0.35415314460698655, "grad_norm": 88.57538604736328, "learning_rate": 9.986165902381892e-06, "loss": 11.4944, "step": 7980 }, { "epoch": 0.354596945540078, "grad_norm": 91.77239227294922, "learning_rate": 9.986148566419965e-06, "loss": 11.4736, "step": 7990 }, { "epoch": 0.35504074647316947, "grad_norm": 118.60701751708984, "learning_rate": 9.986131230458038e-06, "loss": 11.3651, "step": 8000 }, { "epoch": 0.35504074647316947, "eval_loss": 0.35784557461738586, "eval_runtime": 673.1581, "eval_samples_per_second": 1804.021, "eval_steps_per_second": 56.376, "step": 8000 }, { "epoch": 0.3554845474062609, "grad_norm": 78.45508575439453, "learning_rate": 9.98611389449611e-06, "loss": 11.5346, "step": 8010 }, { "epoch": 0.3559283483393524, "grad_norm": 93.72156524658203, "learning_rate": 9.986096558534183e-06, "loss": 11.7047, "step": 8020 }, { "epoch": 0.35637214927244387, "grad_norm": 108.60897064208984, "learning_rate": 9.986079222572256e-06, "loss": 11.5851, "step": 8030 }, { "epoch": 0.3568159502055353, "grad_norm": 98.00389862060547, "learning_rate": 9.986061886610327e-06, "loss": 11.8359, "step": 8040 }, { "epoch": 0.3572597511386268, "grad_norm": 96.44226837158203, "learning_rate": 9.9860445506484e-06, "loss": 11.6475, "step": 8050 }, { "epoch": 0.3577035520717182, "grad_norm": 80.3302993774414, "learning_rate": 9.986027214686473e-06, "loss": 11.3945, "step": 8060 }, { "epoch": 0.3581473530048097, "grad_norm": 96.11526489257812, "learning_rate": 9.986009878724545e-06, "loss": 11.0479, "step": 8070 }, { "epoch": 0.35859115393790114, "grad_norm": 97.33485412597656, "learning_rate": 9.985992542762618e-06, "loss": 11.8224, "step": 8080 }, { "epoch": 0.35903495487099263, "grad_norm": 82.29984283447266, "learning_rate": 9.98597520680069e-06, "loss": 11.5318, "step": 8090 }, { "epoch": 0.35947875580408406, "grad_norm": 93.84577941894531, "learning_rate": 9.985957870838762e-06, "loss": 11.4744, "step": 8100 }, { "epoch": 0.35992255673717555, "grad_norm": 84.21646881103516, "learning_rate": 9.985940534876835e-06, "loss": 10.9926, "step": 8110 }, { "epoch": 0.360366357670267, "grad_norm": 83.08773040771484, "learning_rate": 9.985923198914908e-06, "loss": 10.9091, "step": 8120 }, { "epoch": 0.36081015860335847, "grad_norm": 79.4489974975586, "learning_rate": 9.98590586295298e-06, "loss": 10.9493, "step": 8130 }, { "epoch": 0.3612539595364499, "grad_norm": 85.71393585205078, "learning_rate": 9.985888526991053e-06, "loss": 11.4266, "step": 8140 }, { "epoch": 0.3616977604695414, "grad_norm": 81.63021087646484, "learning_rate": 9.985871191029126e-06, "loss": 11.6757, "step": 8150 }, { "epoch": 0.36214156140263287, "grad_norm": 91.55906677246094, "learning_rate": 9.985853855067197e-06, "loss": 11.6191, "step": 8160 }, { "epoch": 0.3625853623357243, "grad_norm": 80.61488342285156, "learning_rate": 9.98583651910527e-06, "loss": 11.5607, "step": 8170 }, { "epoch": 0.3630291632688158, "grad_norm": 100.6302261352539, "learning_rate": 9.985819183143343e-06, "loss": 11.6338, "step": 8180 }, { "epoch": 0.3634729642019072, "grad_norm": 98.94048309326172, "learning_rate": 9.985801847181416e-06, "loss": 11.8984, "step": 8190 }, { "epoch": 0.3639167651349987, "grad_norm": 94.3434066772461, "learning_rate": 9.985784511219488e-06, "loss": 11.4629, "step": 8200 }, { "epoch": 0.36436056606809014, "grad_norm": 117.29963684082031, "learning_rate": 9.985767175257561e-06, "loss": 11.5717, "step": 8210 }, { "epoch": 0.36480436700118163, "grad_norm": 96.46138763427734, "learning_rate": 9.985749839295634e-06, "loss": 11.2935, "step": 8220 }, { "epoch": 0.36524816793427306, "grad_norm": 88.48851776123047, "learning_rate": 9.985732503333705e-06, "loss": 11.4625, "step": 8230 }, { "epoch": 0.36569196886736455, "grad_norm": 90.18971252441406, "learning_rate": 9.985715167371778e-06, "loss": 11.2449, "step": 8240 }, { "epoch": 0.366135769800456, "grad_norm": 87.7426986694336, "learning_rate": 9.985697831409852e-06, "loss": 11.3782, "step": 8250 }, { "epoch": 0.36657957073354747, "grad_norm": 97.00252532958984, "learning_rate": 9.985680495447923e-06, "loss": 11.3337, "step": 8260 }, { "epoch": 0.36702337166663895, "grad_norm": 109.61273193359375, "learning_rate": 9.985663159485996e-06, "loss": 11.0625, "step": 8270 }, { "epoch": 0.3674671725997304, "grad_norm": 86.43873596191406, "learning_rate": 9.985645823524069e-06, "loss": 10.7205, "step": 8280 }, { "epoch": 0.36791097353282187, "grad_norm": 110.65450286865234, "learning_rate": 9.98562848756214e-06, "loss": 11.3779, "step": 8290 }, { "epoch": 0.3683547744659133, "grad_norm": 97.357421875, "learning_rate": 9.985611151600214e-06, "loss": 11.049, "step": 8300 }, { "epoch": 0.3687985753990048, "grad_norm": 72.67398834228516, "learning_rate": 9.985593815638287e-06, "loss": 10.9327, "step": 8310 }, { "epoch": 0.3692423763320962, "grad_norm": 80.5442886352539, "learning_rate": 9.985576479676358e-06, "loss": 11.3067, "step": 8320 }, { "epoch": 0.3696861772651877, "grad_norm": 104.49150848388672, "learning_rate": 9.985559143714431e-06, "loss": 11.7007, "step": 8330 }, { "epoch": 0.37012997819827914, "grad_norm": 93.68840789794922, "learning_rate": 9.985541807752504e-06, "loss": 12.1018, "step": 8340 }, { "epoch": 0.37057377913137063, "grad_norm": 92.62474060058594, "learning_rate": 9.985524471790576e-06, "loss": 11.2158, "step": 8350 }, { "epoch": 0.37101758006446206, "grad_norm": 94.18134307861328, "learning_rate": 9.985507135828649e-06, "loss": 11.551, "step": 8360 }, { "epoch": 0.37146138099755355, "grad_norm": 97.85765838623047, "learning_rate": 9.985489799866722e-06, "loss": 11.6042, "step": 8370 }, { "epoch": 0.37190518193064503, "grad_norm": 88.52871704101562, "learning_rate": 9.985472463904795e-06, "loss": 11.8907, "step": 8380 }, { "epoch": 0.37234898286373647, "grad_norm": 95.93720245361328, "learning_rate": 9.985455127942866e-06, "loss": 11.1994, "step": 8390 }, { "epoch": 0.37279278379682795, "grad_norm": 73.47252655029297, "learning_rate": 9.98543779198094e-06, "loss": 11.1229, "step": 8400 }, { "epoch": 0.3732365847299194, "grad_norm": 87.63044738769531, "learning_rate": 9.985420456019012e-06, "loss": 11.2802, "step": 8410 }, { "epoch": 0.37368038566301087, "grad_norm": 85.62527465820312, "learning_rate": 9.985403120057084e-06, "loss": 11.4917, "step": 8420 }, { "epoch": 0.3741241865961023, "grad_norm": 84.97439575195312, "learning_rate": 9.985385784095157e-06, "loss": 11.4111, "step": 8430 }, { "epoch": 0.3745679875291938, "grad_norm": 91.50364685058594, "learning_rate": 9.98536844813323e-06, "loss": 11.4548, "step": 8440 }, { "epoch": 0.3750117884622852, "grad_norm": 91.25043487548828, "learning_rate": 9.985351112171301e-06, "loss": 11.793, "step": 8450 }, { "epoch": 0.3754555893953767, "grad_norm": 93.69058227539062, "learning_rate": 9.985333776209374e-06, "loss": 11.0371, "step": 8460 }, { "epoch": 0.37589939032846814, "grad_norm": 89.45205688476562, "learning_rate": 9.985316440247447e-06, "loss": 11.3039, "step": 8470 }, { "epoch": 0.37634319126155963, "grad_norm": 97.13536071777344, "learning_rate": 9.98529910428552e-06, "loss": 11.1143, "step": 8480 }, { "epoch": 0.3767869921946511, "grad_norm": 95.88386535644531, "learning_rate": 9.985281768323592e-06, "loss": 10.9521, "step": 8490 }, { "epoch": 0.37723079312774255, "grad_norm": 107.3424072265625, "learning_rate": 9.985264432361665e-06, "loss": 11.4563, "step": 8500 }, { "epoch": 0.37767459406083403, "grad_norm": 78.75535583496094, "learning_rate": 9.985247096399738e-06, "loss": 11.6325, "step": 8510 }, { "epoch": 0.37811839499392547, "grad_norm": 93.4799575805664, "learning_rate": 9.98522976043781e-06, "loss": 11.5845, "step": 8520 }, { "epoch": 0.37856219592701695, "grad_norm": 82.9742202758789, "learning_rate": 9.985212424475882e-06, "loss": 10.9067, "step": 8530 }, { "epoch": 0.3790059968601084, "grad_norm": 86.02015686035156, "learning_rate": 9.985195088513956e-06, "loss": 11.7097, "step": 8540 }, { "epoch": 0.37944979779319987, "grad_norm": 78.73582458496094, "learning_rate": 9.985177752552027e-06, "loss": 11.3756, "step": 8550 }, { "epoch": 0.3798935987262913, "grad_norm": 86.17765808105469, "learning_rate": 9.9851604165901e-06, "loss": 11.1242, "step": 8560 }, { "epoch": 0.3803373996593828, "grad_norm": 103.56576538085938, "learning_rate": 9.985143080628173e-06, "loss": 11.5288, "step": 8570 }, { "epoch": 0.3807812005924742, "grad_norm": 96.16366577148438, "learning_rate": 9.985125744666244e-06, "loss": 11.3524, "step": 8580 }, { "epoch": 0.3812250015255657, "grad_norm": 79.89984893798828, "learning_rate": 9.985108408704318e-06, "loss": 11.2257, "step": 8590 }, { "epoch": 0.3816688024586572, "grad_norm": 91.93770599365234, "learning_rate": 9.98509107274239e-06, "loss": 11.5471, "step": 8600 }, { "epoch": 0.38211260339174863, "grad_norm": 87.27505493164062, "learning_rate": 9.985073736780464e-06, "loss": 11.2449, "step": 8610 }, { "epoch": 0.3825564043248401, "grad_norm": 93.93415069580078, "learning_rate": 9.985056400818535e-06, "loss": 11.0035, "step": 8620 }, { "epoch": 0.38300020525793155, "grad_norm": 88.42649841308594, "learning_rate": 9.985039064856608e-06, "loss": 11.3949, "step": 8630 }, { "epoch": 0.38344400619102303, "grad_norm": 87.21992492675781, "learning_rate": 9.985021728894681e-06, "loss": 11.419, "step": 8640 }, { "epoch": 0.38388780712411447, "grad_norm": 96.35975646972656, "learning_rate": 9.985004392932753e-06, "loss": 10.8626, "step": 8650 }, { "epoch": 0.38433160805720595, "grad_norm": 90.79749298095703, "learning_rate": 9.984987056970826e-06, "loss": 11.9004, "step": 8660 }, { "epoch": 0.3847754089902974, "grad_norm": 88.30585479736328, "learning_rate": 9.984969721008899e-06, "loss": 11.8872, "step": 8670 }, { "epoch": 0.38521920992338887, "grad_norm": 68.6938247680664, "learning_rate": 9.98495238504697e-06, "loss": 11.1578, "step": 8680 }, { "epoch": 0.3856630108564803, "grad_norm": 87.89897918701172, "learning_rate": 9.984935049085043e-06, "loss": 11.0376, "step": 8690 }, { "epoch": 0.3861068117895718, "grad_norm": 103.39437103271484, "learning_rate": 9.984917713123116e-06, "loss": 11.6238, "step": 8700 }, { "epoch": 0.3865506127226632, "grad_norm": 82.58814239501953, "learning_rate": 9.984900377161188e-06, "loss": 11.2413, "step": 8710 }, { "epoch": 0.3869944136557547, "grad_norm": 73.86261749267578, "learning_rate": 9.98488304119926e-06, "loss": 10.9585, "step": 8720 }, { "epoch": 0.3874382145888462, "grad_norm": 100.28836822509766, "learning_rate": 9.984865705237334e-06, "loss": 10.9422, "step": 8730 }, { "epoch": 0.38788201552193763, "grad_norm": 88.46509552001953, "learning_rate": 9.984848369275407e-06, "loss": 11.1186, "step": 8740 }, { "epoch": 0.3883258164550291, "grad_norm": 90.18559265136719, "learning_rate": 9.984831033313478e-06, "loss": 11.438, "step": 8750 }, { "epoch": 0.38876961738812055, "grad_norm": 99.76158142089844, "learning_rate": 9.984813697351551e-06, "loss": 11.6788, "step": 8760 }, { "epoch": 0.38921341832121203, "grad_norm": 78.58843994140625, "learning_rate": 9.984796361389624e-06, "loss": 11.2294, "step": 8770 }, { "epoch": 0.38965721925430347, "grad_norm": 117.36835479736328, "learning_rate": 9.984779025427696e-06, "loss": 11.3633, "step": 8780 }, { "epoch": 0.39010102018739495, "grad_norm": 81.84542846679688, "learning_rate": 9.984761689465769e-06, "loss": 11.9371, "step": 8790 }, { "epoch": 0.3905448211204864, "grad_norm": 84.65067291259766, "learning_rate": 9.984744353503842e-06, "loss": 11.3477, "step": 8800 }, { "epoch": 0.3909886220535779, "grad_norm": 86.41151428222656, "learning_rate": 9.984727017541913e-06, "loss": 11.537, "step": 8810 }, { "epoch": 0.3914324229866693, "grad_norm": 85.5174331665039, "learning_rate": 9.984709681579986e-06, "loss": 11.5844, "step": 8820 }, { "epoch": 0.3918762239197608, "grad_norm": 103.24414825439453, "learning_rate": 9.98469234561806e-06, "loss": 10.8962, "step": 8830 }, { "epoch": 0.3923200248528523, "grad_norm": 101.08570098876953, "learning_rate": 9.984675009656131e-06, "loss": 11.1054, "step": 8840 }, { "epoch": 0.3927638257859437, "grad_norm": 89.28972625732422, "learning_rate": 9.984657673694204e-06, "loss": 10.9952, "step": 8850 }, { "epoch": 0.3932076267190352, "grad_norm": 97.5589370727539, "learning_rate": 9.984640337732277e-06, "loss": 11.2572, "step": 8860 }, { "epoch": 0.39365142765212663, "grad_norm": 85.52608489990234, "learning_rate": 9.98462300177035e-06, "loss": 11.2462, "step": 8870 }, { "epoch": 0.3940952285852181, "grad_norm": 88.88489532470703, "learning_rate": 9.984605665808422e-06, "loss": 11.1294, "step": 8880 }, { "epoch": 0.39453902951830955, "grad_norm": 98.8681411743164, "learning_rate": 9.984588329846495e-06, "loss": 11.6495, "step": 8890 }, { "epoch": 0.39498283045140103, "grad_norm": 91.84007263183594, "learning_rate": 9.984570993884568e-06, "loss": 11.3472, "step": 8900 }, { "epoch": 0.39542663138449247, "grad_norm": 68.3472671508789, "learning_rate": 9.984553657922639e-06, "loss": 11.424, "step": 8910 }, { "epoch": 0.39587043231758395, "grad_norm": 83.57421875, "learning_rate": 9.984536321960712e-06, "loss": 11.078, "step": 8920 }, { "epoch": 0.3963142332506754, "grad_norm": 87.4074935913086, "learning_rate": 9.984518985998785e-06, "loss": 11.178, "step": 8930 }, { "epoch": 0.3967580341837669, "grad_norm": 73.35061645507812, "learning_rate": 9.984501650036857e-06, "loss": 11.0786, "step": 8940 }, { "epoch": 0.39720183511685836, "grad_norm": 79.557861328125, "learning_rate": 9.98448431407493e-06, "loss": 11.3864, "step": 8950 }, { "epoch": 0.3976456360499498, "grad_norm": 86.81566619873047, "learning_rate": 9.984466978113003e-06, "loss": 10.7141, "step": 8960 }, { "epoch": 0.3980894369830413, "grad_norm": 86.90424346923828, "learning_rate": 9.984449642151074e-06, "loss": 10.7673, "step": 8970 }, { "epoch": 0.3985332379161327, "grad_norm": 92.93916320800781, "learning_rate": 9.984432306189147e-06, "loss": 11.2373, "step": 8980 }, { "epoch": 0.3989770388492242, "grad_norm": 96.02029418945312, "learning_rate": 9.98441497022722e-06, "loss": 11.1749, "step": 8990 }, { "epoch": 0.39942083978231563, "grad_norm": 83.88191223144531, "learning_rate": 9.984397634265293e-06, "loss": 11.7742, "step": 9000 }, { "epoch": 0.39942083978231563, "eval_loss": 0.35243040323257446, "eval_runtime": 674.8358, "eval_samples_per_second": 1799.535, "eval_steps_per_second": 56.236, "step": 9000 }, { "epoch": 0.3998646407154071, "grad_norm": 73.33333587646484, "learning_rate": 9.984380298303365e-06, "loss": 10.9059, "step": 9010 }, { "epoch": 0.40030844164849855, "grad_norm": 87.8101806640625, "learning_rate": 9.984362962341438e-06, "loss": 11.2553, "step": 9020 }, { "epoch": 0.40075224258159003, "grad_norm": 79.9136734008789, "learning_rate": 9.984345626379511e-06, "loss": 11.2405, "step": 9030 }, { "epoch": 0.40119604351468147, "grad_norm": 92.52330017089844, "learning_rate": 9.984328290417582e-06, "loss": 10.9439, "step": 9040 }, { "epoch": 0.40163984444777295, "grad_norm": 92.92615509033203, "learning_rate": 9.984310954455655e-06, "loss": 11.3613, "step": 9050 }, { "epoch": 0.40208364538086444, "grad_norm": 87.73091125488281, "learning_rate": 9.984293618493729e-06, "loss": 11.2295, "step": 9060 }, { "epoch": 0.4025274463139559, "grad_norm": 94.13227081298828, "learning_rate": 9.9842762825318e-06, "loss": 11.305, "step": 9070 }, { "epoch": 0.40297124724704736, "grad_norm": 77.8934097290039, "learning_rate": 9.984258946569873e-06, "loss": 11.0494, "step": 9080 }, { "epoch": 0.4034150481801388, "grad_norm": 76.98465728759766, "learning_rate": 9.984241610607946e-06, "loss": 11.0135, "step": 9090 }, { "epoch": 0.4038588491132303, "grad_norm": 86.28607177734375, "learning_rate": 9.984224274646019e-06, "loss": 11.2476, "step": 9100 }, { "epoch": 0.4043026500463217, "grad_norm": 83.22606658935547, "learning_rate": 9.98420693868409e-06, "loss": 11.472, "step": 9110 }, { "epoch": 0.4047464509794132, "grad_norm": 87.87577819824219, "learning_rate": 9.984189602722164e-06, "loss": 11.237, "step": 9120 }, { "epoch": 0.40519025191250463, "grad_norm": 93.7770004272461, "learning_rate": 9.984172266760237e-06, "loss": 11.2706, "step": 9130 }, { "epoch": 0.4056340528455961, "grad_norm": 80.65582275390625, "learning_rate": 9.984154930798308e-06, "loss": 11.3943, "step": 9140 }, { "epoch": 0.40607785377868755, "grad_norm": 86.08631896972656, "learning_rate": 9.984137594836381e-06, "loss": 11.2704, "step": 9150 }, { "epoch": 0.40652165471177903, "grad_norm": 79.62726593017578, "learning_rate": 9.984120258874454e-06, "loss": 11.5724, "step": 9160 }, { "epoch": 0.40696545564487047, "grad_norm": 88.90939331054688, "learning_rate": 9.984102922912526e-06, "loss": 11.2739, "step": 9170 }, { "epoch": 0.40740925657796195, "grad_norm": 92.309814453125, "learning_rate": 9.984085586950599e-06, "loss": 11.535, "step": 9180 }, { "epoch": 0.40785305751105344, "grad_norm": 83.53838348388672, "learning_rate": 9.984068250988672e-06, "loss": 11.3033, "step": 9190 }, { "epoch": 0.4082968584441449, "grad_norm": 92.0191421508789, "learning_rate": 9.984050915026743e-06, "loss": 11.8525, "step": 9200 }, { "epoch": 0.40874065937723636, "grad_norm": 96.62713623046875, "learning_rate": 9.984033579064816e-06, "loss": 10.9853, "step": 9210 }, { "epoch": 0.4091844603103278, "grad_norm": 92.35746765136719, "learning_rate": 9.98401624310289e-06, "loss": 11.1736, "step": 9220 }, { "epoch": 0.4096282612434193, "grad_norm": 87.0098876953125, "learning_rate": 9.98399890714096e-06, "loss": 11.3787, "step": 9230 }, { "epoch": 0.4100720621765107, "grad_norm": 75.16019439697266, "learning_rate": 9.983981571179034e-06, "loss": 11.0318, "step": 9240 }, { "epoch": 0.4105158631096022, "grad_norm": 89.38213348388672, "learning_rate": 9.983964235217107e-06, "loss": 10.8708, "step": 9250 }, { "epoch": 0.41095966404269363, "grad_norm": 78.94715881347656, "learning_rate": 9.983946899255178e-06, "loss": 11.1103, "step": 9260 }, { "epoch": 0.4114034649757851, "grad_norm": 93.04794311523438, "learning_rate": 9.983929563293251e-06, "loss": 11.2785, "step": 9270 }, { "epoch": 0.41184726590887655, "grad_norm": 91.3328857421875, "learning_rate": 9.983912227331324e-06, "loss": 11.3683, "step": 9280 }, { "epoch": 0.41229106684196803, "grad_norm": 83.09625244140625, "learning_rate": 9.983894891369397e-06, "loss": 10.9742, "step": 9290 }, { "epoch": 0.4127348677750595, "grad_norm": 88.08326721191406, "learning_rate": 9.983877555407469e-06, "loss": 11.1625, "step": 9300 }, { "epoch": 0.41317866870815095, "grad_norm": 102.78692626953125, "learning_rate": 9.983860219445542e-06, "loss": 11.5739, "step": 9310 }, { "epoch": 0.41362246964124244, "grad_norm": 95.94892883300781, "learning_rate": 9.983842883483615e-06, "loss": 10.9203, "step": 9320 }, { "epoch": 0.4140662705743339, "grad_norm": 116.64979553222656, "learning_rate": 9.983825547521686e-06, "loss": 11.2924, "step": 9330 }, { "epoch": 0.41451007150742536, "grad_norm": 83.40705871582031, "learning_rate": 9.98380821155976e-06, "loss": 12.1134, "step": 9340 }, { "epoch": 0.4149538724405168, "grad_norm": 92.19294738769531, "learning_rate": 9.983790875597833e-06, "loss": 11.2714, "step": 9350 }, { "epoch": 0.4153976733736083, "grad_norm": 78.88662719726562, "learning_rate": 9.983773539635904e-06, "loss": 10.875, "step": 9360 }, { "epoch": 0.4158414743066997, "grad_norm": 82.31551361083984, "learning_rate": 9.983756203673977e-06, "loss": 10.9331, "step": 9370 }, { "epoch": 0.4162852752397912, "grad_norm": 92.06917572021484, "learning_rate": 9.98373886771205e-06, "loss": 11.0438, "step": 9380 }, { "epoch": 0.41672907617288263, "grad_norm": 81.9530029296875, "learning_rate": 9.983721531750121e-06, "loss": 11.4246, "step": 9390 }, { "epoch": 0.4171728771059741, "grad_norm": 88.1327896118164, "learning_rate": 9.983704195788195e-06, "loss": 11.0952, "step": 9400 }, { "epoch": 0.4176166780390656, "grad_norm": 98.14168548583984, "learning_rate": 9.983686859826268e-06, "loss": 10.9578, "step": 9410 }, { "epoch": 0.41806047897215703, "grad_norm": 98.64930725097656, "learning_rate": 9.983669523864339e-06, "loss": 12.0788, "step": 9420 }, { "epoch": 0.4185042799052485, "grad_norm": 90.62484741210938, "learning_rate": 9.983652187902412e-06, "loss": 11.4321, "step": 9430 }, { "epoch": 0.41894808083833995, "grad_norm": 86.90058898925781, "learning_rate": 9.983634851940485e-06, "loss": 11.3379, "step": 9440 }, { "epoch": 0.41939188177143144, "grad_norm": 95.19513702392578, "learning_rate": 9.983617515978557e-06, "loss": 11.4713, "step": 9450 }, { "epoch": 0.4198356827045229, "grad_norm": 86.9664535522461, "learning_rate": 9.98360018001663e-06, "loss": 11.6491, "step": 9460 }, { "epoch": 0.42027948363761436, "grad_norm": 83.81656646728516, "learning_rate": 9.983582844054703e-06, "loss": 10.9501, "step": 9470 }, { "epoch": 0.4207232845707058, "grad_norm": 80.9144058227539, "learning_rate": 9.983565508092774e-06, "loss": 11.2825, "step": 9480 }, { "epoch": 0.4211670855037973, "grad_norm": 85.7936782836914, "learning_rate": 9.983548172130847e-06, "loss": 10.8854, "step": 9490 }, { "epoch": 0.4216108864368887, "grad_norm": 94.24036407470703, "learning_rate": 9.98353083616892e-06, "loss": 11.5424, "step": 9500 }, { "epoch": 0.4220546873699802, "grad_norm": 86.44004821777344, "learning_rate": 9.983513500206993e-06, "loss": 11.6961, "step": 9510 }, { "epoch": 0.4224984883030717, "grad_norm": 79.09918975830078, "learning_rate": 9.983496164245065e-06, "loss": 11.5077, "step": 9520 }, { "epoch": 0.4229422892361631, "grad_norm": 106.33499908447266, "learning_rate": 9.983478828283138e-06, "loss": 11.7945, "step": 9530 }, { "epoch": 0.4233860901692546, "grad_norm": 89.4134292602539, "learning_rate": 9.983461492321211e-06, "loss": 11.2233, "step": 9540 }, { "epoch": 0.42382989110234603, "grad_norm": 85.91355895996094, "learning_rate": 9.983444156359282e-06, "loss": 11.2504, "step": 9550 }, { "epoch": 0.4242736920354375, "grad_norm": 74.06096649169922, "learning_rate": 9.983426820397355e-06, "loss": 11.6198, "step": 9560 }, { "epoch": 0.42471749296852895, "grad_norm": 81.07852172851562, "learning_rate": 9.983409484435428e-06, "loss": 11.7593, "step": 9570 }, { "epoch": 0.42516129390162044, "grad_norm": 87.0907211303711, "learning_rate": 9.9833921484735e-06, "loss": 11.0808, "step": 9580 }, { "epoch": 0.4256050948347119, "grad_norm": 95.18062591552734, "learning_rate": 9.983374812511573e-06, "loss": 11.0543, "step": 9590 }, { "epoch": 0.42604889576780336, "grad_norm": 85.12203979492188, "learning_rate": 9.983357476549646e-06, "loss": 11.0986, "step": 9600 }, { "epoch": 0.4264926967008948, "grad_norm": 80.9763412475586, "learning_rate": 9.983340140587717e-06, "loss": 11.1147, "step": 9610 }, { "epoch": 0.4269364976339863, "grad_norm": 80.79862213134766, "learning_rate": 9.98332280462579e-06, "loss": 10.9127, "step": 9620 }, { "epoch": 0.4273802985670777, "grad_norm": 93.28567504882812, "learning_rate": 9.983305468663863e-06, "loss": 11.6276, "step": 9630 }, { "epoch": 0.4278240995001692, "grad_norm": 92.6642074584961, "learning_rate": 9.983288132701935e-06, "loss": 11.6817, "step": 9640 }, { "epoch": 0.4282679004332607, "grad_norm": 84.80957794189453, "learning_rate": 9.983270796740008e-06, "loss": 11.0266, "step": 9650 }, { "epoch": 0.4287117013663521, "grad_norm": 80.59945678710938, "learning_rate": 9.983253460778081e-06, "loss": 11.211, "step": 9660 }, { "epoch": 0.4291555022994436, "grad_norm": 83.12669372558594, "learning_rate": 9.983236124816152e-06, "loss": 11.2012, "step": 9670 }, { "epoch": 0.42959930323253503, "grad_norm": 92.85382080078125, "learning_rate": 9.983218788854225e-06, "loss": 10.8469, "step": 9680 }, { "epoch": 0.4300431041656265, "grad_norm": 89.2972640991211, "learning_rate": 9.983201452892299e-06, "loss": 11.5653, "step": 9690 }, { "epoch": 0.43048690509871795, "grad_norm": 82.58189392089844, "learning_rate": 9.98318411693037e-06, "loss": 11.5281, "step": 9700 }, { "epoch": 0.43093070603180944, "grad_norm": 75.15813446044922, "learning_rate": 9.983166780968443e-06, "loss": 11.4775, "step": 9710 }, { "epoch": 0.4313745069649009, "grad_norm": 87.26850128173828, "learning_rate": 9.983149445006516e-06, "loss": 11.4795, "step": 9720 }, { "epoch": 0.43181830789799236, "grad_norm": 80.3275375366211, "learning_rate": 9.98313210904459e-06, "loss": 10.6572, "step": 9730 }, { "epoch": 0.4322621088310838, "grad_norm": 77.77581024169922, "learning_rate": 9.98311477308266e-06, "loss": 11.1713, "step": 9740 }, { "epoch": 0.4327059097641753, "grad_norm": 86.08430480957031, "learning_rate": 9.983097437120734e-06, "loss": 11.2765, "step": 9750 }, { "epoch": 0.43314971069726677, "grad_norm": 80.2632827758789, "learning_rate": 9.983080101158807e-06, "loss": 11.1026, "step": 9760 }, { "epoch": 0.4335935116303582, "grad_norm": 90.77334594726562, "learning_rate": 9.983062765196878e-06, "loss": 11.2743, "step": 9770 }, { "epoch": 0.4340373125634497, "grad_norm": 85.51402282714844, "learning_rate": 9.983045429234951e-06, "loss": 11.1924, "step": 9780 }, { "epoch": 0.4344811134965411, "grad_norm": 87.34100341796875, "learning_rate": 9.983028093273024e-06, "loss": 11.5827, "step": 9790 }, { "epoch": 0.4349249144296326, "grad_norm": 104.64212036132812, "learning_rate": 9.983010757311096e-06, "loss": 11.2375, "step": 9800 }, { "epoch": 0.43536871536272403, "grad_norm": 70.56439208984375, "learning_rate": 9.982993421349169e-06, "loss": 11.2364, "step": 9810 }, { "epoch": 0.4358125162958155, "grad_norm": 83.87458038330078, "learning_rate": 9.982976085387242e-06, "loss": 11.6634, "step": 9820 }, { "epoch": 0.43625631722890695, "grad_norm": 85.76553344726562, "learning_rate": 9.982958749425313e-06, "loss": 11.5362, "step": 9830 }, { "epoch": 0.43670011816199844, "grad_norm": 89.0379867553711, "learning_rate": 9.982941413463386e-06, "loss": 11.1996, "step": 9840 }, { "epoch": 0.4371439190950899, "grad_norm": 103.95638275146484, "learning_rate": 9.98292407750146e-06, "loss": 10.9446, "step": 9850 }, { "epoch": 0.43758772002818136, "grad_norm": 82.41026306152344, "learning_rate": 9.98290674153953e-06, "loss": 11.219, "step": 9860 }, { "epoch": 0.43803152096127285, "grad_norm": 86.83589172363281, "learning_rate": 9.982889405577604e-06, "loss": 11.787, "step": 9870 }, { "epoch": 0.4384753218943643, "grad_norm": 80.76217651367188, "learning_rate": 9.982872069615677e-06, "loss": 10.6874, "step": 9880 }, { "epoch": 0.43891912282745577, "grad_norm": 79.48180389404297, "learning_rate": 9.982854733653748e-06, "loss": 11.0988, "step": 9890 }, { "epoch": 0.4393629237605472, "grad_norm": 96.88164520263672, "learning_rate": 9.982837397691821e-06, "loss": 11.2804, "step": 9900 }, { "epoch": 0.4398067246936387, "grad_norm": 78.75862121582031, "learning_rate": 9.982820061729894e-06, "loss": 10.8656, "step": 9910 }, { "epoch": 0.4402505256267301, "grad_norm": 74.44935607910156, "learning_rate": 9.982802725767966e-06, "loss": 11.1552, "step": 9920 }, { "epoch": 0.4406943265598216, "grad_norm": 90.43840789794922, "learning_rate": 9.982785389806039e-06, "loss": 10.7856, "step": 9930 }, { "epoch": 0.44113812749291303, "grad_norm": 100.49180603027344, "learning_rate": 9.982768053844112e-06, "loss": 11.1607, "step": 9940 }, { "epoch": 0.4415819284260045, "grad_norm": 78.78294372558594, "learning_rate": 9.982750717882185e-06, "loss": 11.0682, "step": 9950 }, { "epoch": 0.44202572935909595, "grad_norm": 98.88738250732422, "learning_rate": 9.982733381920256e-06, "loss": 11.5788, "step": 9960 }, { "epoch": 0.44246953029218744, "grad_norm": 83.48590087890625, "learning_rate": 9.98271604595833e-06, "loss": 11.1786, "step": 9970 }, { "epoch": 0.44291333122527893, "grad_norm": 71.23033142089844, "learning_rate": 9.982698709996403e-06, "loss": 10.6031, "step": 9980 }, { "epoch": 0.44335713215837036, "grad_norm": 92.292236328125, "learning_rate": 9.982681374034474e-06, "loss": 11.3427, "step": 9990 }, { "epoch": 0.44380093309146185, "grad_norm": 85.08587646484375, "learning_rate": 9.982664038072547e-06, "loss": 11.022, "step": 10000 }, { "epoch": 0.44380093309146185, "eval_loss": 0.34827741980552673, "eval_runtime": 673.7997, "eval_samples_per_second": 1802.303, "eval_steps_per_second": 56.322, "step": 10000 }, { "epoch": 0.4442447340245533, "grad_norm": 87.5740966796875, "learning_rate": 9.98264670211062e-06, "loss": 11.53, "step": 10010 }, { "epoch": 0.44468853495764477, "grad_norm": 79.04003143310547, "learning_rate": 9.982629366148691e-06, "loss": 11.2839, "step": 10020 }, { "epoch": 0.4451323358907362, "grad_norm": 93.09468841552734, "learning_rate": 9.982612030186765e-06, "loss": 11.685, "step": 10030 }, { "epoch": 0.4455761368238277, "grad_norm": 75.00133514404297, "learning_rate": 9.982594694224838e-06, "loss": 10.8954, "step": 10040 }, { "epoch": 0.4460199377569191, "grad_norm": 95.3443374633789, "learning_rate": 9.982577358262909e-06, "loss": 10.8405, "step": 10050 }, { "epoch": 0.4464637386900106, "grad_norm": 81.28312683105469, "learning_rate": 9.982560022300982e-06, "loss": 10.8736, "step": 10060 }, { "epoch": 0.44690753962310203, "grad_norm": 84.82075500488281, "learning_rate": 9.982542686339055e-06, "loss": 10.9681, "step": 10070 }, { "epoch": 0.4473513405561935, "grad_norm": 83.95282745361328, "learning_rate": 9.982525350377127e-06, "loss": 11.012, "step": 10080 }, { "epoch": 0.44779514148928495, "grad_norm": 77.86396026611328, "learning_rate": 9.9825080144152e-06, "loss": 11.2847, "step": 10090 }, { "epoch": 0.44823894242237644, "grad_norm": 87.20881652832031, "learning_rate": 9.982490678453273e-06, "loss": 11.2584, "step": 10100 }, { "epoch": 0.44868274335546793, "grad_norm": 100.79487609863281, "learning_rate": 9.982473342491344e-06, "loss": 11.8089, "step": 10110 }, { "epoch": 0.44912654428855936, "grad_norm": 81.89049530029297, "learning_rate": 9.982456006529417e-06, "loss": 11.5939, "step": 10120 }, { "epoch": 0.44957034522165085, "grad_norm": 91.9642562866211, "learning_rate": 9.98243867056749e-06, "loss": 10.6203, "step": 10130 }, { "epoch": 0.4500141461547423, "grad_norm": 98.22434997558594, "learning_rate": 9.982421334605562e-06, "loss": 10.6234, "step": 10140 }, { "epoch": 0.45045794708783377, "grad_norm": 92.3149185180664, "learning_rate": 9.982403998643635e-06, "loss": 11.1096, "step": 10150 }, { "epoch": 0.4509017480209252, "grad_norm": 96.91551971435547, "learning_rate": 9.982386662681708e-06, "loss": 11.3593, "step": 10160 }, { "epoch": 0.4513455489540167, "grad_norm": 77.00688934326172, "learning_rate": 9.982369326719781e-06, "loss": 11.0639, "step": 10170 }, { "epoch": 0.4517893498871081, "grad_norm": 81.16304016113281, "learning_rate": 9.982351990757852e-06, "loss": 11.2663, "step": 10180 }, { "epoch": 0.4522331508201996, "grad_norm": 79.47703552246094, "learning_rate": 9.982334654795925e-06, "loss": 10.8026, "step": 10190 }, { "epoch": 0.45267695175329103, "grad_norm": 95.82781219482422, "learning_rate": 9.982317318833998e-06, "loss": 10.9343, "step": 10200 }, { "epoch": 0.4531207526863825, "grad_norm": 86.30982208251953, "learning_rate": 9.98229998287207e-06, "loss": 10.8559, "step": 10210 }, { "epoch": 0.453564553619474, "grad_norm": 84.82122802734375, "learning_rate": 9.982282646910143e-06, "loss": 11.2375, "step": 10220 }, { "epoch": 0.45400835455256544, "grad_norm": 85.73993682861328, "learning_rate": 9.982265310948216e-06, "loss": 11.1234, "step": 10230 }, { "epoch": 0.45445215548565693, "grad_norm": 76.42994689941406, "learning_rate": 9.982247974986287e-06, "loss": 11.2003, "step": 10240 }, { "epoch": 0.45489595641874836, "grad_norm": 84.67989349365234, "learning_rate": 9.98223063902436e-06, "loss": 10.8626, "step": 10250 }, { "epoch": 0.45533975735183985, "grad_norm": 97.71283721923828, "learning_rate": 9.982213303062434e-06, "loss": 10.8125, "step": 10260 }, { "epoch": 0.4557835582849313, "grad_norm": 77.48724365234375, "learning_rate": 9.982195967100505e-06, "loss": 10.9105, "step": 10270 }, { "epoch": 0.45622735921802277, "grad_norm": 73.06230926513672, "learning_rate": 9.982178631138578e-06, "loss": 10.8443, "step": 10280 }, { "epoch": 0.4566711601511142, "grad_norm": 78.53364562988281, "learning_rate": 9.982161295176651e-06, "loss": 11.1985, "step": 10290 }, { "epoch": 0.4571149610842057, "grad_norm": 71.37035369873047, "learning_rate": 9.982143959214722e-06, "loss": 11.2463, "step": 10300 }, { "epoch": 0.4575587620172971, "grad_norm": 76.0063247680664, "learning_rate": 9.982126623252796e-06, "loss": 10.9098, "step": 10310 }, { "epoch": 0.4580025629503886, "grad_norm": 82.43836212158203, "learning_rate": 9.982109287290869e-06, "loss": 10.8438, "step": 10320 }, { "epoch": 0.4584463638834801, "grad_norm": 88.15770721435547, "learning_rate": 9.98209195132894e-06, "loss": 11.2367, "step": 10330 }, { "epoch": 0.4588901648165715, "grad_norm": 78.84662628173828, "learning_rate": 9.982074615367013e-06, "loss": 10.8668, "step": 10340 }, { "epoch": 0.459333965749663, "grad_norm": 93.12088012695312, "learning_rate": 9.982057279405086e-06, "loss": 11.3541, "step": 10350 }, { "epoch": 0.45977776668275444, "grad_norm": 80.99417114257812, "learning_rate": 9.982039943443158e-06, "loss": 11.7431, "step": 10360 }, { "epoch": 0.46022156761584593, "grad_norm": 78.97187805175781, "learning_rate": 9.98202260748123e-06, "loss": 11.2636, "step": 10370 }, { "epoch": 0.46066536854893736, "grad_norm": 77.9780502319336, "learning_rate": 9.982005271519304e-06, "loss": 11.3237, "step": 10380 }, { "epoch": 0.46110916948202885, "grad_norm": 94.6609115600586, "learning_rate": 9.981987935557377e-06, "loss": 11.1363, "step": 10390 }, { "epoch": 0.4615529704151203, "grad_norm": 94.51988220214844, "learning_rate": 9.981970599595448e-06, "loss": 11.1418, "step": 10400 }, { "epoch": 0.46199677134821177, "grad_norm": 89.58201599121094, "learning_rate": 9.981953263633521e-06, "loss": 11.0321, "step": 10410 }, { "epoch": 0.4624405722813032, "grad_norm": 88.75037384033203, "learning_rate": 9.981935927671594e-06, "loss": 11.1615, "step": 10420 }, { "epoch": 0.4628843732143947, "grad_norm": 72.32737731933594, "learning_rate": 9.981918591709666e-06, "loss": 10.7545, "step": 10430 }, { "epoch": 0.46332817414748617, "grad_norm": 94.58203887939453, "learning_rate": 9.981901255747739e-06, "loss": 10.8112, "step": 10440 }, { "epoch": 0.4637719750805776, "grad_norm": 88.47208404541016, "learning_rate": 9.981883919785812e-06, "loss": 11.0896, "step": 10450 }, { "epoch": 0.4642157760136691, "grad_norm": 86.8807601928711, "learning_rate": 9.981866583823883e-06, "loss": 10.6616, "step": 10460 }, { "epoch": 0.4646595769467605, "grad_norm": 67.94696807861328, "learning_rate": 9.981849247861956e-06, "loss": 10.5705, "step": 10470 }, { "epoch": 0.465103377879852, "grad_norm": 83.57070922851562, "learning_rate": 9.98183191190003e-06, "loss": 11.4698, "step": 10480 }, { "epoch": 0.46554717881294344, "grad_norm": 83.02405548095703, "learning_rate": 9.9818145759381e-06, "loss": 10.5724, "step": 10490 }, { "epoch": 0.46599097974603493, "grad_norm": 83.82646942138672, "learning_rate": 9.981797239976174e-06, "loss": 11.0542, "step": 10500 }, { "epoch": 0.46643478067912636, "grad_norm": 77.04849243164062, "learning_rate": 9.981779904014247e-06, "loss": 11.3302, "step": 10510 }, { "epoch": 0.46687858161221785, "grad_norm": 74.8342514038086, "learning_rate": 9.981762568052318e-06, "loss": 11.5481, "step": 10520 }, { "epoch": 0.4673223825453093, "grad_norm": 76.42134094238281, "learning_rate": 9.981745232090391e-06, "loss": 10.9186, "step": 10530 }, { "epoch": 0.46776618347840077, "grad_norm": 82.68692779541016, "learning_rate": 9.981727896128464e-06, "loss": 11.2538, "step": 10540 }, { "epoch": 0.46820998441149225, "grad_norm": 69.0387191772461, "learning_rate": 9.981710560166536e-06, "loss": 10.8114, "step": 10550 }, { "epoch": 0.4686537853445837, "grad_norm": 103.88386535644531, "learning_rate": 9.981693224204609e-06, "loss": 10.9474, "step": 10560 }, { "epoch": 0.46909758627767517, "grad_norm": 88.8089370727539, "learning_rate": 9.981675888242682e-06, "loss": 10.8754, "step": 10570 }, { "epoch": 0.4695413872107666, "grad_norm": 79.1522216796875, "learning_rate": 9.981658552280753e-06, "loss": 11.2002, "step": 10580 }, { "epoch": 0.4699851881438581, "grad_norm": 75.01962280273438, "learning_rate": 9.981641216318826e-06, "loss": 10.5971, "step": 10590 }, { "epoch": 0.4704289890769495, "grad_norm": 88.03787994384766, "learning_rate": 9.9816238803569e-06, "loss": 11.2155, "step": 10600 }, { "epoch": 0.470872790010041, "grad_norm": 88.27069854736328, "learning_rate": 9.981606544394973e-06, "loss": 11.1796, "step": 10610 }, { "epoch": 0.47131659094313244, "grad_norm": 91.5178451538086, "learning_rate": 9.981589208433044e-06, "loss": 10.8964, "step": 10620 }, { "epoch": 0.47176039187622393, "grad_norm": 89.97875213623047, "learning_rate": 9.981571872471117e-06, "loss": 11.0617, "step": 10630 }, { "epoch": 0.47220419280931536, "grad_norm": 77.71656036376953, "learning_rate": 9.98155453650919e-06, "loss": 11.2671, "step": 10640 }, { "epoch": 0.47264799374240685, "grad_norm": 90.4183120727539, "learning_rate": 9.981537200547262e-06, "loss": 10.8842, "step": 10650 }, { "epoch": 0.4730917946754983, "grad_norm": 109.40415954589844, "learning_rate": 9.981519864585335e-06, "loss": 10.8791, "step": 10660 }, { "epoch": 0.47353559560858977, "grad_norm": 79.65886688232422, "learning_rate": 9.981502528623408e-06, "loss": 11.3155, "step": 10670 }, { "epoch": 0.47397939654168125, "grad_norm": 75.3977279663086, "learning_rate": 9.981485192661479e-06, "loss": 10.6279, "step": 10680 }, { "epoch": 0.4744231974747727, "grad_norm": 90.0768051147461, "learning_rate": 9.981467856699552e-06, "loss": 10.8386, "step": 10690 }, { "epoch": 0.47486699840786417, "grad_norm": 78.82632446289062, "learning_rate": 9.981450520737625e-06, "loss": 10.8152, "step": 10700 }, { "epoch": 0.4753107993409556, "grad_norm": 82.59823608398438, "learning_rate": 9.981433184775697e-06, "loss": 11.0986, "step": 10710 }, { "epoch": 0.4757546002740471, "grad_norm": 82.44734954833984, "learning_rate": 9.98141584881377e-06, "loss": 10.8024, "step": 10720 }, { "epoch": 0.4761984012071385, "grad_norm": 91.71231079101562, "learning_rate": 9.981398512851843e-06, "loss": 10.8372, "step": 10730 }, { "epoch": 0.47664220214023, "grad_norm": 87.06108093261719, "learning_rate": 9.981381176889914e-06, "loss": 11.0266, "step": 10740 }, { "epoch": 0.47708600307332144, "grad_norm": 90.88961029052734, "learning_rate": 9.981363840927987e-06, "loss": 10.607, "step": 10750 }, { "epoch": 0.47752980400641293, "grad_norm": 90.09719848632812, "learning_rate": 9.98134650496606e-06, "loss": 11.0468, "step": 10760 }, { "epoch": 0.47797360493950436, "grad_norm": 87.11450958251953, "learning_rate": 9.981329169004132e-06, "loss": 10.7198, "step": 10770 }, { "epoch": 0.47841740587259585, "grad_norm": 80.5800552368164, "learning_rate": 9.981311833042205e-06, "loss": 11.1395, "step": 10780 }, { "epoch": 0.47886120680568733, "grad_norm": 90.17820739746094, "learning_rate": 9.981294497080278e-06, "loss": 11.3689, "step": 10790 }, { "epoch": 0.47930500773877877, "grad_norm": 84.49495697021484, "learning_rate": 9.981277161118351e-06, "loss": 10.9592, "step": 10800 }, { "epoch": 0.47974880867187025, "grad_norm": 94.85502624511719, "learning_rate": 9.981259825156422e-06, "loss": 10.6913, "step": 10810 }, { "epoch": 0.4801926096049617, "grad_norm": 79.14407348632812, "learning_rate": 9.981242489194495e-06, "loss": 11.8454, "step": 10820 }, { "epoch": 0.48063641053805317, "grad_norm": 91.49024200439453, "learning_rate": 9.981225153232568e-06, "loss": 11.292, "step": 10830 }, { "epoch": 0.4810802114711446, "grad_norm": 97.75911712646484, "learning_rate": 9.98120781727064e-06, "loss": 11.3453, "step": 10840 }, { "epoch": 0.4815240124042361, "grad_norm": 76.58828735351562, "learning_rate": 9.981190481308713e-06, "loss": 11.1765, "step": 10850 }, { "epoch": 0.4819678133373275, "grad_norm": 79.92506408691406, "learning_rate": 9.981173145346786e-06, "loss": 10.868, "step": 10860 }, { "epoch": 0.482411614270419, "grad_norm": 101.02202606201172, "learning_rate": 9.981155809384857e-06, "loss": 11.0264, "step": 10870 }, { "epoch": 0.48285541520351044, "grad_norm": 81.69430541992188, "learning_rate": 9.98113847342293e-06, "loss": 11.2259, "step": 10880 }, { "epoch": 0.48329921613660193, "grad_norm": 86.8892822265625, "learning_rate": 9.981121137461004e-06, "loss": 11.0128, "step": 10890 }, { "epoch": 0.4837430170696934, "grad_norm": 72.10417938232422, "learning_rate": 9.981103801499075e-06, "loss": 11.3905, "step": 10900 }, { "epoch": 0.48418681800278485, "grad_norm": 85.03720092773438, "learning_rate": 9.981086465537148e-06, "loss": 11.1796, "step": 10910 }, { "epoch": 0.48463061893587633, "grad_norm": 88.04219818115234, "learning_rate": 9.981069129575221e-06, "loss": 11.2465, "step": 10920 }, { "epoch": 0.48507441986896777, "grad_norm": 73.39752960205078, "learning_rate": 9.981051793613294e-06, "loss": 10.9952, "step": 10930 }, { "epoch": 0.48551822080205925, "grad_norm": 91.22920227050781, "learning_rate": 9.981034457651366e-06, "loss": 10.9643, "step": 10940 }, { "epoch": 0.4859620217351507, "grad_norm": 67.8987045288086, "learning_rate": 9.981017121689439e-06, "loss": 11.1133, "step": 10950 }, { "epoch": 0.48640582266824217, "grad_norm": 78.0125732421875, "learning_rate": 9.980999785727512e-06, "loss": 11.5816, "step": 10960 }, { "epoch": 0.4868496236013336, "grad_norm": 90.03052520751953, "learning_rate": 9.980982449765583e-06, "loss": 11.2881, "step": 10970 }, { "epoch": 0.4872934245344251, "grad_norm": 90.72000122070312, "learning_rate": 9.980965113803656e-06, "loss": 11.1198, "step": 10980 }, { "epoch": 0.4877372254675165, "grad_norm": 72.88054656982422, "learning_rate": 9.98094777784173e-06, "loss": 11.3579, "step": 10990 }, { "epoch": 0.488181026400608, "grad_norm": 84.08674621582031, "learning_rate": 9.9809304418798e-06, "loss": 10.871, "step": 11000 }, { "epoch": 0.488181026400608, "eval_loss": 0.34528353810310364, "eval_runtime": 674.0977, "eval_samples_per_second": 1801.506, "eval_steps_per_second": 56.297, "step": 11000 }, { "epoch": 0.4886248273336995, "grad_norm": 74.27079010009766, "learning_rate": 9.980913105917874e-06, "loss": 11.5738, "step": 11010 }, { "epoch": 0.48906862826679093, "grad_norm": 79.84703063964844, "learning_rate": 9.980895769955947e-06, "loss": 10.8113, "step": 11020 }, { "epoch": 0.4895124291998824, "grad_norm": 78.84832000732422, "learning_rate": 9.980878433994018e-06, "loss": 11.1961, "step": 11030 }, { "epoch": 0.48995623013297385, "grad_norm": 77.97138977050781, "learning_rate": 9.980861098032091e-06, "loss": 10.747, "step": 11040 }, { "epoch": 0.49040003106606533, "grad_norm": 68.38355255126953, "learning_rate": 9.980843762070164e-06, "loss": 10.9581, "step": 11050 }, { "epoch": 0.49084383199915677, "grad_norm": 90.01298522949219, "learning_rate": 9.980826426108237e-06, "loss": 11.2473, "step": 11060 }, { "epoch": 0.49128763293224825, "grad_norm": 69.46513366699219, "learning_rate": 9.980809090146309e-06, "loss": 10.7865, "step": 11070 }, { "epoch": 0.4917314338653397, "grad_norm": 77.08251190185547, "learning_rate": 9.980791754184382e-06, "loss": 10.7613, "step": 11080 }, { "epoch": 0.49217523479843117, "grad_norm": 86.79558563232422, "learning_rate": 9.980774418222455e-06, "loss": 10.4478, "step": 11090 }, { "epoch": 0.4926190357315226, "grad_norm": 78.37332153320312, "learning_rate": 9.980757082260526e-06, "loss": 11.3574, "step": 11100 }, { "epoch": 0.4930628366646141, "grad_norm": 77.12301635742188, "learning_rate": 9.9807397462986e-06, "loss": 10.9563, "step": 11110 }, { "epoch": 0.4935066375977055, "grad_norm": 84.5125961303711, "learning_rate": 9.980722410336672e-06, "loss": 10.7285, "step": 11120 }, { "epoch": 0.493950438530797, "grad_norm": 72.76651763916016, "learning_rate": 9.980705074374744e-06, "loss": 10.8235, "step": 11130 }, { "epoch": 0.4943942394638885, "grad_norm": 65.90115356445312, "learning_rate": 9.980687738412817e-06, "loss": 11.2566, "step": 11140 }, { "epoch": 0.49483804039697993, "grad_norm": 93.98876190185547, "learning_rate": 9.98067040245089e-06, "loss": 11.1416, "step": 11150 }, { "epoch": 0.4952818413300714, "grad_norm": 84.59708404541016, "learning_rate": 9.980653066488961e-06, "loss": 10.9902, "step": 11160 }, { "epoch": 0.49572564226316285, "grad_norm": 82.25418090820312, "learning_rate": 9.980635730527034e-06, "loss": 10.9595, "step": 11170 }, { "epoch": 0.49616944319625433, "grad_norm": 77.34684753417969, "learning_rate": 9.980618394565108e-06, "loss": 11.4648, "step": 11180 }, { "epoch": 0.49661324412934577, "grad_norm": 93.93881225585938, "learning_rate": 9.98060105860318e-06, "loss": 10.8638, "step": 11190 }, { "epoch": 0.49705704506243725, "grad_norm": 70.26705932617188, "learning_rate": 9.980583722641252e-06, "loss": 10.9112, "step": 11200 }, { "epoch": 0.4975008459955287, "grad_norm": 82.66825103759766, "learning_rate": 9.980566386679325e-06, "loss": 11.5958, "step": 11210 }, { "epoch": 0.49794464692862017, "grad_norm": 86.82162475585938, "learning_rate": 9.980549050717398e-06, "loss": 10.8187, "step": 11220 }, { "epoch": 0.4983884478617116, "grad_norm": 79.9798355102539, "learning_rate": 9.98053171475547e-06, "loss": 11.2143, "step": 11230 }, { "epoch": 0.4988322487948031, "grad_norm": 99.23787689208984, "learning_rate": 9.980514378793543e-06, "loss": 10.9515, "step": 11240 }, { "epoch": 0.4992760497278946, "grad_norm": 70.37545013427734, "learning_rate": 9.980497042831616e-06, "loss": 11.0358, "step": 11250 }, { "epoch": 0.499719850660986, "grad_norm": 91.39663696289062, "learning_rate": 9.980479706869687e-06, "loss": 10.7406, "step": 11260 }, { "epoch": 0.5001636515940775, "grad_norm": 81.30348205566406, "learning_rate": 9.98046237090776e-06, "loss": 11.2646, "step": 11270 }, { "epoch": 0.500607452527169, "grad_norm": 83.41373443603516, "learning_rate": 9.980445034945833e-06, "loss": 10.5715, "step": 11280 }, { "epoch": 0.5010512534602604, "grad_norm": 84.33602142333984, "learning_rate": 9.980427698983906e-06, "loss": 10.9119, "step": 11290 }, { "epoch": 0.5014950543933518, "grad_norm": 76.0499038696289, "learning_rate": 9.980410363021978e-06, "loss": 11.1319, "step": 11300 }, { "epoch": 0.5019388553264433, "grad_norm": 87.45086669921875, "learning_rate": 9.98039302706005e-06, "loss": 10.3676, "step": 11310 }, { "epoch": 0.5023826562595348, "grad_norm": 88.60616302490234, "learning_rate": 9.980375691098124e-06, "loss": 11.0152, "step": 11320 }, { "epoch": 0.5028264571926262, "grad_norm": 91.6775894165039, "learning_rate": 9.980358355136195e-06, "loss": 10.8913, "step": 11330 }, { "epoch": 0.5032702581257177, "grad_norm": 83.63784790039062, "learning_rate": 9.980341019174268e-06, "loss": 10.4357, "step": 11340 }, { "epoch": 0.5037140590588092, "grad_norm": 93.07415008544922, "learning_rate": 9.980323683212341e-06, "loss": 10.3792, "step": 11350 }, { "epoch": 0.5041578599919007, "grad_norm": 88.19844818115234, "learning_rate": 9.980306347250413e-06, "loss": 10.886, "step": 11360 }, { "epoch": 0.5046016609249921, "grad_norm": 79.55142211914062, "learning_rate": 9.980289011288486e-06, "loss": 10.8876, "step": 11370 }, { "epoch": 0.5050454618580835, "grad_norm": 85.38150787353516, "learning_rate": 9.980271675326559e-06, "loss": 11.1223, "step": 11380 }, { "epoch": 0.505489262791175, "grad_norm": 69.45784759521484, "learning_rate": 9.98025433936463e-06, "loss": 10.688, "step": 11390 }, { "epoch": 0.5059330637242665, "grad_norm": 83.27287292480469, "learning_rate": 9.980237003402703e-06, "loss": 11.482, "step": 11400 }, { "epoch": 0.506376864657358, "grad_norm": 87.00122833251953, "learning_rate": 9.980219667440777e-06, "loss": 11.2689, "step": 11410 }, { "epoch": 0.5068206655904494, "grad_norm": 78.08293914794922, "learning_rate": 9.98020233147885e-06, "loss": 10.4533, "step": 11420 }, { "epoch": 0.5072644665235408, "grad_norm": 71.37157440185547, "learning_rate": 9.980184995516921e-06, "loss": 10.856, "step": 11430 }, { "epoch": 0.5077082674566323, "grad_norm": 78.55634307861328, "learning_rate": 9.980167659554994e-06, "loss": 11.0349, "step": 11440 }, { "epoch": 0.5081520683897238, "grad_norm": 83.2050552368164, "learning_rate": 9.980150323593067e-06, "loss": 11.0373, "step": 11450 }, { "epoch": 0.5085958693228152, "grad_norm": 94.2475357055664, "learning_rate": 9.980132987631139e-06, "loss": 10.5705, "step": 11460 }, { "epoch": 0.5090396702559067, "grad_norm": 75.6529769897461, "learning_rate": 9.980115651669212e-06, "loss": 10.8084, "step": 11470 }, { "epoch": 0.5094834711889982, "grad_norm": 68.75302124023438, "learning_rate": 9.980098315707285e-06, "loss": 10.4888, "step": 11480 }, { "epoch": 0.5099272721220897, "grad_norm": 88.40451049804688, "learning_rate": 9.980080979745356e-06, "loss": 11.4719, "step": 11490 }, { "epoch": 0.5103710730551811, "grad_norm": 85.7581558227539, "learning_rate": 9.980063643783429e-06, "loss": 11.1014, "step": 11500 }, { "epoch": 0.5108148739882725, "grad_norm": 79.09970092773438, "learning_rate": 9.980046307821502e-06, "loss": 11.0976, "step": 11510 }, { "epoch": 0.511258674921364, "grad_norm": 83.1121597290039, "learning_rate": 9.980028971859574e-06, "loss": 10.8814, "step": 11520 }, { "epoch": 0.5117024758544555, "grad_norm": 77.43012237548828, "learning_rate": 9.980011635897647e-06, "loss": 10.775, "step": 11530 }, { "epoch": 0.512146276787547, "grad_norm": 68.88916778564453, "learning_rate": 9.97999429993572e-06, "loss": 11.1833, "step": 11540 }, { "epoch": 0.5125900777206384, "grad_norm": 70.91609954833984, "learning_rate": 9.979976963973793e-06, "loss": 10.6464, "step": 11550 }, { "epoch": 0.5130338786537298, "grad_norm": 83.58845520019531, "learning_rate": 9.979959628011864e-06, "loss": 10.9326, "step": 11560 }, { "epoch": 0.5134776795868213, "grad_norm": 81.35511016845703, "learning_rate": 9.979942292049937e-06, "loss": 11.163, "step": 11570 }, { "epoch": 0.5139214805199128, "grad_norm": 77.6694564819336, "learning_rate": 9.97992495608801e-06, "loss": 10.8365, "step": 11580 }, { "epoch": 0.5143652814530043, "grad_norm": 87.32294464111328, "learning_rate": 9.979907620126082e-06, "loss": 10.8808, "step": 11590 }, { "epoch": 0.5148090823860957, "grad_norm": 79.58880615234375, "learning_rate": 9.979890284164155e-06, "loss": 10.7793, "step": 11600 }, { "epoch": 0.5152528833191872, "grad_norm": 70.23893737792969, "learning_rate": 9.979872948202228e-06, "loss": 10.9845, "step": 11610 }, { "epoch": 0.5156966842522787, "grad_norm": 72.99483489990234, "learning_rate": 9.9798556122403e-06, "loss": 10.8218, "step": 11620 }, { "epoch": 0.5161404851853701, "grad_norm": 96.56175994873047, "learning_rate": 9.979838276278372e-06, "loss": 11.0247, "step": 11630 }, { "epoch": 0.5165842861184615, "grad_norm": 76.31138610839844, "learning_rate": 9.979820940316445e-06, "loss": 11.0729, "step": 11640 }, { "epoch": 0.517028087051553, "grad_norm": 77.18230438232422, "learning_rate": 9.979803604354517e-06, "loss": 10.3969, "step": 11650 }, { "epoch": 0.5174718879846445, "grad_norm": 90.09929656982422, "learning_rate": 9.97978626839259e-06, "loss": 11.3239, "step": 11660 }, { "epoch": 0.517915688917736, "grad_norm": 69.90628051757812, "learning_rate": 9.979768932430663e-06, "loss": 10.7297, "step": 11670 }, { "epoch": 0.5183594898508274, "grad_norm": 90.91302490234375, "learning_rate": 9.979751596468736e-06, "loss": 11.3096, "step": 11680 }, { "epoch": 0.5188032907839188, "grad_norm": 81.82889556884766, "learning_rate": 9.979734260506807e-06, "loss": 10.9807, "step": 11690 }, { "epoch": 0.5192470917170103, "grad_norm": 86.98174285888672, "learning_rate": 9.97971692454488e-06, "loss": 11.2847, "step": 11700 }, { "epoch": 0.5196908926501018, "grad_norm": 72.4999771118164, "learning_rate": 9.979699588582954e-06, "loss": 10.7933, "step": 11710 }, { "epoch": 0.5201346935831933, "grad_norm": 82.73175048828125, "learning_rate": 9.979682252621025e-06, "loss": 10.8406, "step": 11720 }, { "epoch": 0.5205784945162847, "grad_norm": 68.61962127685547, "learning_rate": 9.979664916659098e-06, "loss": 10.7406, "step": 11730 }, { "epoch": 0.5210222954493762, "grad_norm": 80.85415649414062, "learning_rate": 9.979647580697171e-06, "loss": 11.0003, "step": 11740 }, { "epoch": 0.5214660963824677, "grad_norm": 81.90907287597656, "learning_rate": 9.979630244735243e-06, "loss": 11.4761, "step": 11750 }, { "epoch": 0.5219098973155591, "grad_norm": 70.7921142578125, "learning_rate": 9.979612908773316e-06, "loss": 10.7498, "step": 11760 }, { "epoch": 0.5223536982486505, "grad_norm": 74.50489044189453, "learning_rate": 9.979595572811389e-06, "loss": 11.7802, "step": 11770 }, { "epoch": 0.522797499181742, "grad_norm": 75.52880859375, "learning_rate": 9.97957823684946e-06, "loss": 10.7195, "step": 11780 }, { "epoch": 0.5232413001148335, "grad_norm": 80.11971282958984, "learning_rate": 9.979560900887533e-06, "loss": 10.747, "step": 11790 }, { "epoch": 0.523685101047925, "grad_norm": 84.3338851928711, "learning_rate": 9.979543564925606e-06, "loss": 11.132, "step": 11800 }, { "epoch": 0.5241289019810164, "grad_norm": 80.16667938232422, "learning_rate": 9.979526228963678e-06, "loss": 11.2093, "step": 11810 }, { "epoch": 0.5245727029141078, "grad_norm": 70.78595733642578, "learning_rate": 9.97950889300175e-06, "loss": 10.7076, "step": 11820 }, { "epoch": 0.5250165038471993, "grad_norm": 75.8795394897461, "learning_rate": 9.979491557039824e-06, "loss": 10.7308, "step": 11830 }, { "epoch": 0.5254603047802908, "grad_norm": 87.91172790527344, "learning_rate": 9.979474221077895e-06, "loss": 10.3702, "step": 11840 }, { "epoch": 0.5259041057133823, "grad_norm": 86.1435317993164, "learning_rate": 9.979456885115968e-06, "loss": 10.6837, "step": 11850 }, { "epoch": 0.5263479066464737, "grad_norm": 74.28446960449219, "learning_rate": 9.979439549154041e-06, "loss": 11.3425, "step": 11860 }, { "epoch": 0.5267917075795652, "grad_norm": 77.42523956298828, "learning_rate": 9.979422213192113e-06, "loss": 10.7451, "step": 11870 }, { "epoch": 0.5272355085126567, "grad_norm": 71.55403900146484, "learning_rate": 9.979404877230186e-06, "loss": 11.2173, "step": 11880 }, { "epoch": 0.5276793094457481, "grad_norm": 79.82381439208984, "learning_rate": 9.979387541268259e-06, "loss": 10.9905, "step": 11890 }, { "epoch": 0.5281231103788395, "grad_norm": 90.72064208984375, "learning_rate": 9.979370205306332e-06, "loss": 10.8288, "step": 11900 }, { "epoch": 0.528566911311931, "grad_norm": 68.74069213867188, "learning_rate": 9.979352869344403e-06, "loss": 11.3552, "step": 11910 }, { "epoch": 0.5290107122450225, "grad_norm": 79.70250701904297, "learning_rate": 9.979335533382476e-06, "loss": 11.0605, "step": 11920 }, { "epoch": 0.529454513178114, "grad_norm": 77.91078186035156, "learning_rate": 9.97931819742055e-06, "loss": 10.8511, "step": 11930 }, { "epoch": 0.5298983141112055, "grad_norm": 75.19290924072266, "learning_rate": 9.979300861458621e-06, "loss": 11.1562, "step": 11940 }, { "epoch": 0.5303421150442968, "grad_norm": 86.28694152832031, "learning_rate": 9.979283525496694e-06, "loss": 10.9518, "step": 11950 }, { "epoch": 0.5307859159773883, "grad_norm": 83.85566711425781, "learning_rate": 9.979266189534767e-06, "loss": 10.5831, "step": 11960 }, { "epoch": 0.5312297169104798, "grad_norm": 89.48451232910156, "learning_rate": 9.979248853572838e-06, "loss": 11.1542, "step": 11970 }, { "epoch": 0.5316735178435713, "grad_norm": 74.22183227539062, "learning_rate": 9.979231517610911e-06, "loss": 11.742, "step": 11980 }, { "epoch": 0.5321173187766627, "grad_norm": 74.32493591308594, "learning_rate": 9.979214181648985e-06, "loss": 11.5062, "step": 11990 }, { "epoch": 0.5325611197097542, "grad_norm": 84.4752197265625, "learning_rate": 9.979196845687056e-06, "loss": 11.24, "step": 12000 }, { "epoch": 0.5325611197097542, "eval_loss": 0.3403577506542206, "eval_runtime": 674.8568, "eval_samples_per_second": 1799.479, "eval_steps_per_second": 56.234, "step": 12000 }, { "epoch": 0.5330049206428457, "grad_norm": 70.7219467163086, "learning_rate": 9.979179509725129e-06, "loss": 10.7596, "step": 12010 }, { "epoch": 0.5334487215759371, "grad_norm": 81.48099517822266, "learning_rate": 9.979162173763202e-06, "loss": 10.9401, "step": 12020 }, { "epoch": 0.5338925225090285, "grad_norm": 90.189208984375, "learning_rate": 9.979144837801273e-06, "loss": 11.0275, "step": 12030 }, { "epoch": 0.53433632344212, "grad_norm": 77.25494384765625, "learning_rate": 9.979127501839347e-06, "loss": 10.5871, "step": 12040 }, { "epoch": 0.5347801243752115, "grad_norm": 83.02628326416016, "learning_rate": 9.97911016587742e-06, "loss": 11.6188, "step": 12050 }, { "epoch": 0.535223925308303, "grad_norm": 73.51893615722656, "learning_rate": 9.979092829915491e-06, "loss": 11.0726, "step": 12060 }, { "epoch": 0.5356677262413945, "grad_norm": 84.49578857421875, "learning_rate": 9.979075493953564e-06, "loss": 11.4908, "step": 12070 }, { "epoch": 0.5361115271744858, "grad_norm": 71.82061767578125, "learning_rate": 9.979058157991637e-06, "loss": 10.8268, "step": 12080 }, { "epoch": 0.5365553281075773, "grad_norm": 79.56192016601562, "learning_rate": 9.979040822029709e-06, "loss": 10.6541, "step": 12090 }, { "epoch": 0.5369991290406688, "grad_norm": 92.98332977294922, "learning_rate": 9.979023486067782e-06, "loss": 10.8328, "step": 12100 }, { "epoch": 0.5374429299737603, "grad_norm": 78.83235931396484, "learning_rate": 9.979006150105855e-06, "loss": 10.7446, "step": 12110 }, { "epoch": 0.5378867309068517, "grad_norm": 85.79434204101562, "learning_rate": 9.978988814143928e-06, "loss": 11.2323, "step": 12120 }, { "epoch": 0.5383305318399432, "grad_norm": 74.76625061035156, "learning_rate": 9.978971478182e-06, "loss": 11.2963, "step": 12130 }, { "epoch": 0.5387743327730347, "grad_norm": 75.79556274414062, "learning_rate": 9.978954142220072e-06, "loss": 11.4193, "step": 12140 }, { "epoch": 0.5392181337061261, "grad_norm": 71.63494110107422, "learning_rate": 9.978936806258145e-06, "loss": 10.7985, "step": 12150 }, { "epoch": 0.5396619346392176, "grad_norm": 85.59138488769531, "learning_rate": 9.978919470296217e-06, "loss": 11.4485, "step": 12160 }, { "epoch": 0.540105735572309, "grad_norm": 76.133544921875, "learning_rate": 9.97890213433429e-06, "loss": 11.4676, "step": 12170 }, { "epoch": 0.5405495365054005, "grad_norm": 74.8232650756836, "learning_rate": 9.978884798372363e-06, "loss": 10.934, "step": 12180 }, { "epoch": 0.540993337438492, "grad_norm": 82.42066955566406, "learning_rate": 9.978867462410434e-06, "loss": 11.0111, "step": 12190 }, { "epoch": 0.5414371383715835, "grad_norm": 80.84729766845703, "learning_rate": 9.978850126448507e-06, "loss": 10.4483, "step": 12200 }, { "epoch": 0.5418809393046748, "grad_norm": 83.81796264648438, "learning_rate": 9.97883279048658e-06, "loss": 10.8341, "step": 12210 }, { "epoch": 0.5423247402377663, "grad_norm": 73.58375549316406, "learning_rate": 9.978815454524652e-06, "loss": 10.5816, "step": 12220 }, { "epoch": 0.5427685411708578, "grad_norm": 94.62816619873047, "learning_rate": 9.978798118562725e-06, "loss": 11.0264, "step": 12230 }, { "epoch": 0.5432123421039493, "grad_norm": 80.29084014892578, "learning_rate": 9.978780782600798e-06, "loss": 11.1186, "step": 12240 }, { "epoch": 0.5436561430370407, "grad_norm": 86.72004699707031, "learning_rate": 9.97876344663887e-06, "loss": 10.6926, "step": 12250 }, { "epoch": 0.5440999439701322, "grad_norm": 78.45811462402344, "learning_rate": 9.978746110676942e-06, "loss": 10.6757, "step": 12260 }, { "epoch": 0.5445437449032237, "grad_norm": 81.1561050415039, "learning_rate": 9.978728774715015e-06, "loss": 10.9724, "step": 12270 }, { "epoch": 0.5449875458363151, "grad_norm": 81.57537841796875, "learning_rate": 9.978711438753087e-06, "loss": 10.9973, "step": 12280 }, { "epoch": 0.5454313467694066, "grad_norm": 71.61268615722656, "learning_rate": 9.97869410279116e-06, "loss": 10.961, "step": 12290 }, { "epoch": 0.545875147702498, "grad_norm": 69.37332153320312, "learning_rate": 9.978676766829233e-06, "loss": 10.6944, "step": 12300 }, { "epoch": 0.5463189486355895, "grad_norm": 75.34327697753906, "learning_rate": 9.978659430867304e-06, "loss": 10.9383, "step": 12310 }, { "epoch": 0.546762749568681, "grad_norm": 81.64777374267578, "learning_rate": 9.978642094905377e-06, "loss": 10.8668, "step": 12320 }, { "epoch": 0.5472065505017725, "grad_norm": 77.13945770263672, "learning_rate": 9.97862475894345e-06, "loss": 10.5414, "step": 12330 }, { "epoch": 0.5476503514348638, "grad_norm": 68.74790954589844, "learning_rate": 9.978607422981524e-06, "loss": 10.5574, "step": 12340 }, { "epoch": 0.5480941523679553, "grad_norm": 73.5103988647461, "learning_rate": 9.978590087019595e-06, "loss": 10.6581, "step": 12350 }, { "epoch": 0.5485379533010468, "grad_norm": 74.06155395507812, "learning_rate": 9.978572751057668e-06, "loss": 10.9724, "step": 12360 }, { "epoch": 0.5489817542341383, "grad_norm": 83.41978454589844, "learning_rate": 9.978555415095741e-06, "loss": 10.8189, "step": 12370 }, { "epoch": 0.5494255551672297, "grad_norm": 75.58580780029297, "learning_rate": 9.978538079133813e-06, "loss": 10.9859, "step": 12380 }, { "epoch": 0.5498693561003212, "grad_norm": 88.49089050292969, "learning_rate": 9.978520743171886e-06, "loss": 10.7513, "step": 12390 }, { "epoch": 0.5503131570334127, "grad_norm": 58.82784652709961, "learning_rate": 9.978503407209959e-06, "loss": 10.5162, "step": 12400 }, { "epoch": 0.5507569579665041, "grad_norm": 83.45179748535156, "learning_rate": 9.97848607124803e-06, "loss": 10.8963, "step": 12410 }, { "epoch": 0.5512007588995956, "grad_norm": 78.52918243408203, "learning_rate": 9.978468735286103e-06, "loss": 11.3171, "step": 12420 }, { "epoch": 0.551644559832687, "grad_norm": 92.01652526855469, "learning_rate": 9.978451399324176e-06, "loss": 10.9977, "step": 12430 }, { "epoch": 0.5520883607657785, "grad_norm": 81.95563507080078, "learning_rate": 9.978434063362248e-06, "loss": 10.8333, "step": 12440 }, { "epoch": 0.55253216169887, "grad_norm": 77.15538024902344, "learning_rate": 9.97841672740032e-06, "loss": 11.1091, "step": 12450 }, { "epoch": 0.5529759626319615, "grad_norm": 77.65177917480469, "learning_rate": 9.978399391438394e-06, "loss": 10.9557, "step": 12460 }, { "epoch": 0.5534197635650528, "grad_norm": 78.06676483154297, "learning_rate": 9.978382055476465e-06, "loss": 10.6794, "step": 12470 }, { "epoch": 0.5538635644981443, "grad_norm": 74.32503509521484, "learning_rate": 9.978364719514538e-06, "loss": 11.1013, "step": 12480 }, { "epoch": 0.5543073654312358, "grad_norm": 81.63748931884766, "learning_rate": 9.978347383552611e-06, "loss": 11.074, "step": 12490 }, { "epoch": 0.5547511663643273, "grad_norm": 81.70726776123047, "learning_rate": 9.978330047590683e-06, "loss": 10.3868, "step": 12500 }, { "epoch": 0.5551949672974188, "grad_norm": 81.77082061767578, "learning_rate": 9.978312711628756e-06, "loss": 10.7695, "step": 12510 }, { "epoch": 0.5556387682305102, "grad_norm": 87.88277435302734, "learning_rate": 9.978295375666829e-06, "loss": 11.4438, "step": 12520 }, { "epoch": 0.5560825691636017, "grad_norm": 72.19029998779297, "learning_rate": 9.9782780397049e-06, "loss": 10.6795, "step": 12530 }, { "epoch": 0.5565263700966931, "grad_norm": 66.47805786132812, "learning_rate": 9.978260703742973e-06, "loss": 10.5683, "step": 12540 }, { "epoch": 0.5569701710297846, "grad_norm": 79.33062744140625, "learning_rate": 9.978243367781046e-06, "loss": 10.8922, "step": 12550 }, { "epoch": 0.557413971962876, "grad_norm": 67.08543395996094, "learning_rate": 9.97822603181912e-06, "loss": 10.2346, "step": 12560 }, { "epoch": 0.5578577728959675, "grad_norm": 73.28809356689453, "learning_rate": 9.978208695857191e-06, "loss": 11.0026, "step": 12570 }, { "epoch": 0.558301573829059, "grad_norm": 75.66752624511719, "learning_rate": 9.978191359895264e-06, "loss": 10.7005, "step": 12580 }, { "epoch": 0.5587453747621505, "grad_norm": 75.37193298339844, "learning_rate": 9.978174023933337e-06, "loss": 11.1813, "step": 12590 }, { "epoch": 0.5591891756952418, "grad_norm": 72.89427947998047, "learning_rate": 9.978156687971408e-06, "loss": 11.0323, "step": 12600 }, { "epoch": 0.5596329766283333, "grad_norm": 70.98633575439453, "learning_rate": 9.978139352009482e-06, "loss": 10.8357, "step": 12610 }, { "epoch": 0.5600767775614248, "grad_norm": 79.14083099365234, "learning_rate": 9.978122016047555e-06, "loss": 11.1229, "step": 12620 }, { "epoch": 0.5605205784945163, "grad_norm": 77.51022338867188, "learning_rate": 9.978104680085626e-06, "loss": 10.6968, "step": 12630 }, { "epoch": 0.5609643794276078, "grad_norm": 84.46963500976562, "learning_rate": 9.978087344123699e-06, "loss": 11.0026, "step": 12640 }, { "epoch": 0.5614081803606992, "grad_norm": 69.38136291503906, "learning_rate": 9.978070008161772e-06, "loss": 10.9979, "step": 12650 }, { "epoch": 0.5618519812937907, "grad_norm": 79.57415008544922, "learning_rate": 9.978052672199844e-06, "loss": 11.0241, "step": 12660 }, { "epoch": 0.5622957822268821, "grad_norm": 76.59188079833984, "learning_rate": 9.978035336237917e-06, "loss": 10.9994, "step": 12670 }, { "epoch": 0.5627395831599736, "grad_norm": 71.59693145751953, "learning_rate": 9.97801800027599e-06, "loss": 10.8948, "step": 12680 }, { "epoch": 0.563183384093065, "grad_norm": 87.1396484375, "learning_rate": 9.978000664314061e-06, "loss": 11.0541, "step": 12690 }, { "epoch": 0.5636271850261565, "grad_norm": 80.40076446533203, "learning_rate": 9.977983328352134e-06, "loss": 10.9173, "step": 12700 }, { "epoch": 0.564070985959248, "grad_norm": 70.14698791503906, "learning_rate": 9.977965992390207e-06, "loss": 10.8389, "step": 12710 }, { "epoch": 0.5645147868923395, "grad_norm": 77.14825439453125, "learning_rate": 9.977948656428279e-06, "loss": 10.7059, "step": 12720 }, { "epoch": 0.564958587825431, "grad_norm": 70.7795181274414, "learning_rate": 9.977931320466352e-06, "loss": 11.038, "step": 12730 }, { "epoch": 0.5654023887585223, "grad_norm": 91.8403549194336, "learning_rate": 9.977913984504425e-06, "loss": 11.0422, "step": 12740 }, { "epoch": 0.5658461896916138, "grad_norm": 74.08330535888672, "learning_rate": 9.977896648542496e-06, "loss": 11.2937, "step": 12750 }, { "epoch": 0.5662899906247053, "grad_norm": 65.14645385742188, "learning_rate": 9.97787931258057e-06, "loss": 10.9717, "step": 12760 }, { "epoch": 0.5667337915577968, "grad_norm": 71.15072631835938, "learning_rate": 9.977861976618642e-06, "loss": 10.9591, "step": 12770 }, { "epoch": 0.5671775924908882, "grad_norm": 76.79869079589844, "learning_rate": 9.977844640656715e-06, "loss": 10.5707, "step": 12780 }, { "epoch": 0.5676213934239797, "grad_norm": 85.81656646728516, "learning_rate": 9.977827304694787e-06, "loss": 11.0967, "step": 12790 }, { "epoch": 0.5680651943570711, "grad_norm": 62.44831848144531, "learning_rate": 9.97780996873286e-06, "loss": 10.7672, "step": 12800 }, { "epoch": 0.5685089952901626, "grad_norm": 79.21768188476562, "learning_rate": 9.977792632770933e-06, "loss": 10.8746, "step": 12810 }, { "epoch": 0.568952796223254, "grad_norm": 75.3280258178711, "learning_rate": 9.977775296809004e-06, "loss": 10.6935, "step": 12820 }, { "epoch": 0.5693965971563455, "grad_norm": 75.73821258544922, "learning_rate": 9.977757960847077e-06, "loss": 11.4327, "step": 12830 }, { "epoch": 0.569840398089437, "grad_norm": 77.80765533447266, "learning_rate": 9.97774062488515e-06, "loss": 10.4745, "step": 12840 }, { "epoch": 0.5702841990225285, "grad_norm": 75.14087677001953, "learning_rate": 9.977723288923222e-06, "loss": 11.1868, "step": 12850 }, { "epoch": 0.57072799995562, "grad_norm": 91.36666107177734, "learning_rate": 9.977705952961295e-06, "loss": 10.9202, "step": 12860 }, { "epoch": 0.5711718008887113, "grad_norm": 75.56893920898438, "learning_rate": 9.977688616999368e-06, "loss": 10.4079, "step": 12870 }, { "epoch": 0.5716156018218028, "grad_norm": 78.30206298828125, "learning_rate": 9.97767128103744e-06, "loss": 10.688, "step": 12880 }, { "epoch": 0.5720594027548943, "grad_norm": 67.02020263671875, "learning_rate": 9.977653945075512e-06, "loss": 11.0276, "step": 12890 }, { "epoch": 0.5725032036879858, "grad_norm": 82.73100280761719, "learning_rate": 9.977636609113586e-06, "loss": 10.2337, "step": 12900 }, { "epoch": 0.5729470046210772, "grad_norm": 68.80712127685547, "learning_rate": 9.977619273151657e-06, "loss": 10.8567, "step": 12910 }, { "epoch": 0.5733908055541687, "grad_norm": 84.04026794433594, "learning_rate": 9.97760193718973e-06, "loss": 10.6981, "step": 12920 }, { "epoch": 0.5738346064872601, "grad_norm": 76.38339233398438, "learning_rate": 9.977584601227803e-06, "loss": 10.8466, "step": 12930 }, { "epoch": 0.5742784074203516, "grad_norm": 72.25580596923828, "learning_rate": 9.977567265265874e-06, "loss": 10.8808, "step": 12940 }, { "epoch": 0.574722208353443, "grad_norm": 71.599853515625, "learning_rate": 9.977549929303948e-06, "loss": 11.0439, "step": 12950 }, { "epoch": 0.5751660092865345, "grad_norm": 70.68738555908203, "learning_rate": 9.97753259334202e-06, "loss": 10.314, "step": 12960 }, { "epoch": 0.575609810219626, "grad_norm": 85.19171905517578, "learning_rate": 9.977515257380092e-06, "loss": 10.8608, "step": 12970 }, { "epoch": 0.5760536111527175, "grad_norm": 69.8178482055664, "learning_rate": 9.977497921418165e-06, "loss": 10.6028, "step": 12980 }, { "epoch": 0.576497412085809, "grad_norm": 75.49757385253906, "learning_rate": 9.977480585456238e-06, "loss": 10.3789, "step": 12990 }, { "epoch": 0.5769412130189003, "grad_norm": 71.4937515258789, "learning_rate": 9.977463249494311e-06, "loss": 10.6222, "step": 13000 }, { "epoch": 0.5769412130189003, "eval_loss": 0.3380095958709717, "eval_runtime": 676.243, "eval_samples_per_second": 1795.791, "eval_steps_per_second": 56.119, "step": 13000 }, { "epoch": 0.5773850139519918, "grad_norm": 80.01882934570312, "learning_rate": 9.977445913532383e-06, "loss": 11.0475, "step": 13010 }, { "epoch": 0.5778288148850833, "grad_norm": 79.94010925292969, "learning_rate": 9.977428577570456e-06, "loss": 10.7743, "step": 13020 }, { "epoch": 0.5782726158181748, "grad_norm": 78.5164566040039, "learning_rate": 9.977411241608529e-06, "loss": 10.7199, "step": 13030 }, { "epoch": 0.5787164167512662, "grad_norm": 74.05512237548828, "learning_rate": 9.9773939056466e-06, "loss": 11.3993, "step": 13040 }, { "epoch": 0.5791602176843577, "grad_norm": 71.45179748535156, "learning_rate": 9.977376569684673e-06, "loss": 10.6827, "step": 13050 }, { "epoch": 0.5796040186174491, "grad_norm": 69.38019561767578, "learning_rate": 9.977359233722746e-06, "loss": 11.3264, "step": 13060 }, { "epoch": 0.5800478195505406, "grad_norm": 81.93159484863281, "learning_rate": 9.977341897760818e-06, "loss": 10.2077, "step": 13070 }, { "epoch": 0.5804916204836321, "grad_norm": 79.02632141113281, "learning_rate": 9.97732456179889e-06, "loss": 10.7618, "step": 13080 }, { "epoch": 0.5809354214167235, "grad_norm": 83.42288970947266, "learning_rate": 9.977307225836964e-06, "loss": 10.335, "step": 13090 }, { "epoch": 0.581379222349815, "grad_norm": 86.47683715820312, "learning_rate": 9.977289889875035e-06, "loss": 10.56, "step": 13100 }, { "epoch": 0.5818230232829065, "grad_norm": 87.43464660644531, "learning_rate": 9.977272553913108e-06, "loss": 11.0672, "step": 13110 }, { "epoch": 0.582266824215998, "grad_norm": 79.6264877319336, "learning_rate": 9.977255217951181e-06, "loss": 11.4854, "step": 13120 }, { "epoch": 0.5827106251490893, "grad_norm": 68.71961212158203, "learning_rate": 9.977237881989253e-06, "loss": 10.6174, "step": 13130 }, { "epoch": 0.5831544260821808, "grad_norm": 64.4797592163086, "learning_rate": 9.977220546027326e-06, "loss": 10.2814, "step": 13140 }, { "epoch": 0.5835982270152723, "grad_norm": 77.29915618896484, "learning_rate": 9.977203210065399e-06, "loss": 11.197, "step": 13150 }, { "epoch": 0.5840420279483638, "grad_norm": 71.70263671875, "learning_rate": 9.97718587410347e-06, "loss": 10.7686, "step": 13160 }, { "epoch": 0.5844858288814552, "grad_norm": 78.5580062866211, "learning_rate": 9.977168538141543e-06, "loss": 10.7684, "step": 13170 }, { "epoch": 0.5849296298145467, "grad_norm": 87.69745635986328, "learning_rate": 9.977151202179616e-06, "loss": 11.1967, "step": 13180 }, { "epoch": 0.5853734307476381, "grad_norm": 79.14582824707031, "learning_rate": 9.977133866217688e-06, "loss": 10.4763, "step": 13190 }, { "epoch": 0.5858172316807296, "grad_norm": 87.4928207397461, "learning_rate": 9.977116530255761e-06, "loss": 10.9238, "step": 13200 }, { "epoch": 0.5862610326138211, "grad_norm": 76.60111999511719, "learning_rate": 9.977099194293834e-06, "loss": 10.7317, "step": 13210 }, { "epoch": 0.5867048335469125, "grad_norm": 80.5156478881836, "learning_rate": 9.977081858331907e-06, "loss": 10.0798, "step": 13220 }, { "epoch": 0.587148634480004, "grad_norm": 76.1066665649414, "learning_rate": 9.977064522369978e-06, "loss": 10.5962, "step": 13230 }, { "epoch": 0.5875924354130955, "grad_norm": 76.55358123779297, "learning_rate": 9.977047186408052e-06, "loss": 10.4487, "step": 13240 }, { "epoch": 0.588036236346187, "grad_norm": 79.84068298339844, "learning_rate": 9.977029850446125e-06, "loss": 10.9349, "step": 13250 }, { "epoch": 0.5884800372792783, "grad_norm": 67.1181411743164, "learning_rate": 9.977012514484196e-06, "loss": 10.5641, "step": 13260 }, { "epoch": 0.5889238382123698, "grad_norm": 75.07349395751953, "learning_rate": 9.976995178522269e-06, "loss": 10.9706, "step": 13270 }, { "epoch": 0.5893676391454613, "grad_norm": 70.72966003417969, "learning_rate": 9.976977842560342e-06, "loss": 10.4643, "step": 13280 }, { "epoch": 0.5898114400785528, "grad_norm": 73.792724609375, "learning_rate": 9.976960506598414e-06, "loss": 10.8425, "step": 13290 }, { "epoch": 0.5902552410116442, "grad_norm": 67.8843994140625, "learning_rate": 9.976943170636487e-06, "loss": 10.7675, "step": 13300 }, { "epoch": 0.5906990419447357, "grad_norm": 64.45490264892578, "learning_rate": 9.97692583467456e-06, "loss": 9.6526, "step": 13310 }, { "epoch": 0.5911428428778271, "grad_norm": 83.5705337524414, "learning_rate": 9.976908498712631e-06, "loss": 11.1669, "step": 13320 }, { "epoch": 0.5915866438109186, "grad_norm": 71.82415008544922, "learning_rate": 9.976891162750704e-06, "loss": 10.5378, "step": 13330 }, { "epoch": 0.5920304447440101, "grad_norm": 80.347900390625, "learning_rate": 9.976873826788777e-06, "loss": 10.9272, "step": 13340 }, { "epoch": 0.5924742456771015, "grad_norm": 85.71224212646484, "learning_rate": 9.976856490826849e-06, "loss": 10.9012, "step": 13350 }, { "epoch": 0.592918046610193, "grad_norm": 79.83573150634766, "learning_rate": 9.976839154864922e-06, "loss": 10.5738, "step": 13360 }, { "epoch": 0.5933618475432845, "grad_norm": 74.77920532226562, "learning_rate": 9.976821818902995e-06, "loss": 10.9535, "step": 13370 }, { "epoch": 0.593805648476376, "grad_norm": 78.11053466796875, "learning_rate": 9.976804482941068e-06, "loss": 10.6037, "step": 13380 }, { "epoch": 0.5942494494094673, "grad_norm": 74.39498901367188, "learning_rate": 9.97678714697914e-06, "loss": 10.7146, "step": 13390 }, { "epoch": 0.5946932503425588, "grad_norm": 60.10570526123047, "learning_rate": 9.976769811017212e-06, "loss": 10.5725, "step": 13400 }, { "epoch": 0.5951370512756503, "grad_norm": 73.86632537841797, "learning_rate": 9.976752475055285e-06, "loss": 11.0233, "step": 13410 }, { "epoch": 0.5955808522087418, "grad_norm": 88.34583282470703, "learning_rate": 9.976735139093357e-06, "loss": 10.9409, "step": 13420 }, { "epoch": 0.5960246531418333, "grad_norm": 79.4537353515625, "learning_rate": 9.97671780313143e-06, "loss": 10.3377, "step": 13430 }, { "epoch": 0.5964684540749247, "grad_norm": 78.50579833984375, "learning_rate": 9.976700467169503e-06, "loss": 10.7454, "step": 13440 }, { "epoch": 0.5969122550080161, "grad_norm": 86.72323608398438, "learning_rate": 9.976683131207574e-06, "loss": 11.0431, "step": 13450 }, { "epoch": 0.5973560559411076, "grad_norm": 74.95063781738281, "learning_rate": 9.976665795245647e-06, "loss": 10.871, "step": 13460 }, { "epoch": 0.5977998568741991, "grad_norm": 81.35505676269531, "learning_rate": 9.97664845928372e-06, "loss": 10.8405, "step": 13470 }, { "epoch": 0.5982436578072905, "grad_norm": 76.54270935058594, "learning_rate": 9.976631123321794e-06, "loss": 10.7319, "step": 13480 }, { "epoch": 0.598687458740382, "grad_norm": 85.25194549560547, "learning_rate": 9.976613787359865e-06, "loss": 10.6786, "step": 13490 }, { "epoch": 0.5991312596734735, "grad_norm": 71.38860321044922, "learning_rate": 9.976596451397938e-06, "loss": 10.5208, "step": 13500 }, { "epoch": 0.599575060606565, "grad_norm": 96.68553924560547, "learning_rate": 9.976579115436011e-06, "loss": 11.0773, "step": 13510 }, { "epoch": 0.6000188615396563, "grad_norm": 84.14801025390625, "learning_rate": 9.976561779474083e-06, "loss": 10.6248, "step": 13520 }, { "epoch": 0.6004626624727478, "grad_norm": 79.10609436035156, "learning_rate": 9.976544443512156e-06, "loss": 10.942, "step": 13530 }, { "epoch": 0.6009064634058393, "grad_norm": 70.84247589111328, "learning_rate": 9.976527107550229e-06, "loss": 11.0524, "step": 13540 }, { "epoch": 0.6013502643389308, "grad_norm": 73.7003402709961, "learning_rate": 9.9765097715883e-06, "loss": 10.8864, "step": 13550 }, { "epoch": 0.6017940652720223, "grad_norm": 83.23018646240234, "learning_rate": 9.976492435626373e-06, "loss": 10.9026, "step": 13560 }, { "epoch": 0.6022378662051137, "grad_norm": 76.42780303955078, "learning_rate": 9.976475099664446e-06, "loss": 10.7181, "step": 13570 }, { "epoch": 0.6026816671382051, "grad_norm": 75.6198501586914, "learning_rate": 9.976457763702518e-06, "loss": 11.1458, "step": 13580 }, { "epoch": 0.6031254680712966, "grad_norm": 88.8167495727539, "learning_rate": 9.97644042774059e-06, "loss": 10.8837, "step": 13590 }, { "epoch": 0.6035692690043881, "grad_norm": 71.14923858642578, "learning_rate": 9.976423091778664e-06, "loss": 10.8807, "step": 13600 }, { "epoch": 0.6040130699374795, "grad_norm": 98.20439147949219, "learning_rate": 9.976405755816737e-06, "loss": 11.4224, "step": 13610 }, { "epoch": 0.604456870870571, "grad_norm": 75.78369140625, "learning_rate": 9.976388419854808e-06, "loss": 10.2893, "step": 13620 }, { "epoch": 0.6049006718036625, "grad_norm": 79.95215606689453, "learning_rate": 9.976371083892881e-06, "loss": 10.8222, "step": 13630 }, { "epoch": 0.605344472736754, "grad_norm": 83.35610961914062, "learning_rate": 9.976353747930954e-06, "loss": 10.7412, "step": 13640 }, { "epoch": 0.6057882736698454, "grad_norm": 75.5788345336914, "learning_rate": 9.976336411969026e-06, "loss": 10.5057, "step": 13650 }, { "epoch": 0.6062320746029368, "grad_norm": 72.3238296508789, "learning_rate": 9.976319076007099e-06, "loss": 11.2517, "step": 13660 }, { "epoch": 0.6066758755360283, "grad_norm": 75.04680633544922, "learning_rate": 9.976301740045172e-06, "loss": 10.5344, "step": 13670 }, { "epoch": 0.6071196764691198, "grad_norm": 63.219268798828125, "learning_rate": 9.976284404083243e-06, "loss": 10.9847, "step": 13680 }, { "epoch": 0.6075634774022113, "grad_norm": 87.58104705810547, "learning_rate": 9.976267068121316e-06, "loss": 11.4142, "step": 13690 }, { "epoch": 0.6080072783353027, "grad_norm": 70.27639770507812, "learning_rate": 9.97624973215939e-06, "loss": 10.9125, "step": 13700 }, { "epoch": 0.6084510792683941, "grad_norm": 75.89519500732422, "learning_rate": 9.97623239619746e-06, "loss": 10.8769, "step": 13710 }, { "epoch": 0.6088948802014856, "grad_norm": 78.69032287597656, "learning_rate": 9.976215060235534e-06, "loss": 10.4481, "step": 13720 }, { "epoch": 0.6093386811345771, "grad_norm": 73.8470230102539, "learning_rate": 9.976197724273607e-06, "loss": 10.5418, "step": 13730 }, { "epoch": 0.6097824820676685, "grad_norm": 74.75282287597656, "learning_rate": 9.97618038831168e-06, "loss": 10.6919, "step": 13740 }, { "epoch": 0.61022628300076, "grad_norm": 82.48995971679688, "learning_rate": 9.976163052349751e-06, "loss": 11.2039, "step": 13750 }, { "epoch": 0.6106700839338515, "grad_norm": 86.70307159423828, "learning_rate": 9.976145716387825e-06, "loss": 10.645, "step": 13760 }, { "epoch": 0.611113884866943, "grad_norm": 87.02120208740234, "learning_rate": 9.976128380425898e-06, "loss": 10.8805, "step": 13770 }, { "epoch": 0.6115576858000344, "grad_norm": 67.70435333251953, "learning_rate": 9.976111044463969e-06, "loss": 10.7013, "step": 13780 }, { "epoch": 0.6120014867331258, "grad_norm": 78.09877014160156, "learning_rate": 9.976093708502042e-06, "loss": 10.7801, "step": 13790 }, { "epoch": 0.6124452876662173, "grad_norm": 85.01838684082031, "learning_rate": 9.976076372540115e-06, "loss": 10.6342, "step": 13800 }, { "epoch": 0.6128890885993088, "grad_norm": 83.46656799316406, "learning_rate": 9.976059036578187e-06, "loss": 10.5866, "step": 13810 }, { "epoch": 0.6133328895324003, "grad_norm": 81.93741607666016, "learning_rate": 9.97604170061626e-06, "loss": 10.4993, "step": 13820 }, { "epoch": 0.6137766904654917, "grad_norm": 60.87858963012695, "learning_rate": 9.976024364654333e-06, "loss": 10.246, "step": 13830 }, { "epoch": 0.6142204913985831, "grad_norm": 85.0611343383789, "learning_rate": 9.976007028692404e-06, "loss": 11.0282, "step": 13840 }, { "epoch": 0.6146642923316746, "grad_norm": 65.17115783691406, "learning_rate": 9.975989692730477e-06, "loss": 10.2572, "step": 13850 }, { "epoch": 0.6151080932647661, "grad_norm": 67.50042724609375, "learning_rate": 9.97597235676855e-06, "loss": 10.2284, "step": 13860 }, { "epoch": 0.6155518941978575, "grad_norm": 89.74058532714844, "learning_rate": 9.975955020806623e-06, "loss": 10.8248, "step": 13870 }, { "epoch": 0.615995695130949, "grad_norm": 86.69886779785156, "learning_rate": 9.975937684844695e-06, "loss": 10.8384, "step": 13880 }, { "epoch": 0.6164394960640405, "grad_norm": 68.96355438232422, "learning_rate": 9.975920348882768e-06, "loss": 10.2142, "step": 13890 }, { "epoch": 0.616883296997132, "grad_norm": 91.22903442382812, "learning_rate": 9.97590301292084e-06, "loss": 11.3292, "step": 13900 }, { "epoch": 0.6173270979302234, "grad_norm": 67.73802185058594, "learning_rate": 9.975885676958912e-06, "loss": 10.8209, "step": 13910 }, { "epoch": 0.6177708988633148, "grad_norm": 80.69165802001953, "learning_rate": 9.975868340996985e-06, "loss": 10.6029, "step": 13920 }, { "epoch": 0.6182146997964063, "grad_norm": 73.6860580444336, "learning_rate": 9.975851005035058e-06, "loss": 10.3577, "step": 13930 }, { "epoch": 0.6186585007294978, "grad_norm": 69.81961822509766, "learning_rate": 9.97583366907313e-06, "loss": 10.3961, "step": 13940 }, { "epoch": 0.6191023016625893, "grad_norm": 76.92399597167969, "learning_rate": 9.975816333111203e-06, "loss": 11.1469, "step": 13950 }, { "epoch": 0.6195461025956807, "grad_norm": 72.5120849609375, "learning_rate": 9.975798997149276e-06, "loss": 10.7128, "step": 13960 }, { "epoch": 0.6199899035287721, "grad_norm": 70.09220123291016, "learning_rate": 9.975781661187347e-06, "loss": 10.5301, "step": 13970 }, { "epoch": 0.6204337044618636, "grad_norm": 67.22700500488281, "learning_rate": 9.97576432522542e-06, "loss": 10.1562, "step": 13980 }, { "epoch": 0.6208775053949551, "grad_norm": 69.6814956665039, "learning_rate": 9.975746989263493e-06, "loss": 11.2697, "step": 13990 }, { "epoch": 0.6213213063280466, "grad_norm": 81.13983917236328, "learning_rate": 9.975729653301567e-06, "loss": 10.9927, "step": 14000 }, { "epoch": 0.6213213063280466, "eval_loss": 0.3354085683822632, "eval_runtime": 673.6456, "eval_samples_per_second": 1802.715, "eval_steps_per_second": 56.335, "step": 14000 }, { "epoch": 0.621765107261138, "grad_norm": 73.27105712890625, "learning_rate": 9.975712317339638e-06, "loss": 10.5364, "step": 14010 }, { "epoch": 0.6222089081942295, "grad_norm": 74.4351577758789, "learning_rate": 9.975694981377711e-06, "loss": 10.6971, "step": 14020 }, { "epoch": 0.622652709127321, "grad_norm": 75.62439727783203, "learning_rate": 9.975677645415784e-06, "loss": 10.709, "step": 14030 }, { "epoch": 0.6230965100604124, "grad_norm": 71.058349609375, "learning_rate": 9.975660309453855e-06, "loss": 10.7634, "step": 14040 }, { "epoch": 0.6235403109935038, "grad_norm": 85.23941802978516, "learning_rate": 9.975642973491929e-06, "loss": 11.1897, "step": 14050 }, { "epoch": 0.6239841119265953, "grad_norm": 78.38990020751953, "learning_rate": 9.975625637530002e-06, "loss": 10.9573, "step": 14060 }, { "epoch": 0.6244279128596868, "grad_norm": 73.34696197509766, "learning_rate": 9.975608301568073e-06, "loss": 11.1118, "step": 14070 }, { "epoch": 0.6248717137927783, "grad_norm": 65.3375244140625, "learning_rate": 9.975590965606146e-06, "loss": 10.7498, "step": 14080 }, { "epoch": 0.6253155147258697, "grad_norm": 89.46272277832031, "learning_rate": 9.97557362964422e-06, "loss": 10.7714, "step": 14090 }, { "epoch": 0.6257593156589611, "grad_norm": 73.509521484375, "learning_rate": 9.975556293682292e-06, "loss": 10.5213, "step": 14100 }, { "epoch": 0.6262031165920526, "grad_norm": 58.2670783996582, "learning_rate": 9.975538957720364e-06, "loss": 10.8304, "step": 14110 }, { "epoch": 0.6266469175251441, "grad_norm": 71.2006607055664, "learning_rate": 9.975521621758437e-06, "loss": 10.727, "step": 14120 }, { "epoch": 0.6270907184582356, "grad_norm": 82.89636993408203, "learning_rate": 9.97550428579651e-06, "loss": 10.5115, "step": 14130 }, { "epoch": 0.627534519391327, "grad_norm": 69.2073974609375, "learning_rate": 9.975486949834581e-06, "loss": 10.3403, "step": 14140 }, { "epoch": 0.6279783203244185, "grad_norm": 71.10247039794922, "learning_rate": 9.975469613872654e-06, "loss": 11.1554, "step": 14150 }, { "epoch": 0.62842212125751, "grad_norm": 70.13909912109375, "learning_rate": 9.975452277910727e-06, "loss": 10.4193, "step": 14160 }, { "epoch": 0.6288659221906014, "grad_norm": 81.4350814819336, "learning_rate": 9.975434941948799e-06, "loss": 10.6163, "step": 14170 }, { "epoch": 0.6293097231236928, "grad_norm": 71.56867980957031, "learning_rate": 9.975417605986872e-06, "loss": 10.7015, "step": 14180 }, { "epoch": 0.6297535240567843, "grad_norm": 81.04633331298828, "learning_rate": 9.975400270024945e-06, "loss": 10.8405, "step": 14190 }, { "epoch": 0.6301973249898758, "grad_norm": 70.49227905273438, "learning_rate": 9.975382934063016e-06, "loss": 9.9988, "step": 14200 }, { "epoch": 0.6306411259229673, "grad_norm": 74.22464752197266, "learning_rate": 9.97536559810109e-06, "loss": 10.6188, "step": 14210 }, { "epoch": 0.6310849268560588, "grad_norm": 71.94355773925781, "learning_rate": 9.975348262139162e-06, "loss": 11.1014, "step": 14220 }, { "epoch": 0.6315287277891501, "grad_norm": 77.49762725830078, "learning_rate": 9.975330926177234e-06, "loss": 10.8575, "step": 14230 }, { "epoch": 0.6319725287222416, "grad_norm": 60.8555793762207, "learning_rate": 9.975313590215307e-06, "loss": 10.5589, "step": 14240 }, { "epoch": 0.6324163296553331, "grad_norm": 75.85652160644531, "learning_rate": 9.97529625425338e-06, "loss": 10.7895, "step": 14250 }, { "epoch": 0.6328601305884246, "grad_norm": 78.68632507324219, "learning_rate": 9.975278918291451e-06, "loss": 10.5087, "step": 14260 }, { "epoch": 0.633303931521516, "grad_norm": 67.67071533203125, "learning_rate": 9.975261582329524e-06, "loss": 10.6245, "step": 14270 }, { "epoch": 0.6337477324546075, "grad_norm": 71.56282806396484, "learning_rate": 9.975244246367597e-06, "loss": 10.2392, "step": 14280 }, { "epoch": 0.634191533387699, "grad_norm": 88.3572769165039, "learning_rate": 9.975226910405669e-06, "loss": 10.4799, "step": 14290 }, { "epoch": 0.6346353343207904, "grad_norm": 82.04935455322266, "learning_rate": 9.975209574443742e-06, "loss": 10.4068, "step": 14300 }, { "epoch": 0.6350791352538818, "grad_norm": 84.6961441040039, "learning_rate": 9.975192238481815e-06, "loss": 11.09, "step": 14310 }, { "epoch": 0.6355229361869733, "grad_norm": 71.76660919189453, "learning_rate": 9.975174902519888e-06, "loss": 10.3988, "step": 14320 }, { "epoch": 0.6359667371200648, "grad_norm": 80.33756256103516, "learning_rate": 9.97515756655796e-06, "loss": 11.2503, "step": 14330 }, { "epoch": 0.6364105380531563, "grad_norm": 62.710575103759766, "learning_rate": 9.975140230596033e-06, "loss": 10.4881, "step": 14340 }, { "epoch": 0.6368543389862478, "grad_norm": 71.74394989013672, "learning_rate": 9.975122894634106e-06, "loss": 10.9982, "step": 14350 }, { "epoch": 0.6372981399193391, "grad_norm": 68.05677795410156, "learning_rate": 9.975105558672177e-06, "loss": 10.6098, "step": 14360 }, { "epoch": 0.6377419408524306, "grad_norm": 78.377197265625, "learning_rate": 9.97508822271025e-06, "loss": 10.4365, "step": 14370 }, { "epoch": 0.6381857417855221, "grad_norm": 70.45510864257812, "learning_rate": 9.975070886748323e-06, "loss": 10.625, "step": 14380 }, { "epoch": 0.6386295427186136, "grad_norm": 73.3565444946289, "learning_rate": 9.975053550786395e-06, "loss": 11.1326, "step": 14390 }, { "epoch": 0.639073343651705, "grad_norm": 62.82444763183594, "learning_rate": 9.975036214824468e-06, "loss": 10.5034, "step": 14400 }, { "epoch": 0.6395171445847965, "grad_norm": 69.73179626464844, "learning_rate": 9.97501887886254e-06, "loss": 11.1584, "step": 14410 }, { "epoch": 0.639960945517888, "grad_norm": 71.01941680908203, "learning_rate": 9.975001542900612e-06, "loss": 10.6757, "step": 14420 }, { "epoch": 0.6404047464509794, "grad_norm": 66.62494659423828, "learning_rate": 9.974984206938685e-06, "loss": 10.9316, "step": 14430 }, { "epoch": 0.6408485473840708, "grad_norm": 70.99148559570312, "learning_rate": 9.974966870976758e-06, "loss": 10.5745, "step": 14440 }, { "epoch": 0.6412923483171623, "grad_norm": 68.47904968261719, "learning_rate": 9.97494953501483e-06, "loss": 10.7873, "step": 14450 }, { "epoch": 0.6417361492502538, "grad_norm": 78.76315307617188, "learning_rate": 9.974932199052903e-06, "loss": 10.8907, "step": 14460 }, { "epoch": 0.6421799501833453, "grad_norm": 69.0668716430664, "learning_rate": 9.974914863090976e-06, "loss": 10.9457, "step": 14470 }, { "epoch": 0.6426237511164368, "grad_norm": 70.29750061035156, "learning_rate": 9.974897527129047e-06, "loss": 10.9257, "step": 14480 }, { "epoch": 0.6430675520495281, "grad_norm": 66.38678741455078, "learning_rate": 9.97488019116712e-06, "loss": 10.2022, "step": 14490 }, { "epoch": 0.6435113529826196, "grad_norm": 66.664306640625, "learning_rate": 9.974862855205193e-06, "loss": 10.2297, "step": 14500 }, { "epoch": 0.6439551539157111, "grad_norm": 75.4664306640625, "learning_rate": 9.974845519243266e-06, "loss": 10.8164, "step": 14510 }, { "epoch": 0.6443989548488026, "grad_norm": 68.78631591796875, "learning_rate": 9.974828183281338e-06, "loss": 10.6404, "step": 14520 }, { "epoch": 0.644842755781894, "grad_norm": 79.80715942382812, "learning_rate": 9.974810847319411e-06, "loss": 10.6104, "step": 14530 }, { "epoch": 0.6452865567149855, "grad_norm": 66.43132019042969, "learning_rate": 9.974793511357484e-06, "loss": 10.3112, "step": 14540 }, { "epoch": 0.645730357648077, "grad_norm": 67.95149993896484, "learning_rate": 9.974776175395555e-06, "loss": 10.6031, "step": 14550 }, { "epoch": 0.6461741585811684, "grad_norm": 79.79896545410156, "learning_rate": 9.974758839433628e-06, "loss": 10.501, "step": 14560 }, { "epoch": 0.6466179595142599, "grad_norm": 65.96994018554688, "learning_rate": 9.974741503471701e-06, "loss": 10.2683, "step": 14570 }, { "epoch": 0.6470617604473513, "grad_norm": 76.4495849609375, "learning_rate": 9.974724167509773e-06, "loss": 10.6038, "step": 14580 }, { "epoch": 0.6475055613804428, "grad_norm": 69.33879089355469, "learning_rate": 9.974706831547846e-06, "loss": 10.2104, "step": 14590 }, { "epoch": 0.6479493623135343, "grad_norm": 68.40853881835938, "learning_rate": 9.974689495585919e-06, "loss": 10.7651, "step": 14600 }, { "epoch": 0.6483931632466258, "grad_norm": 71.53189849853516, "learning_rate": 9.97467215962399e-06, "loss": 10.6313, "step": 14610 }, { "epoch": 0.6488369641797171, "grad_norm": 73.11844635009766, "learning_rate": 9.974654823662064e-06, "loss": 11.0404, "step": 14620 }, { "epoch": 0.6492807651128086, "grad_norm": 72.54490661621094, "learning_rate": 9.974637487700137e-06, "loss": 10.0347, "step": 14630 }, { "epoch": 0.6497245660459001, "grad_norm": 77.51940155029297, "learning_rate": 9.974620151738208e-06, "loss": 10.875, "step": 14640 }, { "epoch": 0.6501683669789916, "grad_norm": 75.68498229980469, "learning_rate": 9.974602815776281e-06, "loss": 10.7511, "step": 14650 }, { "epoch": 0.650612167912083, "grad_norm": 85.01661682128906, "learning_rate": 9.974585479814354e-06, "loss": 11.0653, "step": 14660 }, { "epoch": 0.6510559688451745, "grad_norm": 71.71186065673828, "learning_rate": 9.974568143852426e-06, "loss": 10.1264, "step": 14670 }, { "epoch": 0.651499769778266, "grad_norm": 85.85321807861328, "learning_rate": 9.974550807890499e-06, "loss": 11.0676, "step": 14680 }, { "epoch": 0.6519435707113574, "grad_norm": 73.28590393066406, "learning_rate": 9.974533471928572e-06, "loss": 10.6028, "step": 14690 }, { "epoch": 0.6523873716444489, "grad_norm": 66.07901000976562, "learning_rate": 9.974516135966643e-06, "loss": 10.6085, "step": 14700 }, { "epoch": 0.6528311725775403, "grad_norm": 79.432861328125, "learning_rate": 9.974498800004716e-06, "loss": 10.8179, "step": 14710 }, { "epoch": 0.6532749735106318, "grad_norm": 64.7619857788086, "learning_rate": 9.97448146404279e-06, "loss": 10.6211, "step": 14720 }, { "epoch": 0.6537187744437233, "grad_norm": 79.81686401367188, "learning_rate": 9.974464128080862e-06, "loss": 10.3651, "step": 14730 }, { "epoch": 0.6541625753768148, "grad_norm": 80.43688201904297, "learning_rate": 9.974446792118934e-06, "loss": 10.7854, "step": 14740 }, { "epoch": 0.6546063763099061, "grad_norm": 78.90631866455078, "learning_rate": 9.974429456157007e-06, "loss": 10.7801, "step": 14750 }, { "epoch": 0.6550501772429976, "grad_norm": 82.28389739990234, "learning_rate": 9.97441212019508e-06, "loss": 10.6682, "step": 14760 }, { "epoch": 0.6554939781760891, "grad_norm": 71.01078033447266, "learning_rate": 9.974394784233151e-06, "loss": 11.0055, "step": 14770 }, { "epoch": 0.6559377791091806, "grad_norm": 74.68998718261719, "learning_rate": 9.974377448271224e-06, "loss": 10.3595, "step": 14780 }, { "epoch": 0.656381580042272, "grad_norm": 86.77886199951172, "learning_rate": 9.974360112309297e-06, "loss": 10.404, "step": 14790 }, { "epoch": 0.6568253809753635, "grad_norm": 72.12997436523438, "learning_rate": 9.974342776347369e-06, "loss": 10.5215, "step": 14800 }, { "epoch": 0.657269181908455, "grad_norm": 82.83440399169922, "learning_rate": 9.974325440385442e-06, "loss": 11.087, "step": 14810 }, { "epoch": 0.6577129828415464, "grad_norm": 62.340938568115234, "learning_rate": 9.974308104423515e-06, "loss": 10.3713, "step": 14820 }, { "epoch": 0.6581567837746379, "grad_norm": 65.31613159179688, "learning_rate": 9.974290768461586e-06, "loss": 10.6538, "step": 14830 }, { "epoch": 0.6586005847077293, "grad_norm": 69.7147445678711, "learning_rate": 9.97427343249966e-06, "loss": 10.8514, "step": 14840 }, { "epoch": 0.6590443856408208, "grad_norm": 63.39995574951172, "learning_rate": 9.974256096537732e-06, "loss": 10.4676, "step": 14850 }, { "epoch": 0.6594881865739123, "grad_norm": 74.32138061523438, "learning_rate": 9.974238760575804e-06, "loss": 10.4392, "step": 14860 }, { "epoch": 0.6599319875070038, "grad_norm": 86.13739776611328, "learning_rate": 9.974221424613877e-06, "loss": 10.5601, "step": 14870 }, { "epoch": 0.6603757884400951, "grad_norm": 71.16339874267578, "learning_rate": 9.97420408865195e-06, "loss": 11.0757, "step": 14880 }, { "epoch": 0.6608195893731866, "grad_norm": 85.1015625, "learning_rate": 9.974186752690021e-06, "loss": 11.1856, "step": 14890 }, { "epoch": 0.6612633903062781, "grad_norm": 64.8528823852539, "learning_rate": 9.974169416728094e-06, "loss": 11.0716, "step": 14900 }, { "epoch": 0.6617071912393696, "grad_norm": 81.894775390625, "learning_rate": 9.974152080766168e-06, "loss": 10.3157, "step": 14910 }, { "epoch": 0.6621509921724611, "grad_norm": 77.81661224365234, "learning_rate": 9.974134744804239e-06, "loss": 10.4957, "step": 14920 }, { "epoch": 0.6625947931055525, "grad_norm": 73.48043823242188, "learning_rate": 9.974117408842312e-06, "loss": 10.9361, "step": 14930 }, { "epoch": 0.663038594038644, "grad_norm": 67.01435089111328, "learning_rate": 9.974100072880385e-06, "loss": 10.4762, "step": 14940 }, { "epoch": 0.6634823949717354, "grad_norm": 76.67558288574219, "learning_rate": 9.974082736918458e-06, "loss": 10.5872, "step": 14950 }, { "epoch": 0.6639261959048269, "grad_norm": 69.8974838256836, "learning_rate": 9.97406540095653e-06, "loss": 10.6485, "step": 14960 }, { "epoch": 0.6643699968379183, "grad_norm": 77.30745697021484, "learning_rate": 9.974048064994603e-06, "loss": 10.601, "step": 14970 }, { "epoch": 0.6648137977710098, "grad_norm": 60.251041412353516, "learning_rate": 9.974030729032676e-06, "loss": 10.1426, "step": 14980 }, { "epoch": 0.6652575987041013, "grad_norm": 80.3484878540039, "learning_rate": 9.974013393070747e-06, "loss": 10.6465, "step": 14990 }, { "epoch": 0.6657013996371928, "grad_norm": 67.18524932861328, "learning_rate": 9.97399605710882e-06, "loss": 10.8912, "step": 15000 }, { "epoch": 0.6657013996371928, "eval_loss": 0.33302244544029236, "eval_runtime": 671.9227, "eval_samples_per_second": 1807.337, "eval_steps_per_second": 56.48, "step": 15000 }, { "epoch": 0.6661452005702841, "grad_norm": 70.91802215576172, "learning_rate": 9.973978721146893e-06, "loss": 10.7977, "step": 15010 }, { "epoch": 0.6665890015033756, "grad_norm": 73.55622100830078, "learning_rate": 9.973961385184965e-06, "loss": 10.6874, "step": 15020 }, { "epoch": 0.6670328024364671, "grad_norm": 83.61444854736328, "learning_rate": 9.973944049223038e-06, "loss": 10.7632, "step": 15030 }, { "epoch": 0.6674766033695586, "grad_norm": 67.87864685058594, "learning_rate": 9.97392671326111e-06, "loss": 10.6371, "step": 15040 }, { "epoch": 0.6679204043026501, "grad_norm": 67.57317352294922, "learning_rate": 9.973909377299182e-06, "loss": 10.4995, "step": 15050 }, { "epoch": 0.6683642052357415, "grad_norm": 75.98468780517578, "learning_rate": 9.973892041337255e-06, "loss": 10.5515, "step": 15060 }, { "epoch": 0.668808006168833, "grad_norm": 70.60570526123047, "learning_rate": 9.973874705375328e-06, "loss": 10.7693, "step": 15070 }, { "epoch": 0.6692518071019244, "grad_norm": 64.22474670410156, "learning_rate": 9.9738573694134e-06, "loss": 10.5848, "step": 15080 }, { "epoch": 0.6696956080350159, "grad_norm": 70.5452880859375, "learning_rate": 9.973840033451473e-06, "loss": 10.1573, "step": 15090 }, { "epoch": 0.6701394089681073, "grad_norm": 65.60162353515625, "learning_rate": 9.973822697489546e-06, "loss": 10.7798, "step": 15100 }, { "epoch": 0.6705832099011988, "grad_norm": 66.75809478759766, "learning_rate": 9.973805361527617e-06, "loss": 10.576, "step": 15110 }, { "epoch": 0.6710270108342903, "grad_norm": 71.2154312133789, "learning_rate": 9.97378802556569e-06, "loss": 11.1499, "step": 15120 }, { "epoch": 0.6714708117673818, "grad_norm": 77.17507934570312, "learning_rate": 9.973770689603763e-06, "loss": 10.311, "step": 15130 }, { "epoch": 0.6719146127004733, "grad_norm": 76.38935852050781, "learning_rate": 9.973753353641835e-06, "loss": 10.6521, "step": 15140 }, { "epoch": 0.6723584136335646, "grad_norm": 65.29672241210938, "learning_rate": 9.973736017679908e-06, "loss": 10.6403, "step": 15150 }, { "epoch": 0.6728022145666561, "grad_norm": 63.178077697753906, "learning_rate": 9.973718681717981e-06, "loss": 10.3128, "step": 15160 }, { "epoch": 0.6732460154997476, "grad_norm": 65.84847259521484, "learning_rate": 9.973701345756054e-06, "loss": 10.3361, "step": 15170 }, { "epoch": 0.6736898164328391, "grad_norm": 72.8542251586914, "learning_rate": 9.973684009794125e-06, "loss": 10.3667, "step": 15180 }, { "epoch": 0.6741336173659305, "grad_norm": 72.04983520507812, "learning_rate": 9.973666673832198e-06, "loss": 11.1101, "step": 15190 }, { "epoch": 0.674577418299022, "grad_norm": 63.43279266357422, "learning_rate": 9.973649337870272e-06, "loss": 10.7974, "step": 15200 }, { "epoch": 0.6750212192321134, "grad_norm": 76.92269897460938, "learning_rate": 9.973632001908343e-06, "loss": 10.4673, "step": 15210 }, { "epoch": 0.6754650201652049, "grad_norm": 78.26722717285156, "learning_rate": 9.973614665946416e-06, "loss": 10.9101, "step": 15220 }, { "epoch": 0.6759088210982963, "grad_norm": 67.4601058959961, "learning_rate": 9.973597329984489e-06, "loss": 10.7099, "step": 15230 }, { "epoch": 0.6763526220313878, "grad_norm": 67.75270080566406, "learning_rate": 9.97357999402256e-06, "loss": 10.8756, "step": 15240 }, { "epoch": 0.6767964229644793, "grad_norm": 65.31672668457031, "learning_rate": 9.973562658060634e-06, "loss": 10.5466, "step": 15250 }, { "epoch": 0.6772402238975708, "grad_norm": 84.87113952636719, "learning_rate": 9.973545322098707e-06, "loss": 10.5983, "step": 15260 }, { "epoch": 0.6776840248306623, "grad_norm": 64.18128204345703, "learning_rate": 9.973527986136778e-06, "loss": 10.241, "step": 15270 }, { "epoch": 0.6781278257637536, "grad_norm": 86.96048736572266, "learning_rate": 9.973510650174851e-06, "loss": 10.4494, "step": 15280 }, { "epoch": 0.6785716266968451, "grad_norm": 77.51726531982422, "learning_rate": 9.973493314212924e-06, "loss": 10.4231, "step": 15290 }, { "epoch": 0.6790154276299366, "grad_norm": 74.92723083496094, "learning_rate": 9.973475978250996e-06, "loss": 10.1194, "step": 15300 }, { "epoch": 0.6794592285630281, "grad_norm": 66.97340393066406, "learning_rate": 9.973458642289069e-06, "loss": 10.7708, "step": 15310 }, { "epoch": 0.6799030294961195, "grad_norm": 79.47786712646484, "learning_rate": 9.973441306327142e-06, "loss": 10.2036, "step": 15320 }, { "epoch": 0.680346830429211, "grad_norm": 85.9738540649414, "learning_rate": 9.973423970365213e-06, "loss": 10.7741, "step": 15330 }, { "epoch": 0.6807906313623024, "grad_norm": 83.63733673095703, "learning_rate": 9.973406634403286e-06, "loss": 10.5102, "step": 15340 }, { "epoch": 0.6812344322953939, "grad_norm": 68.50708770751953, "learning_rate": 9.97338929844136e-06, "loss": 10.3726, "step": 15350 }, { "epoch": 0.6816782332284853, "grad_norm": 74.40569305419922, "learning_rate": 9.97337196247943e-06, "loss": 11.1048, "step": 15360 }, { "epoch": 0.6821220341615768, "grad_norm": 81.3375473022461, "learning_rate": 9.973354626517504e-06, "loss": 9.9319, "step": 15370 }, { "epoch": 0.6825658350946683, "grad_norm": 74.2603988647461, "learning_rate": 9.973337290555577e-06, "loss": 10.8757, "step": 15380 }, { "epoch": 0.6830096360277598, "grad_norm": 71.74883270263672, "learning_rate": 9.97331995459365e-06, "loss": 10.64, "step": 15390 }, { "epoch": 0.6834534369608513, "grad_norm": 68.61750030517578, "learning_rate": 9.973302618631721e-06, "loss": 10.7962, "step": 15400 }, { "epoch": 0.6838972378939426, "grad_norm": 59.030921936035156, "learning_rate": 9.973285282669794e-06, "loss": 10.5287, "step": 15410 }, { "epoch": 0.6843410388270341, "grad_norm": 76.87126922607422, "learning_rate": 9.973267946707867e-06, "loss": 10.624, "step": 15420 }, { "epoch": 0.6847848397601256, "grad_norm": 78.29729461669922, "learning_rate": 9.973250610745939e-06, "loss": 10.6998, "step": 15430 }, { "epoch": 0.6852286406932171, "grad_norm": 70.97583770751953, "learning_rate": 9.973233274784012e-06, "loss": 10.6444, "step": 15440 }, { "epoch": 0.6856724416263085, "grad_norm": 65.24356842041016, "learning_rate": 9.973215938822085e-06, "loss": 10.6682, "step": 15450 }, { "epoch": 0.6861162425594, "grad_norm": 71.17062377929688, "learning_rate": 9.973198602860156e-06, "loss": 11.0228, "step": 15460 }, { "epoch": 0.6865600434924914, "grad_norm": 87.5817642211914, "learning_rate": 9.97318126689823e-06, "loss": 10.3963, "step": 15470 }, { "epoch": 0.6870038444255829, "grad_norm": 62.55752182006836, "learning_rate": 9.973163930936302e-06, "loss": 10.5304, "step": 15480 }, { "epoch": 0.6874476453586744, "grad_norm": 72.95471954345703, "learning_rate": 9.973146594974374e-06, "loss": 10.7116, "step": 15490 }, { "epoch": 0.6878914462917658, "grad_norm": 63.889129638671875, "learning_rate": 9.973129259012447e-06, "loss": 10.2496, "step": 15500 }, { "epoch": 0.6883352472248573, "grad_norm": 70.15679168701172, "learning_rate": 9.97311192305052e-06, "loss": 10.0643, "step": 15510 }, { "epoch": 0.6887790481579488, "grad_norm": 69.72803497314453, "learning_rate": 9.973094587088591e-06, "loss": 10.5465, "step": 15520 }, { "epoch": 0.6892228490910403, "grad_norm": 64.5792236328125, "learning_rate": 9.973077251126664e-06, "loss": 10.7322, "step": 15530 }, { "epoch": 0.6896666500241316, "grad_norm": 82.74343872070312, "learning_rate": 9.973059915164738e-06, "loss": 10.3258, "step": 15540 }, { "epoch": 0.6901104509572231, "grad_norm": 73.86137390136719, "learning_rate": 9.973042579202809e-06, "loss": 10.4541, "step": 15550 }, { "epoch": 0.6905542518903146, "grad_norm": 81.49348449707031, "learning_rate": 9.973025243240882e-06, "loss": 10.4592, "step": 15560 }, { "epoch": 0.6909980528234061, "grad_norm": 70.07816314697266, "learning_rate": 9.973007907278955e-06, "loss": 10.6212, "step": 15570 }, { "epoch": 0.6914418537564975, "grad_norm": 75.75015258789062, "learning_rate": 9.972990571317026e-06, "loss": 10.459, "step": 15580 }, { "epoch": 0.691885654689589, "grad_norm": 69.51868438720703, "learning_rate": 9.9729732353551e-06, "loss": 10.6566, "step": 15590 }, { "epoch": 0.6923294556226804, "grad_norm": 72.89574432373047, "learning_rate": 9.972955899393173e-06, "loss": 10.8038, "step": 15600 }, { "epoch": 0.6927732565557719, "grad_norm": 70.88813018798828, "learning_rate": 9.972938563431246e-06, "loss": 10.7221, "step": 15610 }, { "epoch": 0.6932170574888634, "grad_norm": 84.63404083251953, "learning_rate": 9.972921227469317e-06, "loss": 10.8532, "step": 15620 }, { "epoch": 0.6936608584219548, "grad_norm": 68.11902618408203, "learning_rate": 9.97290389150739e-06, "loss": 10.7919, "step": 15630 }, { "epoch": 0.6941046593550463, "grad_norm": 88.98064422607422, "learning_rate": 9.972886555545463e-06, "loss": 10.5843, "step": 15640 }, { "epoch": 0.6945484602881378, "grad_norm": 72.20332336425781, "learning_rate": 9.972869219583535e-06, "loss": 10.7185, "step": 15650 }, { "epoch": 0.6949922612212293, "grad_norm": 61.422576904296875, "learning_rate": 9.972851883621608e-06, "loss": 10.5118, "step": 15660 }, { "epoch": 0.6954360621543206, "grad_norm": 62.006317138671875, "learning_rate": 9.97283454765968e-06, "loss": 10.2841, "step": 15670 }, { "epoch": 0.6958798630874121, "grad_norm": 69.10589599609375, "learning_rate": 9.972817211697752e-06, "loss": 10.6618, "step": 15680 }, { "epoch": 0.6963236640205036, "grad_norm": 79.93278503417969, "learning_rate": 9.972799875735825e-06, "loss": 10.4638, "step": 15690 }, { "epoch": 0.6967674649535951, "grad_norm": 71.16869354248047, "learning_rate": 9.972782539773898e-06, "loss": 10.4969, "step": 15700 }, { "epoch": 0.6972112658866866, "grad_norm": 76.26361846923828, "learning_rate": 9.97276520381197e-06, "loss": 10.5915, "step": 15710 }, { "epoch": 0.697655066819778, "grad_norm": 80.85043334960938, "learning_rate": 9.972747867850043e-06, "loss": 11.0115, "step": 15720 }, { "epoch": 0.6980988677528694, "grad_norm": 65.5965576171875, "learning_rate": 9.972730531888116e-06, "loss": 10.756, "step": 15730 }, { "epoch": 0.6985426686859609, "grad_norm": 67.6102066040039, "learning_rate": 9.972713195926187e-06, "loss": 10.5331, "step": 15740 }, { "epoch": 0.6989864696190524, "grad_norm": 69.96884155273438, "learning_rate": 9.97269585996426e-06, "loss": 10.6435, "step": 15750 }, { "epoch": 0.6994302705521438, "grad_norm": 58.43931198120117, "learning_rate": 9.972678524002333e-06, "loss": 10.2629, "step": 15760 }, { "epoch": 0.6998740714852353, "grad_norm": 89.6247329711914, "learning_rate": 9.972661188040405e-06, "loss": 11.0823, "step": 15770 }, { "epoch": 0.7003178724183268, "grad_norm": 86.3462142944336, "learning_rate": 9.972643852078478e-06, "loss": 11.0013, "step": 15780 }, { "epoch": 0.7007616733514183, "grad_norm": 67.15304565429688, "learning_rate": 9.972626516116551e-06, "loss": 10.5305, "step": 15790 }, { "epoch": 0.7012054742845096, "grad_norm": 70.22341918945312, "learning_rate": 9.972609180154624e-06, "loss": 10.2173, "step": 15800 }, { "epoch": 0.7016492752176011, "grad_norm": 67.83938598632812, "learning_rate": 9.972591844192695e-06, "loss": 10.1845, "step": 15810 }, { "epoch": 0.7020930761506926, "grad_norm": 73.8240966796875, "learning_rate": 9.972574508230769e-06, "loss": 10.6668, "step": 15820 }, { "epoch": 0.7025368770837841, "grad_norm": 74.54997253417969, "learning_rate": 9.972557172268842e-06, "loss": 10.4764, "step": 15830 }, { "epoch": 0.7029806780168756, "grad_norm": 77.97396087646484, "learning_rate": 9.972539836306913e-06, "loss": 10.8094, "step": 15840 }, { "epoch": 0.703424478949967, "grad_norm": 69.54945373535156, "learning_rate": 9.972522500344986e-06, "loss": 10.1263, "step": 15850 }, { "epoch": 0.7038682798830584, "grad_norm": 74.77880859375, "learning_rate": 9.972505164383059e-06, "loss": 10.8418, "step": 15860 }, { "epoch": 0.7043120808161499, "grad_norm": 73.15492248535156, "learning_rate": 9.97248782842113e-06, "loss": 10.6459, "step": 15870 }, { "epoch": 0.7047558817492414, "grad_norm": 75.93856048583984, "learning_rate": 9.972470492459204e-06, "loss": 10.5095, "step": 15880 }, { "epoch": 0.7051996826823328, "grad_norm": 71.69056701660156, "learning_rate": 9.972453156497277e-06, "loss": 10.5275, "step": 15890 }, { "epoch": 0.7056434836154243, "grad_norm": 70.82780456542969, "learning_rate": 9.972435820535348e-06, "loss": 10.1364, "step": 15900 }, { "epoch": 0.7060872845485158, "grad_norm": 75.50528717041016, "learning_rate": 9.972418484573421e-06, "loss": 10.4329, "step": 15910 }, { "epoch": 0.7065310854816073, "grad_norm": 78.77973937988281, "learning_rate": 9.972401148611494e-06, "loss": 10.6305, "step": 15920 }, { "epoch": 0.7069748864146986, "grad_norm": 70.21820068359375, "learning_rate": 9.972383812649567e-06, "loss": 10.6905, "step": 15930 }, { "epoch": 0.7074186873477901, "grad_norm": 88.8916015625, "learning_rate": 9.972366476687639e-06, "loss": 10.3912, "step": 15940 }, { "epoch": 0.7078624882808816, "grad_norm": 84.31858825683594, "learning_rate": 9.972349140725712e-06, "loss": 10.5527, "step": 15950 }, { "epoch": 0.7083062892139731, "grad_norm": 71.62870788574219, "learning_rate": 9.972331804763785e-06, "loss": 10.6158, "step": 15960 }, { "epoch": 0.7087500901470646, "grad_norm": 64.43877410888672, "learning_rate": 9.972314468801856e-06, "loss": 10.8082, "step": 15970 }, { "epoch": 0.709193891080156, "grad_norm": 63.440391540527344, "learning_rate": 9.97229713283993e-06, "loss": 10.5228, "step": 15980 }, { "epoch": 0.7096376920132474, "grad_norm": 68.9405746459961, "learning_rate": 9.972279796878002e-06, "loss": 10.631, "step": 15990 }, { "epoch": 0.7100814929463389, "grad_norm": 78.99846649169922, "learning_rate": 9.972262460916074e-06, "loss": 10.7683, "step": 16000 }, { "epoch": 0.7100814929463389, "eval_loss": 0.3311347961425781, "eval_runtime": 678.7995, "eval_samples_per_second": 1789.028, "eval_steps_per_second": 55.908, "step": 16000 }, { "epoch": 0.7105252938794304, "grad_norm": 69.47785949707031, "learning_rate": 9.972245124954147e-06, "loss": 10.1387, "step": 16010 }, { "epoch": 0.7109690948125218, "grad_norm": 69.83346557617188, "learning_rate": 9.97222778899222e-06, "loss": 10.7619, "step": 16020 }, { "epoch": 0.7114128957456133, "grad_norm": 84.6226577758789, "learning_rate": 9.972210453030291e-06, "loss": 10.7781, "step": 16030 }, { "epoch": 0.7118566966787048, "grad_norm": 68.47029876708984, "learning_rate": 9.972193117068364e-06, "loss": 10.684, "step": 16040 }, { "epoch": 0.7123004976117963, "grad_norm": 78.78176879882812, "learning_rate": 9.972175781106437e-06, "loss": 10.7629, "step": 16050 }, { "epoch": 0.7127442985448877, "grad_norm": 70.59607696533203, "learning_rate": 9.97215844514451e-06, "loss": 10.7124, "step": 16060 }, { "epoch": 0.7131880994779791, "grad_norm": 69.93449401855469, "learning_rate": 9.972141109182582e-06, "loss": 11.4954, "step": 16070 }, { "epoch": 0.7136319004110706, "grad_norm": 67.49927520751953, "learning_rate": 9.972123773220655e-06, "loss": 10.9881, "step": 16080 }, { "epoch": 0.7140757013441621, "grad_norm": 66.40914916992188, "learning_rate": 9.972106437258728e-06, "loss": 10.8346, "step": 16090 }, { "epoch": 0.7145195022772536, "grad_norm": 66.59365844726562, "learning_rate": 9.9720891012968e-06, "loss": 10.703, "step": 16100 }, { "epoch": 0.714963303210345, "grad_norm": 61.00757598876953, "learning_rate": 9.972071765334873e-06, "loss": 10.3785, "step": 16110 }, { "epoch": 0.7154071041434364, "grad_norm": 78.33125305175781, "learning_rate": 9.972054429372946e-06, "loss": 10.8235, "step": 16120 }, { "epoch": 0.7158509050765279, "grad_norm": 63.80758285522461, "learning_rate": 9.972037093411017e-06, "loss": 10.8951, "step": 16130 }, { "epoch": 0.7162947060096194, "grad_norm": 76.70967864990234, "learning_rate": 9.97201975744909e-06, "loss": 10.9641, "step": 16140 }, { "epoch": 0.7167385069427108, "grad_norm": 71.60503387451172, "learning_rate": 9.972002421487163e-06, "loss": 10.5517, "step": 16150 }, { "epoch": 0.7171823078758023, "grad_norm": 74.2575912475586, "learning_rate": 9.971985085525235e-06, "loss": 10.161, "step": 16160 }, { "epoch": 0.7176261088088938, "grad_norm": 71.10789489746094, "learning_rate": 9.971967749563308e-06, "loss": 10.6625, "step": 16170 }, { "epoch": 0.7180699097419853, "grad_norm": 71.27813720703125, "learning_rate": 9.97195041360138e-06, "loss": 10.9964, "step": 16180 }, { "epoch": 0.7185137106750767, "grad_norm": 63.57979965209961, "learning_rate": 9.971933077639454e-06, "loss": 10.1936, "step": 16190 }, { "epoch": 0.7189575116081681, "grad_norm": 74.10035705566406, "learning_rate": 9.971915741677525e-06, "loss": 10.8882, "step": 16200 }, { "epoch": 0.7194013125412596, "grad_norm": 73.26065826416016, "learning_rate": 9.971898405715598e-06, "loss": 10.5628, "step": 16210 }, { "epoch": 0.7198451134743511, "grad_norm": 68.37783813476562, "learning_rate": 9.971881069753671e-06, "loss": 10.6096, "step": 16220 }, { "epoch": 0.7202889144074426, "grad_norm": 84.5615234375, "learning_rate": 9.971863733791743e-06, "loss": 10.8151, "step": 16230 }, { "epoch": 0.720732715340534, "grad_norm": 63.7740478515625, "learning_rate": 9.971846397829816e-06, "loss": 10.403, "step": 16240 }, { "epoch": 0.7211765162736254, "grad_norm": 70.54612731933594, "learning_rate": 9.971829061867889e-06, "loss": 10.0854, "step": 16250 }, { "epoch": 0.7216203172067169, "grad_norm": 63.2757682800293, "learning_rate": 9.97181172590596e-06, "loss": 10.6528, "step": 16260 }, { "epoch": 0.7220641181398084, "grad_norm": 71.35702514648438, "learning_rate": 9.971794389944033e-06, "loss": 10.4698, "step": 16270 }, { "epoch": 0.7225079190728998, "grad_norm": 66.68602752685547, "learning_rate": 9.971777053982106e-06, "loss": 10.593, "step": 16280 }, { "epoch": 0.7229517200059913, "grad_norm": 67.9871597290039, "learning_rate": 9.971759718020178e-06, "loss": 10.6789, "step": 16290 }, { "epoch": 0.7233955209390828, "grad_norm": 78.72682189941406, "learning_rate": 9.97174238205825e-06, "loss": 10.5447, "step": 16300 }, { "epoch": 0.7238393218721743, "grad_norm": 74.193359375, "learning_rate": 9.971725046096324e-06, "loss": 10.7683, "step": 16310 }, { "epoch": 0.7242831228052657, "grad_norm": 73.49239349365234, "learning_rate": 9.971707710134397e-06, "loss": 10.0631, "step": 16320 }, { "epoch": 0.7247269237383571, "grad_norm": 85.48190307617188, "learning_rate": 9.971690374172468e-06, "loss": 10.7823, "step": 16330 }, { "epoch": 0.7251707246714486, "grad_norm": 62.24036407470703, "learning_rate": 9.971673038210541e-06, "loss": 9.9273, "step": 16340 }, { "epoch": 0.7256145256045401, "grad_norm": 64.34358978271484, "learning_rate": 9.971655702248615e-06, "loss": 10.3995, "step": 16350 }, { "epoch": 0.7260583265376316, "grad_norm": 66.04435729980469, "learning_rate": 9.971638366286686e-06, "loss": 10.2451, "step": 16360 }, { "epoch": 0.726502127470723, "grad_norm": 64.41586303710938, "learning_rate": 9.971621030324759e-06, "loss": 10.8199, "step": 16370 }, { "epoch": 0.7269459284038144, "grad_norm": 69.1751937866211, "learning_rate": 9.971603694362832e-06, "loss": 10.4755, "step": 16380 }, { "epoch": 0.7273897293369059, "grad_norm": 66.9188003540039, "learning_rate": 9.971586358400903e-06, "loss": 10.3615, "step": 16390 }, { "epoch": 0.7278335302699974, "grad_norm": 75.37189483642578, "learning_rate": 9.971569022438977e-06, "loss": 11.1675, "step": 16400 }, { "epoch": 0.7282773312030889, "grad_norm": 64.27860260009766, "learning_rate": 9.97155168647705e-06, "loss": 10.897, "step": 16410 }, { "epoch": 0.7287211321361803, "grad_norm": 71.226806640625, "learning_rate": 9.971534350515123e-06, "loss": 10.5914, "step": 16420 }, { "epoch": 0.7291649330692718, "grad_norm": 65.22527313232422, "learning_rate": 9.971517014553194e-06, "loss": 10.7522, "step": 16430 }, { "epoch": 0.7296087340023633, "grad_norm": 70.96646118164062, "learning_rate": 9.971499678591267e-06, "loss": 10.519, "step": 16440 }, { "epoch": 0.7300525349354547, "grad_norm": 78.15453338623047, "learning_rate": 9.97148234262934e-06, "loss": 10.4566, "step": 16450 }, { "epoch": 0.7304963358685461, "grad_norm": 66.62492370605469, "learning_rate": 9.971465006667412e-06, "loss": 11.0907, "step": 16460 }, { "epoch": 0.7309401368016376, "grad_norm": 76.4091796875, "learning_rate": 9.971447670705485e-06, "loss": 10.673, "step": 16470 }, { "epoch": 0.7313839377347291, "grad_norm": 68.68971252441406, "learning_rate": 9.971430334743558e-06, "loss": 10.1921, "step": 16480 }, { "epoch": 0.7318277386678206, "grad_norm": 76.54867553710938, "learning_rate": 9.97141299878163e-06, "loss": 10.1928, "step": 16490 }, { "epoch": 0.732271539600912, "grad_norm": 72.19425964355469, "learning_rate": 9.971395662819702e-06, "loss": 10.8393, "step": 16500 }, { "epoch": 0.7327153405340034, "grad_norm": 68.68523406982422, "learning_rate": 9.971378326857775e-06, "loss": 10.8051, "step": 16510 }, { "epoch": 0.7331591414670949, "grad_norm": 73.1769027709961, "learning_rate": 9.971360990895847e-06, "loss": 10.6388, "step": 16520 }, { "epoch": 0.7336029424001864, "grad_norm": 61.5041618347168, "learning_rate": 9.97134365493392e-06, "loss": 10.3324, "step": 16530 }, { "epoch": 0.7340467433332779, "grad_norm": 64.92015838623047, "learning_rate": 9.971326318971993e-06, "loss": 10.3545, "step": 16540 }, { "epoch": 0.7344905442663693, "grad_norm": 70.95281982421875, "learning_rate": 9.971308983010066e-06, "loss": 10.4945, "step": 16550 }, { "epoch": 0.7349343451994608, "grad_norm": 62.154808044433594, "learning_rate": 9.971291647048137e-06, "loss": 10.4194, "step": 16560 }, { "epoch": 0.7353781461325523, "grad_norm": 63.85173416137695, "learning_rate": 9.97127431108621e-06, "loss": 10.6319, "step": 16570 }, { "epoch": 0.7358219470656437, "grad_norm": 87.90074157714844, "learning_rate": 9.971256975124283e-06, "loss": 10.2764, "step": 16580 }, { "epoch": 0.7362657479987351, "grad_norm": 68.94241333007812, "learning_rate": 9.971239639162355e-06, "loss": 10.6806, "step": 16590 }, { "epoch": 0.7367095489318266, "grad_norm": 74.24497985839844, "learning_rate": 9.971222303200428e-06, "loss": 10.0719, "step": 16600 }, { "epoch": 0.7371533498649181, "grad_norm": 70.3670654296875, "learning_rate": 9.971204967238501e-06, "loss": 10.5849, "step": 16610 }, { "epoch": 0.7375971507980096, "grad_norm": 69.45691680908203, "learning_rate": 9.971187631276572e-06, "loss": 10.4697, "step": 16620 }, { "epoch": 0.7380409517311011, "grad_norm": 65.4955062866211, "learning_rate": 9.971170295314645e-06, "loss": 10.3592, "step": 16630 }, { "epoch": 0.7384847526641924, "grad_norm": 79.28197479248047, "learning_rate": 9.971152959352719e-06, "loss": 10.2718, "step": 16640 }, { "epoch": 0.7389285535972839, "grad_norm": 57.633644104003906, "learning_rate": 9.97113562339079e-06, "loss": 10.4478, "step": 16650 }, { "epoch": 0.7393723545303754, "grad_norm": 58.49510955810547, "learning_rate": 9.971118287428863e-06, "loss": 10.6408, "step": 16660 }, { "epoch": 0.7398161554634669, "grad_norm": 68.87074279785156, "learning_rate": 9.971100951466936e-06, "loss": 10.4892, "step": 16670 }, { "epoch": 0.7402599563965583, "grad_norm": 83.9663314819336, "learning_rate": 9.971083615505007e-06, "loss": 10.6967, "step": 16680 }, { "epoch": 0.7407037573296498, "grad_norm": 66.5399398803711, "learning_rate": 9.97106627954308e-06, "loss": 10.2476, "step": 16690 }, { "epoch": 0.7411475582627413, "grad_norm": 71.39287567138672, "learning_rate": 9.971048943581154e-06, "loss": 10.4425, "step": 16700 }, { "epoch": 0.7415913591958327, "grad_norm": 67.80374145507812, "learning_rate": 9.971031607619227e-06, "loss": 10.8299, "step": 16710 }, { "epoch": 0.7420351601289241, "grad_norm": 63.548011779785156, "learning_rate": 9.971014271657298e-06, "loss": 10.5723, "step": 16720 }, { "epoch": 0.7424789610620156, "grad_norm": 66.04682922363281, "learning_rate": 9.970996935695371e-06, "loss": 10.3872, "step": 16730 }, { "epoch": 0.7429227619951071, "grad_norm": 69.4638900756836, "learning_rate": 9.970979599733444e-06, "loss": 11.023, "step": 16740 }, { "epoch": 0.7433665629281986, "grad_norm": 61.98347473144531, "learning_rate": 9.970962263771516e-06, "loss": 10.1672, "step": 16750 }, { "epoch": 0.7438103638612901, "grad_norm": 75.08468627929688, "learning_rate": 9.970944927809589e-06, "loss": 10.4586, "step": 16760 }, { "epoch": 0.7442541647943814, "grad_norm": 58.552120208740234, "learning_rate": 9.970927591847662e-06, "loss": 10.6722, "step": 16770 }, { "epoch": 0.7446979657274729, "grad_norm": 70.47040557861328, "learning_rate": 9.970910255885733e-06, "loss": 10.1118, "step": 16780 }, { "epoch": 0.7451417666605644, "grad_norm": 75.66903686523438, "learning_rate": 9.970892919923806e-06, "loss": 10.5813, "step": 16790 }, { "epoch": 0.7455855675936559, "grad_norm": 79.39729309082031, "learning_rate": 9.97087558396188e-06, "loss": 10.3935, "step": 16800 }, { "epoch": 0.7460293685267473, "grad_norm": 69.61038970947266, "learning_rate": 9.97085824799995e-06, "loss": 10.3304, "step": 16810 }, { "epoch": 0.7464731694598388, "grad_norm": 70.38590240478516, "learning_rate": 9.970840912038024e-06, "loss": 10.5146, "step": 16820 }, { "epoch": 0.7469169703929303, "grad_norm": 83.62046813964844, "learning_rate": 9.970823576076097e-06, "loss": 10.7194, "step": 16830 }, { "epoch": 0.7473607713260217, "grad_norm": 69.001708984375, "learning_rate": 9.970806240114168e-06, "loss": 10.6674, "step": 16840 }, { "epoch": 0.7478045722591131, "grad_norm": 65.15716552734375, "learning_rate": 9.970788904152241e-06, "loss": 10.6923, "step": 16850 }, { "epoch": 0.7482483731922046, "grad_norm": 74.37089538574219, "learning_rate": 9.970771568190314e-06, "loss": 10.7676, "step": 16860 }, { "epoch": 0.7486921741252961, "grad_norm": 64.5125732421875, "learning_rate": 9.970754232228386e-06, "loss": 10.2827, "step": 16870 }, { "epoch": 0.7491359750583876, "grad_norm": 69.91007232666016, "learning_rate": 9.970736896266459e-06, "loss": 10.6307, "step": 16880 }, { "epoch": 0.7495797759914791, "grad_norm": 59.29727554321289, "learning_rate": 9.970719560304532e-06, "loss": 10.4378, "step": 16890 }, { "epoch": 0.7500235769245704, "grad_norm": 90.56709289550781, "learning_rate": 9.970702224342603e-06, "loss": 10.3348, "step": 16900 }, { "epoch": 0.7504673778576619, "grad_norm": 70.33431243896484, "learning_rate": 9.970684888380676e-06, "loss": 10.3737, "step": 16910 }, { "epoch": 0.7509111787907534, "grad_norm": 62.954490661621094, "learning_rate": 9.97066755241875e-06, "loss": 10.6825, "step": 16920 }, { "epoch": 0.7513549797238449, "grad_norm": 64.03510284423828, "learning_rate": 9.970650216456823e-06, "loss": 10.4315, "step": 16930 }, { "epoch": 0.7517987806569363, "grad_norm": 61.26763153076172, "learning_rate": 9.970632880494894e-06, "loss": 9.9209, "step": 16940 }, { "epoch": 0.7522425815900278, "grad_norm": 64.45995330810547, "learning_rate": 9.970615544532967e-06, "loss": 10.4647, "step": 16950 }, { "epoch": 0.7526863825231193, "grad_norm": 71.07040405273438, "learning_rate": 9.97059820857104e-06, "loss": 10.4505, "step": 16960 }, { "epoch": 0.7531301834562107, "grad_norm": 61.676551818847656, "learning_rate": 9.970580872609112e-06, "loss": 10.1759, "step": 16970 }, { "epoch": 0.7535739843893022, "grad_norm": 76.77957916259766, "learning_rate": 9.970563536647185e-06, "loss": 10.5416, "step": 16980 }, { "epoch": 0.7540177853223936, "grad_norm": 83.22810363769531, "learning_rate": 9.970546200685258e-06, "loss": 10.2076, "step": 16990 }, { "epoch": 0.7544615862554851, "grad_norm": 68.98297882080078, "learning_rate": 9.970528864723329e-06, "loss": 10.4059, "step": 17000 }, { "epoch": 0.7544615862554851, "eval_loss": 0.3286471664905548, "eval_runtime": 678.3037, "eval_samples_per_second": 1790.335, "eval_steps_per_second": 55.948, "step": 17000 }, { "epoch": 0.7549053871885766, "grad_norm": 61.05122756958008, "learning_rate": 9.970511528761402e-06, "loss": 10.329, "step": 17010 }, { "epoch": 0.7553491881216681, "grad_norm": 69.60665893554688, "learning_rate": 9.970494192799475e-06, "loss": 10.3436, "step": 17020 }, { "epoch": 0.7557929890547594, "grad_norm": 70.77852630615234, "learning_rate": 9.970476856837547e-06, "loss": 10.8488, "step": 17030 }, { "epoch": 0.7562367899878509, "grad_norm": 71.26077270507812, "learning_rate": 9.97045952087562e-06, "loss": 10.4241, "step": 17040 }, { "epoch": 0.7566805909209424, "grad_norm": 95.98668670654297, "learning_rate": 9.970442184913693e-06, "loss": 10.1583, "step": 17050 }, { "epoch": 0.7571243918540339, "grad_norm": 77.86578369140625, "learning_rate": 9.970424848951764e-06, "loss": 10.8454, "step": 17060 }, { "epoch": 0.7575681927871253, "grad_norm": 81.84298706054688, "learning_rate": 9.970407512989837e-06, "loss": 10.523, "step": 17070 }, { "epoch": 0.7580119937202168, "grad_norm": 71.15061950683594, "learning_rate": 9.97039017702791e-06, "loss": 10.206, "step": 17080 }, { "epoch": 0.7584557946533083, "grad_norm": 65.26417541503906, "learning_rate": 9.970372841065982e-06, "loss": 10.8191, "step": 17090 }, { "epoch": 0.7588995955863997, "grad_norm": 76.11380767822266, "learning_rate": 9.970355505104055e-06, "loss": 10.6828, "step": 17100 }, { "epoch": 0.7593433965194912, "grad_norm": 82.5249252319336, "learning_rate": 9.970338169142128e-06, "loss": 10.5496, "step": 17110 }, { "epoch": 0.7597871974525826, "grad_norm": 66.36536407470703, "learning_rate": 9.9703208331802e-06, "loss": 10.0697, "step": 17120 }, { "epoch": 0.7602309983856741, "grad_norm": 61.46723556518555, "learning_rate": 9.970303497218272e-06, "loss": 10.5154, "step": 17130 }, { "epoch": 0.7606747993187656, "grad_norm": 65.16950225830078, "learning_rate": 9.970286161256345e-06, "loss": 10.5957, "step": 17140 }, { "epoch": 0.7611186002518571, "grad_norm": 74.29840850830078, "learning_rate": 9.970268825294418e-06, "loss": 10.9534, "step": 17150 }, { "epoch": 0.7615624011849484, "grad_norm": 63.95246887207031, "learning_rate": 9.97025148933249e-06, "loss": 10.559, "step": 17160 }, { "epoch": 0.7620062021180399, "grad_norm": 70.78131866455078, "learning_rate": 9.970234153370563e-06, "loss": 10.4259, "step": 17170 }, { "epoch": 0.7624500030511314, "grad_norm": 68.86581420898438, "learning_rate": 9.970216817408636e-06, "loss": 10.5729, "step": 17180 }, { "epoch": 0.7628938039842229, "grad_norm": 69.18133544921875, "learning_rate": 9.970199481446707e-06, "loss": 10.4368, "step": 17190 }, { "epoch": 0.7633376049173144, "grad_norm": 63.52793502807617, "learning_rate": 9.97018214548478e-06, "loss": 10.4153, "step": 17200 }, { "epoch": 0.7637814058504058, "grad_norm": 72.20518493652344, "learning_rate": 9.970164809522854e-06, "loss": 10.7712, "step": 17210 }, { "epoch": 0.7642252067834973, "grad_norm": 71.09992218017578, "learning_rate": 9.970147473560925e-06, "loss": 10.4743, "step": 17220 }, { "epoch": 0.7646690077165887, "grad_norm": 73.4151611328125, "learning_rate": 9.970130137598998e-06, "loss": 10.7, "step": 17230 }, { "epoch": 0.7651128086496802, "grad_norm": 59.74842834472656, "learning_rate": 9.970112801637071e-06, "loss": 10.5612, "step": 17240 }, { "epoch": 0.7655566095827716, "grad_norm": 67.19086456298828, "learning_rate": 9.970095465675142e-06, "loss": 10.3241, "step": 17250 }, { "epoch": 0.7660004105158631, "grad_norm": 80.37249755859375, "learning_rate": 9.970078129713216e-06, "loss": 10.8193, "step": 17260 }, { "epoch": 0.7664442114489546, "grad_norm": 64.05519104003906, "learning_rate": 9.970060793751289e-06, "loss": 10.5312, "step": 17270 }, { "epoch": 0.7668880123820461, "grad_norm": 63.82759475708008, "learning_rate": 9.97004345778936e-06, "loss": 10.2693, "step": 17280 }, { "epoch": 0.7673318133151374, "grad_norm": 74.00315856933594, "learning_rate": 9.970026121827433e-06, "loss": 10.5823, "step": 17290 }, { "epoch": 0.7677756142482289, "grad_norm": 72.47602844238281, "learning_rate": 9.970008785865506e-06, "loss": 10.3111, "step": 17300 }, { "epoch": 0.7682194151813204, "grad_norm": 59.534305572509766, "learning_rate": 9.969991449903578e-06, "loss": 10.7712, "step": 17310 }, { "epoch": 0.7686632161144119, "grad_norm": 70.15426635742188, "learning_rate": 9.96997411394165e-06, "loss": 10.3361, "step": 17320 }, { "epoch": 0.7691070170475034, "grad_norm": 60.14602279663086, "learning_rate": 9.969956777979724e-06, "loss": 10.8061, "step": 17330 }, { "epoch": 0.7695508179805948, "grad_norm": 64.0710678100586, "learning_rate": 9.969939442017795e-06, "loss": 9.9819, "step": 17340 }, { "epoch": 0.7699946189136863, "grad_norm": 76.11770629882812, "learning_rate": 9.969922106055868e-06, "loss": 10.2555, "step": 17350 }, { "epoch": 0.7704384198467777, "grad_norm": 68.8619613647461, "learning_rate": 9.969904770093941e-06, "loss": 10.5506, "step": 17360 }, { "epoch": 0.7708822207798692, "grad_norm": 66.60945129394531, "learning_rate": 9.969887434132014e-06, "loss": 10.3023, "step": 17370 }, { "epoch": 0.7713260217129606, "grad_norm": 66.75738525390625, "learning_rate": 9.969870098170086e-06, "loss": 10.126, "step": 17380 }, { "epoch": 0.7717698226460521, "grad_norm": 65.49826049804688, "learning_rate": 9.969852762208159e-06, "loss": 10.6585, "step": 17390 }, { "epoch": 0.7722136235791436, "grad_norm": 65.2136001586914, "learning_rate": 9.969835426246232e-06, "loss": 10.4105, "step": 17400 }, { "epoch": 0.7726574245122351, "grad_norm": 62.805213928222656, "learning_rate": 9.969818090284303e-06, "loss": 10.1974, "step": 17410 }, { "epoch": 0.7731012254453264, "grad_norm": 66.39070892333984, "learning_rate": 9.969800754322376e-06, "loss": 10.7106, "step": 17420 }, { "epoch": 0.7735450263784179, "grad_norm": 70.36665344238281, "learning_rate": 9.96978341836045e-06, "loss": 10.2655, "step": 17430 }, { "epoch": 0.7739888273115094, "grad_norm": 62.32572937011719, "learning_rate": 9.96976608239852e-06, "loss": 10.2899, "step": 17440 }, { "epoch": 0.7744326282446009, "grad_norm": 64.87157440185547, "learning_rate": 9.969748746436594e-06, "loss": 10.2878, "step": 17450 }, { "epoch": 0.7748764291776924, "grad_norm": 65.008544921875, "learning_rate": 9.969731410474667e-06, "loss": 10.5488, "step": 17460 }, { "epoch": 0.7753202301107838, "grad_norm": 62.856143951416016, "learning_rate": 9.969714074512738e-06, "loss": 10.3436, "step": 17470 }, { "epoch": 0.7757640310438753, "grad_norm": 77.94728088378906, "learning_rate": 9.969696738550811e-06, "loss": 10.7276, "step": 17480 }, { "epoch": 0.7762078319769667, "grad_norm": 59.522884368896484, "learning_rate": 9.969679402588884e-06, "loss": 10.267, "step": 17490 }, { "epoch": 0.7766516329100582, "grad_norm": 65.48174285888672, "learning_rate": 9.969662066626956e-06, "loss": 10.1914, "step": 17500 }, { "epoch": 0.7770954338431496, "grad_norm": 66.09324645996094, "learning_rate": 9.969644730665029e-06, "loss": 10.7307, "step": 17510 }, { "epoch": 0.7775392347762411, "grad_norm": 64.93851470947266, "learning_rate": 9.969627394703102e-06, "loss": 10.0904, "step": 17520 }, { "epoch": 0.7779830357093326, "grad_norm": 72.18850708007812, "learning_rate": 9.969610058741173e-06, "loss": 10.9298, "step": 17530 }, { "epoch": 0.7784268366424241, "grad_norm": 74.83529663085938, "learning_rate": 9.969592722779246e-06, "loss": 10.3685, "step": 17540 }, { "epoch": 0.7788706375755156, "grad_norm": 69.4002456665039, "learning_rate": 9.96957538681732e-06, "loss": 10.3551, "step": 17550 }, { "epoch": 0.7793144385086069, "grad_norm": 76.83059692382812, "learning_rate": 9.969558050855391e-06, "loss": 10.6919, "step": 17560 }, { "epoch": 0.7797582394416984, "grad_norm": 67.19385528564453, "learning_rate": 9.969540714893464e-06, "loss": 10.5123, "step": 17570 }, { "epoch": 0.7802020403747899, "grad_norm": 80.98805236816406, "learning_rate": 9.969523378931537e-06, "loss": 10.7374, "step": 17580 }, { "epoch": 0.7806458413078814, "grad_norm": 67.6994400024414, "learning_rate": 9.96950604296961e-06, "loss": 10.768, "step": 17590 }, { "epoch": 0.7810896422409728, "grad_norm": 74.86246490478516, "learning_rate": 9.969488707007682e-06, "loss": 10.1819, "step": 17600 }, { "epoch": 0.7815334431740643, "grad_norm": 73.82647705078125, "learning_rate": 9.969471371045755e-06, "loss": 10.7857, "step": 17610 }, { "epoch": 0.7819772441071557, "grad_norm": 68.298095703125, "learning_rate": 9.969454035083828e-06, "loss": 10.6649, "step": 17620 }, { "epoch": 0.7824210450402472, "grad_norm": 68.30016326904297, "learning_rate": 9.969436699121899e-06, "loss": 10.3087, "step": 17630 }, { "epoch": 0.7828648459733386, "grad_norm": 67.62581634521484, "learning_rate": 9.969419363159972e-06, "loss": 9.9373, "step": 17640 }, { "epoch": 0.7833086469064301, "grad_norm": 78.02869415283203, "learning_rate": 9.969402027198045e-06, "loss": 10.7946, "step": 17650 }, { "epoch": 0.7837524478395216, "grad_norm": 64.94837951660156, "learning_rate": 9.969384691236117e-06, "loss": 10.1355, "step": 17660 }, { "epoch": 0.7841962487726131, "grad_norm": 65.18085479736328, "learning_rate": 9.96936735527419e-06, "loss": 10.4704, "step": 17670 }, { "epoch": 0.7846400497057046, "grad_norm": 70.01870727539062, "learning_rate": 9.969350019312263e-06, "loss": 10.6109, "step": 17680 }, { "epoch": 0.7850838506387959, "grad_norm": 65.34024047851562, "learning_rate": 9.969332683350334e-06, "loss": 10.3535, "step": 17690 }, { "epoch": 0.7855276515718874, "grad_norm": 79.30274200439453, "learning_rate": 9.969315347388407e-06, "loss": 10.3437, "step": 17700 }, { "epoch": 0.7859714525049789, "grad_norm": 67.2250747680664, "learning_rate": 9.96929801142648e-06, "loss": 10.4723, "step": 17710 }, { "epoch": 0.7864152534380704, "grad_norm": 68.58338928222656, "learning_rate": 9.969280675464552e-06, "loss": 10.1836, "step": 17720 }, { "epoch": 0.7868590543711618, "grad_norm": 70.7483901977539, "learning_rate": 9.969263339502625e-06, "loss": 11.1165, "step": 17730 }, { "epoch": 0.7873028553042533, "grad_norm": 62.17152404785156, "learning_rate": 9.969246003540698e-06, "loss": 10.2156, "step": 17740 }, { "epoch": 0.7877466562373447, "grad_norm": 69.29998016357422, "learning_rate": 9.96922866757877e-06, "loss": 10.6663, "step": 17750 }, { "epoch": 0.7881904571704362, "grad_norm": 64.87523651123047, "learning_rate": 9.969211331616842e-06, "loss": 10.9555, "step": 17760 }, { "epoch": 0.7886342581035276, "grad_norm": 66.5212173461914, "learning_rate": 9.969193995654915e-06, "loss": 10.5622, "step": 17770 }, { "epoch": 0.7890780590366191, "grad_norm": 68.40711975097656, "learning_rate": 9.969176659692987e-06, "loss": 10.3267, "step": 17780 }, { "epoch": 0.7895218599697106, "grad_norm": 70.49530029296875, "learning_rate": 9.96915932373106e-06, "loss": 10.6099, "step": 17790 }, { "epoch": 0.7899656609028021, "grad_norm": 68.68482971191406, "learning_rate": 9.969141987769133e-06, "loss": 10.5234, "step": 17800 }, { "epoch": 0.7904094618358936, "grad_norm": 73.87464904785156, "learning_rate": 9.969124651807206e-06, "loss": 10.7718, "step": 17810 }, { "epoch": 0.7908532627689849, "grad_norm": 65.32040405273438, "learning_rate": 9.969107315845277e-06, "loss": 10.347, "step": 17820 }, { "epoch": 0.7912970637020764, "grad_norm": 66.08610534667969, "learning_rate": 9.96908997988335e-06, "loss": 10.4261, "step": 17830 }, { "epoch": 0.7917408646351679, "grad_norm": 55.77153396606445, "learning_rate": 9.969072643921424e-06, "loss": 10.013, "step": 17840 }, { "epoch": 0.7921846655682594, "grad_norm": 72.32164764404297, "learning_rate": 9.969055307959495e-06, "loss": 10.5249, "step": 17850 }, { "epoch": 0.7926284665013508, "grad_norm": 73.15257263183594, "learning_rate": 9.969037971997568e-06, "loss": 10.86, "step": 17860 }, { "epoch": 0.7930722674344423, "grad_norm": 79.71673583984375, "learning_rate": 9.969020636035641e-06, "loss": 10.4343, "step": 17870 }, { "epoch": 0.7935160683675337, "grad_norm": 68.7105941772461, "learning_rate": 9.969003300073712e-06, "loss": 10.5593, "step": 17880 }, { "epoch": 0.7939598693006252, "grad_norm": 67.40699005126953, "learning_rate": 9.968985964111786e-06, "loss": 9.8434, "step": 17890 }, { "epoch": 0.7944036702337167, "grad_norm": 67.37322235107422, "learning_rate": 9.968968628149859e-06, "loss": 10.9941, "step": 17900 }, { "epoch": 0.7948474711668081, "grad_norm": 66.23359680175781, "learning_rate": 9.96895129218793e-06, "loss": 10.731, "step": 17910 }, { "epoch": 0.7952912720998996, "grad_norm": 65.20913696289062, "learning_rate": 9.968933956226003e-06, "loss": 10.537, "step": 17920 }, { "epoch": 0.7957350730329911, "grad_norm": 61.01829528808594, "learning_rate": 9.968916620264076e-06, "loss": 9.9038, "step": 17930 }, { "epoch": 0.7961788739660826, "grad_norm": 65.00994110107422, "learning_rate": 9.968899284302148e-06, "loss": 10.364, "step": 17940 }, { "epoch": 0.7966226748991739, "grad_norm": 72.74436950683594, "learning_rate": 9.96888194834022e-06, "loss": 10.5061, "step": 17950 }, { "epoch": 0.7970664758322654, "grad_norm": 75.44004821777344, "learning_rate": 9.968864612378294e-06, "loss": 10.646, "step": 17960 }, { "epoch": 0.7975102767653569, "grad_norm": 69.74024200439453, "learning_rate": 9.968847276416365e-06, "loss": 10.5158, "step": 17970 }, { "epoch": 0.7979540776984484, "grad_norm": 73.52366638183594, "learning_rate": 9.968829940454438e-06, "loss": 10.0728, "step": 17980 }, { "epoch": 0.7983978786315398, "grad_norm": 62.016883850097656, "learning_rate": 9.968812604492511e-06, "loss": 9.9823, "step": 17990 }, { "epoch": 0.7988416795646313, "grad_norm": 77.03231811523438, "learning_rate": 9.968795268530583e-06, "loss": 10.4617, "step": 18000 }, { "epoch": 0.7988416795646313, "eval_loss": 0.32583364844322205, "eval_runtime": 672.7495, "eval_samples_per_second": 1805.116, "eval_steps_per_second": 56.41, "step": 18000 }, { "epoch": 0.7992854804977227, "grad_norm": 62.19236755371094, "learning_rate": 9.968777932568656e-06, "loss": 10.465, "step": 18010 }, { "epoch": 0.7997292814308142, "grad_norm": 64.64230346679688, "learning_rate": 9.968760596606729e-06, "loss": 11.0006, "step": 18020 }, { "epoch": 0.8001730823639057, "grad_norm": 71.75637817382812, "learning_rate": 9.968743260644802e-06, "loss": 10.3129, "step": 18030 }, { "epoch": 0.8006168832969971, "grad_norm": 68.54603576660156, "learning_rate": 9.968725924682873e-06, "loss": 10.7563, "step": 18040 }, { "epoch": 0.8010606842300886, "grad_norm": 66.99270629882812, "learning_rate": 9.968708588720946e-06, "loss": 10.2859, "step": 18050 }, { "epoch": 0.8015044851631801, "grad_norm": 72.47330474853516, "learning_rate": 9.96869125275902e-06, "loss": 10.6183, "step": 18060 }, { "epoch": 0.8019482860962716, "grad_norm": 69.65715789794922, "learning_rate": 9.96867391679709e-06, "loss": 10.0611, "step": 18070 }, { "epoch": 0.8023920870293629, "grad_norm": 68.04208374023438, "learning_rate": 9.968656580835164e-06, "loss": 10.9295, "step": 18080 }, { "epoch": 0.8028358879624544, "grad_norm": 69.00373840332031, "learning_rate": 9.968639244873237e-06, "loss": 10.3371, "step": 18090 }, { "epoch": 0.8032796888955459, "grad_norm": 62.3974723815918, "learning_rate": 9.968621908911308e-06, "loss": 10.3889, "step": 18100 }, { "epoch": 0.8037234898286374, "grad_norm": 71.53374481201172, "learning_rate": 9.968604572949381e-06, "loss": 11.0522, "step": 18110 }, { "epoch": 0.8041672907617289, "grad_norm": 61.865089416503906, "learning_rate": 9.968587236987455e-06, "loss": 10.6471, "step": 18120 }, { "epoch": 0.8046110916948203, "grad_norm": 70.71613311767578, "learning_rate": 9.968569901025526e-06, "loss": 11.1749, "step": 18130 }, { "epoch": 0.8050548926279117, "grad_norm": 70.22577667236328, "learning_rate": 9.968552565063599e-06, "loss": 10.5659, "step": 18140 }, { "epoch": 0.8054986935610032, "grad_norm": 62.12384796142578, "learning_rate": 9.968535229101672e-06, "loss": 10.3108, "step": 18150 }, { "epoch": 0.8059424944940947, "grad_norm": 67.61980438232422, "learning_rate": 9.968517893139743e-06, "loss": 10.4519, "step": 18160 }, { "epoch": 0.8063862954271861, "grad_norm": 64.64510345458984, "learning_rate": 9.968500557177817e-06, "loss": 10.4827, "step": 18170 }, { "epoch": 0.8068300963602776, "grad_norm": 73.72003173828125, "learning_rate": 9.96848322121589e-06, "loss": 9.9235, "step": 18180 }, { "epoch": 0.8072738972933691, "grad_norm": 66.29298400878906, "learning_rate": 9.968465885253961e-06, "loss": 10.5438, "step": 18190 }, { "epoch": 0.8077176982264606, "grad_norm": 67.15369415283203, "learning_rate": 9.968448549292034e-06, "loss": 10.3155, "step": 18200 }, { "epoch": 0.8081614991595519, "grad_norm": 59.32017517089844, "learning_rate": 9.968431213330107e-06, "loss": 10.3011, "step": 18210 }, { "epoch": 0.8086053000926434, "grad_norm": 72.47640228271484, "learning_rate": 9.96841387736818e-06, "loss": 10.4339, "step": 18220 }, { "epoch": 0.8090491010257349, "grad_norm": 68.0174789428711, "learning_rate": 9.968396541406252e-06, "loss": 10.1946, "step": 18230 }, { "epoch": 0.8094929019588264, "grad_norm": 62.51362228393555, "learning_rate": 9.968379205444325e-06, "loss": 10.5557, "step": 18240 }, { "epoch": 0.8099367028919179, "grad_norm": 72.3086929321289, "learning_rate": 9.968361869482398e-06, "loss": 10.1581, "step": 18250 }, { "epoch": 0.8103805038250093, "grad_norm": 74.7848892211914, "learning_rate": 9.968344533520469e-06, "loss": 10.6374, "step": 18260 }, { "epoch": 0.8108243047581007, "grad_norm": 60.53010177612305, "learning_rate": 9.968327197558542e-06, "loss": 10.5352, "step": 18270 }, { "epoch": 0.8112681056911922, "grad_norm": 71.8178482055664, "learning_rate": 9.968309861596615e-06, "loss": 10.5353, "step": 18280 }, { "epoch": 0.8117119066242837, "grad_norm": 58.963165283203125, "learning_rate": 9.968292525634687e-06, "loss": 10.3527, "step": 18290 }, { "epoch": 0.8121557075573751, "grad_norm": 64.75, "learning_rate": 9.96827518967276e-06, "loss": 11.0742, "step": 18300 }, { "epoch": 0.8125995084904666, "grad_norm": 61.654296875, "learning_rate": 9.968257853710833e-06, "loss": 9.8275, "step": 18310 }, { "epoch": 0.8130433094235581, "grad_norm": 63.89625930786133, "learning_rate": 9.968240517748904e-06, "loss": 10.3494, "step": 18320 }, { "epoch": 0.8134871103566496, "grad_norm": 69.73605346679688, "learning_rate": 9.968223181786977e-06, "loss": 10.4358, "step": 18330 }, { "epoch": 0.8139309112897409, "grad_norm": 69.21589660644531, "learning_rate": 9.96820584582505e-06, "loss": 10.3777, "step": 18340 }, { "epoch": 0.8143747122228324, "grad_norm": 68.85872650146484, "learning_rate": 9.968188509863122e-06, "loss": 10.7136, "step": 18350 }, { "epoch": 0.8148185131559239, "grad_norm": 63.11106491088867, "learning_rate": 9.968171173901195e-06, "loss": 11.0215, "step": 18360 }, { "epoch": 0.8152623140890154, "grad_norm": 56.74385070800781, "learning_rate": 9.968153837939268e-06, "loss": 10.032, "step": 18370 }, { "epoch": 0.8157061150221069, "grad_norm": 65.63390350341797, "learning_rate": 9.968136501977341e-06, "loss": 10.615, "step": 18380 }, { "epoch": 0.8161499159551983, "grad_norm": 58.63720703125, "learning_rate": 9.968119166015412e-06, "loss": 10.3503, "step": 18390 }, { "epoch": 0.8165937168882897, "grad_norm": 60.3001708984375, "learning_rate": 9.968101830053485e-06, "loss": 10.2183, "step": 18400 }, { "epoch": 0.8170375178213812, "grad_norm": 68.03216552734375, "learning_rate": 9.968084494091559e-06, "loss": 10.5718, "step": 18410 }, { "epoch": 0.8174813187544727, "grad_norm": 71.72623443603516, "learning_rate": 9.96806715812963e-06, "loss": 10.7557, "step": 18420 }, { "epoch": 0.8179251196875641, "grad_norm": 69.74810791015625, "learning_rate": 9.968049822167703e-06, "loss": 10.1841, "step": 18430 }, { "epoch": 0.8183689206206556, "grad_norm": 58.47687530517578, "learning_rate": 9.968032486205776e-06, "loss": 10.0264, "step": 18440 }, { "epoch": 0.8188127215537471, "grad_norm": 67.85263061523438, "learning_rate": 9.968015150243847e-06, "loss": 10.3225, "step": 18450 }, { "epoch": 0.8192565224868386, "grad_norm": 67.8355712890625, "learning_rate": 9.96799781428192e-06, "loss": 9.9914, "step": 18460 }, { "epoch": 0.81970032341993, "grad_norm": 74.0328140258789, "learning_rate": 9.967980478319994e-06, "loss": 10.3044, "step": 18470 }, { "epoch": 0.8201441243530214, "grad_norm": 75.59931945800781, "learning_rate": 9.967963142358065e-06, "loss": 10.6398, "step": 18480 }, { "epoch": 0.8205879252861129, "grad_norm": 59.03470230102539, "learning_rate": 9.967945806396138e-06, "loss": 10.1309, "step": 18490 }, { "epoch": 0.8210317262192044, "grad_norm": 63.74763107299805, "learning_rate": 9.967928470434211e-06, "loss": 9.8492, "step": 18500 }, { "epoch": 0.8214755271522959, "grad_norm": 58.71684265136719, "learning_rate": 9.967911134472284e-06, "loss": 10.0112, "step": 18510 }, { "epoch": 0.8219193280853873, "grad_norm": 70.03022003173828, "learning_rate": 9.967893798510356e-06, "loss": 10.6968, "step": 18520 }, { "epoch": 0.8223631290184787, "grad_norm": 61.144004821777344, "learning_rate": 9.967876462548429e-06, "loss": 10.3381, "step": 18530 }, { "epoch": 0.8228069299515702, "grad_norm": 67.76824188232422, "learning_rate": 9.967859126586502e-06, "loss": 10.2819, "step": 18540 }, { "epoch": 0.8232507308846617, "grad_norm": 64.85346221923828, "learning_rate": 9.967841790624573e-06, "loss": 10.7358, "step": 18550 }, { "epoch": 0.8236945318177531, "grad_norm": 64.5184326171875, "learning_rate": 9.967824454662646e-06, "loss": 10.49, "step": 18560 }, { "epoch": 0.8241383327508446, "grad_norm": 69.41261291503906, "learning_rate": 9.96780711870072e-06, "loss": 10.6639, "step": 18570 }, { "epoch": 0.8245821336839361, "grad_norm": 67.25212097167969, "learning_rate": 9.96778978273879e-06, "loss": 10.5115, "step": 18580 }, { "epoch": 0.8250259346170276, "grad_norm": 62.52476501464844, "learning_rate": 9.967772446776864e-06, "loss": 10.0552, "step": 18590 }, { "epoch": 0.825469735550119, "grad_norm": 62.43718719482422, "learning_rate": 9.967755110814937e-06, "loss": 10.2405, "step": 18600 }, { "epoch": 0.8259135364832104, "grad_norm": 67.1116714477539, "learning_rate": 9.96773777485301e-06, "loss": 10.4917, "step": 18610 }, { "epoch": 0.8263573374163019, "grad_norm": 67.36260986328125, "learning_rate": 9.967720438891081e-06, "loss": 9.8809, "step": 18620 }, { "epoch": 0.8268011383493934, "grad_norm": 69.18153381347656, "learning_rate": 9.967703102929154e-06, "loss": 10.0411, "step": 18630 }, { "epoch": 0.8272449392824849, "grad_norm": 54.77642059326172, "learning_rate": 9.967685766967227e-06, "loss": 10.2812, "step": 18640 }, { "epoch": 0.8276887402155763, "grad_norm": 64.23429107666016, "learning_rate": 9.967668431005299e-06, "loss": 10.2792, "step": 18650 }, { "epoch": 0.8281325411486677, "grad_norm": 77.61302185058594, "learning_rate": 9.967651095043372e-06, "loss": 10.5122, "step": 18660 }, { "epoch": 0.8285763420817592, "grad_norm": 63.884666442871094, "learning_rate": 9.967633759081445e-06, "loss": 10.5592, "step": 18670 }, { "epoch": 0.8290201430148507, "grad_norm": 68.2164077758789, "learning_rate": 9.967616423119516e-06, "loss": 10.0031, "step": 18680 }, { "epoch": 0.8294639439479422, "grad_norm": 64.70232391357422, "learning_rate": 9.96759908715759e-06, "loss": 10.0672, "step": 18690 }, { "epoch": 0.8299077448810336, "grad_norm": 70.52904510498047, "learning_rate": 9.967581751195663e-06, "loss": 10.8457, "step": 18700 }, { "epoch": 0.8303515458141251, "grad_norm": 74.24815368652344, "learning_rate": 9.967564415233734e-06, "loss": 10.2618, "step": 18710 }, { "epoch": 0.8307953467472166, "grad_norm": 70.21379852294922, "learning_rate": 9.967547079271807e-06, "loss": 10.6413, "step": 18720 }, { "epoch": 0.831239147680308, "grad_norm": 59.27021408081055, "learning_rate": 9.96752974330988e-06, "loss": 10.0213, "step": 18730 }, { "epoch": 0.8316829486133994, "grad_norm": 68.8056869506836, "learning_rate": 9.967512407347953e-06, "loss": 10.6691, "step": 18740 }, { "epoch": 0.8321267495464909, "grad_norm": 59.90221405029297, "learning_rate": 9.967495071386025e-06, "loss": 10.2026, "step": 18750 }, { "epoch": 0.8325705504795824, "grad_norm": 59.39807891845703, "learning_rate": 9.967477735424098e-06, "loss": 10.296, "step": 18760 }, { "epoch": 0.8330143514126739, "grad_norm": 60.97962951660156, "learning_rate": 9.96746039946217e-06, "loss": 10.4608, "step": 18770 }, { "epoch": 0.8334581523457653, "grad_norm": 69.33479309082031, "learning_rate": 9.967443063500242e-06, "loss": 10.1023, "step": 18780 }, { "epoch": 0.8339019532788567, "grad_norm": 62.024993896484375, "learning_rate": 9.967425727538315e-06, "loss": 10.1188, "step": 18790 }, { "epoch": 0.8343457542119482, "grad_norm": 70.13167572021484, "learning_rate": 9.967408391576388e-06, "loss": 10.5289, "step": 18800 }, { "epoch": 0.8347895551450397, "grad_norm": 59.78411865234375, "learning_rate": 9.96739105561446e-06, "loss": 10.0232, "step": 18810 }, { "epoch": 0.8352333560781312, "grad_norm": 65.34579467773438, "learning_rate": 9.967373719652533e-06, "loss": 10.3009, "step": 18820 }, { "epoch": 0.8356771570112226, "grad_norm": 66.02912902832031, "learning_rate": 9.967356383690606e-06, "loss": 10.4396, "step": 18830 }, { "epoch": 0.8361209579443141, "grad_norm": 64.6055679321289, "learning_rate": 9.967339047728677e-06, "loss": 11.1638, "step": 18840 }, { "epoch": 0.8365647588774056, "grad_norm": 68.47040557861328, "learning_rate": 9.96732171176675e-06, "loss": 10.3824, "step": 18850 }, { "epoch": 0.837008559810497, "grad_norm": 70.76081848144531, "learning_rate": 9.967304375804823e-06, "loss": 10.4553, "step": 18860 }, { "epoch": 0.8374523607435884, "grad_norm": 69.01679229736328, "learning_rate": 9.967287039842896e-06, "loss": 10.7882, "step": 18870 }, { "epoch": 0.8378961616766799, "grad_norm": 72.1138687133789, "learning_rate": 9.967269703880968e-06, "loss": 10.7932, "step": 18880 }, { "epoch": 0.8383399626097714, "grad_norm": 63.26852035522461, "learning_rate": 9.967252367919041e-06, "loss": 9.8872, "step": 18890 }, { "epoch": 0.8387837635428629, "grad_norm": 74.27698516845703, "learning_rate": 9.967235031957114e-06, "loss": 10.3047, "step": 18900 }, { "epoch": 0.8392275644759543, "grad_norm": 69.13713073730469, "learning_rate": 9.967217695995185e-06, "loss": 10.4936, "step": 18910 }, { "epoch": 0.8396713654090457, "grad_norm": 66.47625732421875, "learning_rate": 9.967200360033258e-06, "loss": 10.6965, "step": 18920 }, { "epoch": 0.8401151663421372, "grad_norm": 62.18655776977539, "learning_rate": 9.967183024071331e-06, "loss": 10.3633, "step": 18930 }, { "epoch": 0.8405589672752287, "grad_norm": 60.379478454589844, "learning_rate": 9.967165688109403e-06, "loss": 10.4711, "step": 18940 }, { "epoch": 0.8410027682083202, "grad_norm": 62.18358612060547, "learning_rate": 9.967148352147476e-06, "loss": 10.3252, "step": 18950 }, { "epoch": 0.8414465691414116, "grad_norm": 69.07564544677734, "learning_rate": 9.967131016185549e-06, "loss": 10.785, "step": 18960 }, { "epoch": 0.8418903700745031, "grad_norm": 67.07147216796875, "learning_rate": 9.96711368022362e-06, "loss": 10.6486, "step": 18970 }, { "epoch": 0.8423341710075946, "grad_norm": 68.17425537109375, "learning_rate": 9.967096344261693e-06, "loss": 10.2245, "step": 18980 }, { "epoch": 0.842777971940686, "grad_norm": 62.00086975097656, "learning_rate": 9.967079008299767e-06, "loss": 10.4259, "step": 18990 }, { "epoch": 0.8432217728737774, "grad_norm": 68.44880676269531, "learning_rate": 9.96706167233784e-06, "loss": 10.5632, "step": 19000 }, { "epoch": 0.8432217728737774, "eval_loss": 0.3246602714061737, "eval_runtime": 673.5905, "eval_samples_per_second": 1802.863, "eval_steps_per_second": 56.34, "step": 19000 }, { "epoch": 0.8436655738068689, "grad_norm": 67.45813751220703, "learning_rate": 9.967044336375911e-06, "loss": 10.3405, "step": 19010 }, { "epoch": 0.8441093747399604, "grad_norm": 71.77626037597656, "learning_rate": 9.967027000413984e-06, "loss": 10.9004, "step": 19020 }, { "epoch": 0.8445531756730519, "grad_norm": 63.87392044067383, "learning_rate": 9.967009664452057e-06, "loss": 10.5393, "step": 19030 }, { "epoch": 0.8449969766061434, "grad_norm": 62.10248947143555, "learning_rate": 9.966992328490129e-06, "loss": 10.1483, "step": 19040 }, { "epoch": 0.8454407775392347, "grad_norm": 58.07029342651367, "learning_rate": 9.966974992528202e-06, "loss": 10.4056, "step": 19050 }, { "epoch": 0.8458845784723262, "grad_norm": 69.88272094726562, "learning_rate": 9.966957656566275e-06, "loss": 10.3336, "step": 19060 }, { "epoch": 0.8463283794054177, "grad_norm": 57.19210433959961, "learning_rate": 9.966940320604346e-06, "loss": 10.5325, "step": 19070 }, { "epoch": 0.8467721803385092, "grad_norm": 68.29473876953125, "learning_rate": 9.96692298464242e-06, "loss": 10.5171, "step": 19080 }, { "epoch": 0.8472159812716006, "grad_norm": 61.379425048828125, "learning_rate": 9.966905648680492e-06, "loss": 9.9401, "step": 19090 }, { "epoch": 0.8476597822046921, "grad_norm": 68.13114929199219, "learning_rate": 9.966888312718564e-06, "loss": 10.4374, "step": 19100 }, { "epoch": 0.8481035831377836, "grad_norm": 64.97882843017578, "learning_rate": 9.966870976756637e-06, "loss": 10.4241, "step": 19110 }, { "epoch": 0.848547384070875, "grad_norm": 66.36862182617188, "learning_rate": 9.96685364079471e-06, "loss": 10.3443, "step": 19120 }, { "epoch": 0.8489911850039664, "grad_norm": 68.75626373291016, "learning_rate": 9.966836304832783e-06, "loss": 10.2445, "step": 19130 }, { "epoch": 0.8494349859370579, "grad_norm": 69.54931640625, "learning_rate": 9.966818968870854e-06, "loss": 10.1782, "step": 19140 }, { "epoch": 0.8498787868701494, "grad_norm": 62.799842834472656, "learning_rate": 9.966801632908927e-06, "loss": 9.9597, "step": 19150 }, { "epoch": 0.8503225878032409, "grad_norm": 72.54212951660156, "learning_rate": 9.966784296947e-06, "loss": 10.6076, "step": 19160 }, { "epoch": 0.8507663887363324, "grad_norm": 66.57682037353516, "learning_rate": 9.966766960985072e-06, "loss": 10.3762, "step": 19170 }, { "epoch": 0.8512101896694237, "grad_norm": 59.173683166503906, "learning_rate": 9.966749625023145e-06, "loss": 10.1135, "step": 19180 }, { "epoch": 0.8516539906025152, "grad_norm": 74.29920959472656, "learning_rate": 9.966732289061218e-06, "loss": 10.5823, "step": 19190 }, { "epoch": 0.8520977915356067, "grad_norm": 65.05313873291016, "learning_rate": 9.96671495309929e-06, "loss": 10.5004, "step": 19200 }, { "epoch": 0.8525415924686982, "grad_norm": 69.46266174316406, "learning_rate": 9.966697617137362e-06, "loss": 10.02, "step": 19210 }, { "epoch": 0.8529853934017896, "grad_norm": 64.3421859741211, "learning_rate": 9.966680281175436e-06, "loss": 10.7279, "step": 19220 }, { "epoch": 0.8534291943348811, "grad_norm": 69.0867919921875, "learning_rate": 9.966662945213507e-06, "loss": 10.4257, "step": 19230 }, { "epoch": 0.8538729952679726, "grad_norm": 70.24497985839844, "learning_rate": 9.96664560925158e-06, "loss": 10.169, "step": 19240 }, { "epoch": 0.854316796201064, "grad_norm": 67.85358428955078, "learning_rate": 9.966628273289653e-06, "loss": 10.3167, "step": 19250 }, { "epoch": 0.8547605971341554, "grad_norm": 70.9292221069336, "learning_rate": 9.966610937327724e-06, "loss": 11.0968, "step": 19260 }, { "epoch": 0.8552043980672469, "grad_norm": 71.09864044189453, "learning_rate": 9.966593601365798e-06, "loss": 10.5742, "step": 19270 }, { "epoch": 0.8556481990003384, "grad_norm": 69.86164093017578, "learning_rate": 9.96657626540387e-06, "loss": 10.5123, "step": 19280 }, { "epoch": 0.8560919999334299, "grad_norm": 56.01103210449219, "learning_rate": 9.966558929441942e-06, "loss": 10.0181, "step": 19290 }, { "epoch": 0.8565358008665214, "grad_norm": 70.41612243652344, "learning_rate": 9.966541593480015e-06, "loss": 10.4778, "step": 19300 }, { "epoch": 0.8569796017996127, "grad_norm": 66.11145782470703, "learning_rate": 9.966524257518088e-06, "loss": 10.7167, "step": 19310 }, { "epoch": 0.8574234027327042, "grad_norm": 72.80441284179688, "learning_rate": 9.966506921556161e-06, "loss": 10.3861, "step": 19320 }, { "epoch": 0.8578672036657957, "grad_norm": 62.77549362182617, "learning_rate": 9.966489585594233e-06, "loss": 10.2368, "step": 19330 }, { "epoch": 0.8583110045988872, "grad_norm": 68.18376922607422, "learning_rate": 9.966472249632306e-06, "loss": 10.3437, "step": 19340 }, { "epoch": 0.8587548055319786, "grad_norm": 66.24810028076172, "learning_rate": 9.966454913670379e-06, "loss": 10.0901, "step": 19350 }, { "epoch": 0.8591986064650701, "grad_norm": 68.41353607177734, "learning_rate": 9.96643757770845e-06, "loss": 10.6208, "step": 19360 }, { "epoch": 0.8596424073981616, "grad_norm": 61.160438537597656, "learning_rate": 9.966420241746523e-06, "loss": 10.2203, "step": 19370 }, { "epoch": 0.860086208331253, "grad_norm": 66.53337097167969, "learning_rate": 9.966402905784596e-06, "loss": 10.4119, "step": 19380 }, { "epoch": 0.8605300092643445, "grad_norm": 74.49799346923828, "learning_rate": 9.966385569822668e-06, "loss": 10.1698, "step": 19390 }, { "epoch": 0.8609738101974359, "grad_norm": 76.49808502197266, "learning_rate": 9.96636823386074e-06, "loss": 10.356, "step": 19400 }, { "epoch": 0.8614176111305274, "grad_norm": 72.61251068115234, "learning_rate": 9.966350897898814e-06, "loss": 10.0123, "step": 19410 }, { "epoch": 0.8618614120636189, "grad_norm": 69.5442123413086, "learning_rate": 9.966333561936885e-06, "loss": 10.7127, "step": 19420 }, { "epoch": 0.8623052129967104, "grad_norm": 75.41436767578125, "learning_rate": 9.966316225974958e-06, "loss": 10.2879, "step": 19430 }, { "epoch": 0.8627490139298017, "grad_norm": 62.93849563598633, "learning_rate": 9.966298890013031e-06, "loss": 10.1251, "step": 19440 }, { "epoch": 0.8631928148628932, "grad_norm": 61.55092239379883, "learning_rate": 9.966281554051103e-06, "loss": 10.4005, "step": 19450 }, { "epoch": 0.8636366157959847, "grad_norm": 64.35807037353516, "learning_rate": 9.966264218089176e-06, "loss": 10.3085, "step": 19460 }, { "epoch": 0.8640804167290762, "grad_norm": 63.883033752441406, "learning_rate": 9.966246882127249e-06, "loss": 10.7861, "step": 19470 }, { "epoch": 0.8645242176621676, "grad_norm": 62.51860809326172, "learning_rate": 9.96622954616532e-06, "loss": 10.5587, "step": 19480 }, { "epoch": 0.8649680185952591, "grad_norm": 67.6877212524414, "learning_rate": 9.966212210203393e-06, "loss": 10.5259, "step": 19490 }, { "epoch": 0.8654118195283506, "grad_norm": 55.97256851196289, "learning_rate": 9.966194874241466e-06, "loss": 10.0621, "step": 19500 }, { "epoch": 0.865855620461442, "grad_norm": 66.48442077636719, "learning_rate": 9.966177538279538e-06, "loss": 10.1117, "step": 19510 }, { "epoch": 0.8662994213945335, "grad_norm": 71.5040512084961, "learning_rate": 9.966160202317611e-06, "loss": 10.0883, "step": 19520 }, { "epoch": 0.8667432223276249, "grad_norm": 62.448360443115234, "learning_rate": 9.966142866355684e-06, "loss": 9.8626, "step": 19530 }, { "epoch": 0.8671870232607164, "grad_norm": 70.04524230957031, "learning_rate": 9.966125530393757e-06, "loss": 10.3496, "step": 19540 }, { "epoch": 0.8676308241938079, "grad_norm": 68.1249008178711, "learning_rate": 9.966108194431828e-06, "loss": 10.2206, "step": 19550 }, { "epoch": 0.8680746251268994, "grad_norm": 60.323795318603516, "learning_rate": 9.966090858469902e-06, "loss": 10.545, "step": 19560 }, { "epoch": 0.8685184260599907, "grad_norm": 62.9224853515625, "learning_rate": 9.966073522507975e-06, "loss": 10.4674, "step": 19570 }, { "epoch": 0.8689622269930822, "grad_norm": 60.80291748046875, "learning_rate": 9.966056186546046e-06, "loss": 10.1987, "step": 19580 }, { "epoch": 0.8694060279261737, "grad_norm": 53.80615234375, "learning_rate": 9.966038850584119e-06, "loss": 10.4711, "step": 19590 }, { "epoch": 0.8698498288592652, "grad_norm": 58.71421813964844, "learning_rate": 9.966021514622192e-06, "loss": 10.2597, "step": 19600 }, { "epoch": 0.8702936297923567, "grad_norm": 60.54587936401367, "learning_rate": 9.966004178660264e-06, "loss": 10.3637, "step": 19610 }, { "epoch": 0.8707374307254481, "grad_norm": 58.478153228759766, "learning_rate": 9.965986842698337e-06, "loss": 10.1861, "step": 19620 }, { "epoch": 0.8711812316585396, "grad_norm": 70.70100402832031, "learning_rate": 9.96596950673641e-06, "loss": 9.8945, "step": 19630 }, { "epoch": 0.871625032591631, "grad_norm": 64.967041015625, "learning_rate": 9.965952170774481e-06, "loss": 10.0128, "step": 19640 }, { "epoch": 0.8720688335247225, "grad_norm": 67.52765655517578, "learning_rate": 9.965934834812554e-06, "loss": 10.5425, "step": 19650 }, { "epoch": 0.8725126344578139, "grad_norm": 63.967247009277344, "learning_rate": 9.965917498850627e-06, "loss": 10.6763, "step": 19660 }, { "epoch": 0.8729564353909054, "grad_norm": 71.41963958740234, "learning_rate": 9.965900162888699e-06, "loss": 10.6471, "step": 19670 }, { "epoch": 0.8734002363239969, "grad_norm": 60.292701721191406, "learning_rate": 9.965882826926772e-06, "loss": 10.1802, "step": 19680 }, { "epoch": 0.8738440372570884, "grad_norm": 63.403560638427734, "learning_rate": 9.965865490964845e-06, "loss": 10.2882, "step": 19690 }, { "epoch": 0.8742878381901797, "grad_norm": 65.6253662109375, "learning_rate": 9.965848155002916e-06, "loss": 10.2649, "step": 19700 }, { "epoch": 0.8747316391232712, "grad_norm": 61.846683502197266, "learning_rate": 9.96583081904099e-06, "loss": 10.2564, "step": 19710 }, { "epoch": 0.8751754400563627, "grad_norm": 65.23727416992188, "learning_rate": 9.965813483079062e-06, "loss": 10.5773, "step": 19720 }, { "epoch": 0.8756192409894542, "grad_norm": 57.89336395263672, "learning_rate": 9.965796147117134e-06, "loss": 10.3047, "step": 19730 }, { "epoch": 0.8760630419225457, "grad_norm": 63.75661849975586, "learning_rate": 9.965778811155207e-06, "loss": 9.6075, "step": 19740 }, { "epoch": 0.8765068428556371, "grad_norm": 52.67669677734375, "learning_rate": 9.96576147519328e-06, "loss": 10.1751, "step": 19750 }, { "epoch": 0.8769506437887286, "grad_norm": 62.90458297729492, "learning_rate": 9.965744139231353e-06, "loss": 9.9069, "step": 19760 }, { "epoch": 0.87739444472182, "grad_norm": 75.43021392822266, "learning_rate": 9.965726803269424e-06, "loss": 10.7198, "step": 19770 }, { "epoch": 0.8778382456549115, "grad_norm": 65.16674041748047, "learning_rate": 9.965709467307497e-06, "loss": 9.8234, "step": 19780 }, { "epoch": 0.8782820465880029, "grad_norm": 55.970890045166016, "learning_rate": 9.96569213134557e-06, "loss": 10.2538, "step": 19790 }, { "epoch": 0.8787258475210944, "grad_norm": 63.947113037109375, "learning_rate": 9.965674795383642e-06, "loss": 10.4209, "step": 19800 }, { "epoch": 0.8791696484541859, "grad_norm": 65.56307983398438, "learning_rate": 9.965657459421715e-06, "loss": 9.7725, "step": 19810 }, { "epoch": 0.8796134493872774, "grad_norm": 66.70881652832031, "learning_rate": 9.965640123459788e-06, "loss": 10.7015, "step": 19820 }, { "epoch": 0.8800572503203687, "grad_norm": 65.10243225097656, "learning_rate": 9.96562278749786e-06, "loss": 10.1794, "step": 19830 }, { "epoch": 0.8805010512534602, "grad_norm": 58.50627517700195, "learning_rate": 9.965605451535932e-06, "loss": 10.758, "step": 19840 }, { "epoch": 0.8809448521865517, "grad_norm": 66.39806365966797, "learning_rate": 9.965588115574006e-06, "loss": 9.9667, "step": 19850 }, { "epoch": 0.8813886531196432, "grad_norm": 78.95050048828125, "learning_rate": 9.965570779612077e-06, "loss": 10.5293, "step": 19860 }, { "epoch": 0.8818324540527347, "grad_norm": 63.072444915771484, "learning_rate": 9.96555344365015e-06, "loss": 10.1306, "step": 19870 }, { "epoch": 0.8822762549858261, "grad_norm": 61.44473648071289, "learning_rate": 9.965536107688223e-06, "loss": 10.3155, "step": 19880 }, { "epoch": 0.8827200559189176, "grad_norm": 70.35446166992188, "learning_rate": 9.965518771726294e-06, "loss": 10.2451, "step": 19890 }, { "epoch": 0.883163856852009, "grad_norm": 58.564395904541016, "learning_rate": 9.965501435764368e-06, "loss": 10.1813, "step": 19900 }, { "epoch": 0.8836076577851005, "grad_norm": 64.06719970703125, "learning_rate": 9.96548409980244e-06, "loss": 10.2219, "step": 19910 }, { "epoch": 0.8840514587181919, "grad_norm": 57.828590393066406, "learning_rate": 9.965466763840512e-06, "loss": 10.4087, "step": 19920 }, { "epoch": 0.8844952596512834, "grad_norm": 61.435123443603516, "learning_rate": 9.965449427878585e-06, "loss": 10.3527, "step": 19930 }, { "epoch": 0.8849390605843749, "grad_norm": 61.76189041137695, "learning_rate": 9.965432091916658e-06, "loss": 10.3732, "step": 19940 }, { "epoch": 0.8853828615174664, "grad_norm": 62.846946716308594, "learning_rate": 9.96541475595473e-06, "loss": 10.2244, "step": 19950 }, { "epoch": 0.8858266624505579, "grad_norm": 63.24193572998047, "learning_rate": 9.965397419992803e-06, "loss": 10.1221, "step": 19960 }, { "epoch": 0.8862704633836492, "grad_norm": 56.531044006347656, "learning_rate": 9.965380084030876e-06, "loss": 9.9695, "step": 19970 }, { "epoch": 0.8867142643167407, "grad_norm": 59.855491638183594, "learning_rate": 9.965362748068949e-06, "loss": 10.4246, "step": 19980 }, { "epoch": 0.8871580652498322, "grad_norm": 56.98590850830078, "learning_rate": 9.96534541210702e-06, "loss": 10.5376, "step": 19990 }, { "epoch": 0.8876018661829237, "grad_norm": 64.37902069091797, "learning_rate": 9.965328076145093e-06, "loss": 9.9193, "step": 20000 }, { "epoch": 0.8876018661829237, "eval_loss": 0.32305407524108887, "eval_runtime": 673.2893, "eval_samples_per_second": 1803.669, "eval_steps_per_second": 56.365, "step": 20000 }, { "epoch": 0.8880456671160151, "grad_norm": 65.7813491821289, "learning_rate": 9.965310740183166e-06, "loss": 10.058, "step": 20010 }, { "epoch": 0.8884894680491066, "grad_norm": 56.84501647949219, "learning_rate": 9.965293404221238e-06, "loss": 10.5991, "step": 20020 }, { "epoch": 0.888933268982198, "grad_norm": 55.88136672973633, "learning_rate": 9.96527606825931e-06, "loss": 10.2387, "step": 20030 }, { "epoch": 0.8893770699152895, "grad_norm": 65.21720886230469, "learning_rate": 9.965258732297384e-06, "loss": 10.6988, "step": 20040 }, { "epoch": 0.8898208708483809, "grad_norm": 57.60173416137695, "learning_rate": 9.965241396335455e-06, "loss": 10.0901, "step": 20050 }, { "epoch": 0.8902646717814724, "grad_norm": 66.22525787353516, "learning_rate": 9.965224060373528e-06, "loss": 9.9539, "step": 20060 }, { "epoch": 0.8907084727145639, "grad_norm": 59.16026306152344, "learning_rate": 9.965206724411601e-06, "loss": 10.2055, "step": 20070 }, { "epoch": 0.8911522736476554, "grad_norm": 63.042850494384766, "learning_rate": 9.965189388449673e-06, "loss": 10.608, "step": 20080 }, { "epoch": 0.8915960745807469, "grad_norm": 70.37860870361328, "learning_rate": 9.965172052487746e-06, "loss": 10.2199, "step": 20090 }, { "epoch": 0.8920398755138382, "grad_norm": 59.544456481933594, "learning_rate": 9.965154716525819e-06, "loss": 10.625, "step": 20100 }, { "epoch": 0.8924836764469297, "grad_norm": 59.02389907836914, "learning_rate": 9.96513738056389e-06, "loss": 10.6333, "step": 20110 }, { "epoch": 0.8929274773800212, "grad_norm": 55.048667907714844, "learning_rate": 9.965120044601963e-06, "loss": 9.668, "step": 20120 }, { "epoch": 0.8933712783131127, "grad_norm": 70.93208312988281, "learning_rate": 9.965102708640036e-06, "loss": 10.0719, "step": 20130 }, { "epoch": 0.8938150792462041, "grad_norm": 60.164588928222656, "learning_rate": 9.965085372678108e-06, "loss": 10.5491, "step": 20140 }, { "epoch": 0.8942588801792956, "grad_norm": 63.05065155029297, "learning_rate": 9.965068036716181e-06, "loss": 10.5735, "step": 20150 }, { "epoch": 0.894702681112387, "grad_norm": 66.36126708984375, "learning_rate": 9.965050700754254e-06, "loss": 10.0222, "step": 20160 }, { "epoch": 0.8951464820454785, "grad_norm": 65.72769165039062, "learning_rate": 9.965033364792325e-06, "loss": 9.764, "step": 20170 }, { "epoch": 0.8955902829785699, "grad_norm": 65.65251922607422, "learning_rate": 9.965016028830398e-06, "loss": 10.2024, "step": 20180 }, { "epoch": 0.8960340839116614, "grad_norm": 75.98188018798828, "learning_rate": 9.964998692868472e-06, "loss": 10.6188, "step": 20190 }, { "epoch": 0.8964778848447529, "grad_norm": 53.20927429199219, "learning_rate": 9.964981356906545e-06, "loss": 10.6707, "step": 20200 }, { "epoch": 0.8969216857778444, "grad_norm": 68.85663604736328, "learning_rate": 9.964964020944616e-06, "loss": 10.2216, "step": 20210 }, { "epoch": 0.8973654867109359, "grad_norm": 60.16876983642578, "learning_rate": 9.964946684982689e-06, "loss": 9.6589, "step": 20220 }, { "epoch": 0.8978092876440272, "grad_norm": 63.93645477294922, "learning_rate": 9.964929349020762e-06, "loss": 10.0363, "step": 20230 }, { "epoch": 0.8982530885771187, "grad_norm": 60.31320571899414, "learning_rate": 9.964912013058834e-06, "loss": 10.2793, "step": 20240 }, { "epoch": 0.8986968895102102, "grad_norm": 57.123626708984375, "learning_rate": 9.964894677096907e-06, "loss": 10.699, "step": 20250 }, { "epoch": 0.8991406904433017, "grad_norm": 67.34060668945312, "learning_rate": 9.96487734113498e-06, "loss": 10.715, "step": 20260 }, { "epoch": 0.8995844913763931, "grad_norm": 60.88409423828125, "learning_rate": 9.964860005173051e-06, "loss": 10.485, "step": 20270 }, { "epoch": 0.9000282923094846, "grad_norm": 59.35341262817383, "learning_rate": 9.964842669211124e-06, "loss": 9.8806, "step": 20280 }, { "epoch": 0.900472093242576, "grad_norm": 70.66928100585938, "learning_rate": 9.964825333249197e-06, "loss": 10.1476, "step": 20290 }, { "epoch": 0.9009158941756675, "grad_norm": 57.342830657958984, "learning_rate": 9.964807997287269e-06, "loss": 10.0399, "step": 20300 }, { "epoch": 0.901359695108759, "grad_norm": 72.41960906982422, "learning_rate": 9.964790661325342e-06, "loss": 10.3914, "step": 20310 }, { "epoch": 0.9018034960418504, "grad_norm": 66.77056121826172, "learning_rate": 9.964773325363415e-06, "loss": 10.1905, "step": 20320 }, { "epoch": 0.9022472969749419, "grad_norm": 72.86076354980469, "learning_rate": 9.964755989401486e-06, "loss": 10.1389, "step": 20330 }, { "epoch": 0.9026910979080334, "grad_norm": 64.16802215576172, "learning_rate": 9.96473865343956e-06, "loss": 10.4992, "step": 20340 }, { "epoch": 0.9031348988411249, "grad_norm": 71.86752319335938, "learning_rate": 9.964721317477632e-06, "loss": 9.9231, "step": 20350 }, { "epoch": 0.9035786997742162, "grad_norm": 69.46532440185547, "learning_rate": 9.964703981515704e-06, "loss": 10.2235, "step": 20360 }, { "epoch": 0.9040225007073077, "grad_norm": 58.126220703125, "learning_rate": 9.964686645553777e-06, "loss": 9.7999, "step": 20370 }, { "epoch": 0.9044663016403992, "grad_norm": 60.532958984375, "learning_rate": 9.96466930959185e-06, "loss": 10.689, "step": 20380 }, { "epoch": 0.9049101025734907, "grad_norm": 68.76100158691406, "learning_rate": 9.964651973629921e-06, "loss": 10.3952, "step": 20390 }, { "epoch": 0.9053539035065821, "grad_norm": 68.4054183959961, "learning_rate": 9.964634637667994e-06, "loss": 10.5912, "step": 20400 }, { "epoch": 0.9057977044396736, "grad_norm": 58.026851654052734, "learning_rate": 9.964617301706067e-06, "loss": 10.5631, "step": 20410 }, { "epoch": 0.906241505372765, "grad_norm": 68.02127838134766, "learning_rate": 9.96459996574414e-06, "loss": 10.4884, "step": 20420 }, { "epoch": 0.9066853063058565, "grad_norm": 71.5272216796875, "learning_rate": 9.964582629782212e-06, "loss": 10.2185, "step": 20430 }, { "epoch": 0.907129107238948, "grad_norm": 69.71690368652344, "learning_rate": 9.964565293820285e-06, "loss": 9.6736, "step": 20440 }, { "epoch": 0.9075729081720394, "grad_norm": 71.79097747802734, "learning_rate": 9.964547957858358e-06, "loss": 10.3408, "step": 20450 }, { "epoch": 0.9080167091051309, "grad_norm": 62.07600402832031, "learning_rate": 9.96453062189643e-06, "loss": 10.1348, "step": 20460 }, { "epoch": 0.9084605100382224, "grad_norm": 70.9972152709961, "learning_rate": 9.964513285934503e-06, "loss": 10.3505, "step": 20470 }, { "epoch": 0.9089043109713139, "grad_norm": 62.994075775146484, "learning_rate": 9.964495949972576e-06, "loss": 10.8796, "step": 20480 }, { "epoch": 0.9093481119044052, "grad_norm": 57.46833801269531, "learning_rate": 9.964478614010647e-06, "loss": 10.4861, "step": 20490 }, { "epoch": 0.9097919128374967, "grad_norm": 71.40296173095703, "learning_rate": 9.96446127804872e-06, "loss": 9.9325, "step": 20500 }, { "epoch": 0.9102357137705882, "grad_norm": 71.27222442626953, "learning_rate": 9.964443942086793e-06, "loss": 10.204, "step": 20510 }, { "epoch": 0.9106795147036797, "grad_norm": 76.38119506835938, "learning_rate": 9.964426606124865e-06, "loss": 9.9359, "step": 20520 }, { "epoch": 0.9111233156367712, "grad_norm": 60.81074905395508, "learning_rate": 9.964409270162938e-06, "loss": 10.2593, "step": 20530 }, { "epoch": 0.9115671165698626, "grad_norm": 71.3153305053711, "learning_rate": 9.96439193420101e-06, "loss": 10.3253, "step": 20540 }, { "epoch": 0.912010917502954, "grad_norm": 68.32502746582031, "learning_rate": 9.964374598239082e-06, "loss": 10.073, "step": 20550 }, { "epoch": 0.9124547184360455, "grad_norm": 60.2060546875, "learning_rate": 9.964357262277155e-06, "loss": 10.3595, "step": 20560 }, { "epoch": 0.912898519369137, "grad_norm": 69.63631439208984, "learning_rate": 9.964339926315228e-06, "loss": 10.1938, "step": 20570 }, { "epoch": 0.9133423203022284, "grad_norm": 57.60273742675781, "learning_rate": 9.9643225903533e-06, "loss": 10.612, "step": 20580 }, { "epoch": 0.9137861212353199, "grad_norm": 58.673763275146484, "learning_rate": 9.964305254391373e-06, "loss": 9.8792, "step": 20590 }, { "epoch": 0.9142299221684114, "grad_norm": 65.37464141845703, "learning_rate": 9.964287918429446e-06, "loss": 10.0737, "step": 20600 }, { "epoch": 0.9146737231015029, "grad_norm": 63.91664123535156, "learning_rate": 9.964270582467517e-06, "loss": 9.939, "step": 20610 }, { "epoch": 0.9151175240345942, "grad_norm": 69.06259155273438, "learning_rate": 9.96425324650559e-06, "loss": 10.422, "step": 20620 }, { "epoch": 0.9155613249676857, "grad_norm": 68.58475494384766, "learning_rate": 9.964235910543663e-06, "loss": 9.9517, "step": 20630 }, { "epoch": 0.9160051259007772, "grad_norm": 61.14803695678711, "learning_rate": 9.964218574581736e-06, "loss": 10.2273, "step": 20640 }, { "epoch": 0.9164489268338687, "grad_norm": 61.80668258666992, "learning_rate": 9.964201238619808e-06, "loss": 10.4235, "step": 20650 }, { "epoch": 0.9168927277669602, "grad_norm": 68.1749038696289, "learning_rate": 9.96418390265788e-06, "loss": 10.5679, "step": 20660 }, { "epoch": 0.9173365287000516, "grad_norm": 58.40181350708008, "learning_rate": 9.964166566695954e-06, "loss": 10.4709, "step": 20670 }, { "epoch": 0.917780329633143, "grad_norm": 59.180511474609375, "learning_rate": 9.964149230734025e-06, "loss": 9.9807, "step": 20680 }, { "epoch": 0.9182241305662345, "grad_norm": 73.31056213378906, "learning_rate": 9.964131894772098e-06, "loss": 10.3648, "step": 20690 }, { "epoch": 0.918667931499326, "grad_norm": 69.01880645751953, "learning_rate": 9.964114558810171e-06, "loss": 9.9768, "step": 20700 }, { "epoch": 0.9191117324324174, "grad_norm": 68.61119079589844, "learning_rate": 9.964097222848243e-06, "loss": 10.1686, "step": 20710 }, { "epoch": 0.9195555333655089, "grad_norm": 62.651973724365234, "learning_rate": 9.964079886886316e-06, "loss": 10.1275, "step": 20720 }, { "epoch": 0.9199993342986004, "grad_norm": 62.72435760498047, "learning_rate": 9.964062550924389e-06, "loss": 10.2096, "step": 20730 }, { "epoch": 0.9204431352316919, "grad_norm": 57.11748504638672, "learning_rate": 9.96404521496246e-06, "loss": 10.5196, "step": 20740 }, { "epoch": 0.9208869361647832, "grad_norm": 64.23450469970703, "learning_rate": 9.964027879000533e-06, "loss": 10.4117, "step": 20750 }, { "epoch": 0.9213307370978747, "grad_norm": 69.50020599365234, "learning_rate": 9.964010543038607e-06, "loss": 10.4837, "step": 20760 }, { "epoch": 0.9217745380309662, "grad_norm": 55.89120101928711, "learning_rate": 9.963993207076678e-06, "loss": 9.9386, "step": 20770 }, { "epoch": 0.9222183389640577, "grad_norm": 59.105995178222656, "learning_rate": 9.963975871114751e-06, "loss": 10.2322, "step": 20780 }, { "epoch": 0.9226621398971492, "grad_norm": 55.461021423339844, "learning_rate": 9.963958535152824e-06, "loss": 10.1765, "step": 20790 }, { "epoch": 0.9231059408302406, "grad_norm": 76.23094940185547, "learning_rate": 9.963941199190897e-06, "loss": 10.2576, "step": 20800 }, { "epoch": 0.923549741763332, "grad_norm": 72.75599670410156, "learning_rate": 9.963923863228969e-06, "loss": 10.7945, "step": 20810 }, { "epoch": 0.9239935426964235, "grad_norm": 61.83809280395508, "learning_rate": 9.963906527267042e-06, "loss": 10.4771, "step": 20820 }, { "epoch": 0.924437343629515, "grad_norm": 58.66376876831055, "learning_rate": 9.963889191305115e-06, "loss": 10.0925, "step": 20830 }, { "epoch": 0.9248811445626064, "grad_norm": 70.21393585205078, "learning_rate": 9.963871855343186e-06, "loss": 10.2228, "step": 20840 }, { "epoch": 0.9253249454956979, "grad_norm": 55.493282318115234, "learning_rate": 9.96385451938126e-06, "loss": 10.1143, "step": 20850 }, { "epoch": 0.9257687464287894, "grad_norm": 58.75727081298828, "learning_rate": 9.963837183419332e-06, "loss": 10.2045, "step": 20860 }, { "epoch": 0.9262125473618809, "grad_norm": 60.97832107543945, "learning_rate": 9.963819847457404e-06, "loss": 10.176, "step": 20870 }, { "epoch": 0.9266563482949723, "grad_norm": 66.70062255859375, "learning_rate": 9.963802511495477e-06, "loss": 10.8102, "step": 20880 }, { "epoch": 0.9271001492280637, "grad_norm": 59.54788589477539, "learning_rate": 9.96378517553355e-06, "loss": 9.8927, "step": 20890 }, { "epoch": 0.9275439501611552, "grad_norm": 56.45012283325195, "learning_rate": 9.963767839571621e-06, "loss": 10.2811, "step": 20900 }, { "epoch": 0.9279877510942467, "grad_norm": 63.22991943359375, "learning_rate": 9.963750503609694e-06, "loss": 10.0369, "step": 20910 }, { "epoch": 0.9284315520273382, "grad_norm": 66.25374603271484, "learning_rate": 9.963733167647767e-06, "loss": 10.2809, "step": 20920 }, { "epoch": 0.9288753529604296, "grad_norm": 65.86581420898438, "learning_rate": 9.96371583168584e-06, "loss": 10.2681, "step": 20930 }, { "epoch": 0.929319153893521, "grad_norm": 68.9689712524414, "learning_rate": 9.963698495723912e-06, "loss": 10.6552, "step": 20940 }, { "epoch": 0.9297629548266125, "grad_norm": 71.05926513671875, "learning_rate": 9.963681159761985e-06, "loss": 9.7715, "step": 20950 }, { "epoch": 0.930206755759704, "grad_norm": 76.48685455322266, "learning_rate": 9.963663823800058e-06, "loss": 10.4682, "step": 20960 }, { "epoch": 0.9306505566927954, "grad_norm": 63.782257080078125, "learning_rate": 9.96364648783813e-06, "loss": 10.3633, "step": 20970 }, { "epoch": 0.9310943576258869, "grad_norm": 71.73765563964844, "learning_rate": 9.963629151876202e-06, "loss": 10.465, "step": 20980 }, { "epoch": 0.9315381585589784, "grad_norm": 60.337677001953125, "learning_rate": 9.963611815914275e-06, "loss": 9.9345, "step": 20990 }, { "epoch": 0.9319819594920699, "grad_norm": 58.7932243347168, "learning_rate": 9.963594479952347e-06, "loss": 9.7854, "step": 21000 }, { "epoch": 0.9319819594920699, "eval_loss": 0.32051748037338257, "eval_runtime": 673.8843, "eval_samples_per_second": 1802.076, "eval_steps_per_second": 56.315, "step": 21000 }, { "epoch": 0.9324257604251613, "grad_norm": 61.75080871582031, "learning_rate": 9.96357714399042e-06, "loss": 10.7108, "step": 21010 }, { "epoch": 0.9328695613582527, "grad_norm": 66.45942687988281, "learning_rate": 9.963559808028493e-06, "loss": 9.7115, "step": 21020 }, { "epoch": 0.9333133622913442, "grad_norm": 66.86365509033203, "learning_rate": 9.963542472066564e-06, "loss": 10.4903, "step": 21030 }, { "epoch": 0.9337571632244357, "grad_norm": 59.50422668457031, "learning_rate": 9.963525136104637e-06, "loss": 10.2654, "step": 21040 }, { "epoch": 0.9342009641575272, "grad_norm": 65.55677032470703, "learning_rate": 9.96350780014271e-06, "loss": 10.2086, "step": 21050 }, { "epoch": 0.9346447650906186, "grad_norm": 60.787967681884766, "learning_rate": 9.963490464180784e-06, "loss": 10.4747, "step": 21060 }, { "epoch": 0.93508856602371, "grad_norm": 58.43632507324219, "learning_rate": 9.963473128218855e-06, "loss": 10.1393, "step": 21070 }, { "epoch": 0.9355323669568015, "grad_norm": 80.03214263916016, "learning_rate": 9.963455792256928e-06, "loss": 10.1442, "step": 21080 }, { "epoch": 0.935976167889893, "grad_norm": 58.377357482910156, "learning_rate": 9.963438456295001e-06, "loss": 10.3524, "step": 21090 }, { "epoch": 0.9364199688229845, "grad_norm": 62.32807159423828, "learning_rate": 9.963421120333073e-06, "loss": 10.1192, "step": 21100 }, { "epoch": 0.9368637697560759, "grad_norm": 62.65857696533203, "learning_rate": 9.963403784371146e-06, "loss": 10.2389, "step": 21110 }, { "epoch": 0.9373075706891674, "grad_norm": 57.880252838134766, "learning_rate": 9.963386448409219e-06, "loss": 10.244, "step": 21120 }, { "epoch": 0.9377513716222589, "grad_norm": 65.94375610351562, "learning_rate": 9.96336911244729e-06, "loss": 10.4006, "step": 21130 }, { "epoch": 0.9381951725553503, "grad_norm": 57.031944274902344, "learning_rate": 9.963351776485363e-06, "loss": 10.1134, "step": 21140 }, { "epoch": 0.9386389734884417, "grad_norm": 54.85254669189453, "learning_rate": 9.963334440523436e-06, "loss": 10.1874, "step": 21150 }, { "epoch": 0.9390827744215332, "grad_norm": 61.38536834716797, "learning_rate": 9.963317104561508e-06, "loss": 10.2983, "step": 21160 }, { "epoch": 0.9395265753546247, "grad_norm": 63.140010833740234, "learning_rate": 9.96329976859958e-06, "loss": 10.518, "step": 21170 }, { "epoch": 0.9399703762877162, "grad_norm": 64.6523208618164, "learning_rate": 9.963282432637654e-06, "loss": 9.8054, "step": 21180 }, { "epoch": 0.9404141772208076, "grad_norm": 64.04774475097656, "learning_rate": 9.963265096675727e-06, "loss": 10.1823, "step": 21190 }, { "epoch": 0.940857978153899, "grad_norm": 61.016780853271484, "learning_rate": 9.963247760713798e-06, "loss": 10.0684, "step": 21200 }, { "epoch": 0.9413017790869905, "grad_norm": 55.790435791015625, "learning_rate": 9.963230424751871e-06, "loss": 9.9463, "step": 21210 }, { "epoch": 0.941745580020082, "grad_norm": 58.64189910888672, "learning_rate": 9.963213088789944e-06, "loss": 9.7533, "step": 21220 }, { "epoch": 0.9421893809531735, "grad_norm": 62.45125961303711, "learning_rate": 9.963195752828016e-06, "loss": 10.2871, "step": 21230 }, { "epoch": 0.9426331818862649, "grad_norm": 57.653106689453125, "learning_rate": 9.963178416866089e-06, "loss": 9.6557, "step": 21240 }, { "epoch": 0.9430769828193564, "grad_norm": 70.4138412475586, "learning_rate": 9.963161080904162e-06, "loss": 9.947, "step": 21250 }, { "epoch": 0.9435207837524479, "grad_norm": 62.09977340698242, "learning_rate": 9.963143744942233e-06, "loss": 10.3042, "step": 21260 }, { "epoch": 0.9439645846855393, "grad_norm": 57.68659973144531, "learning_rate": 9.963126408980306e-06, "loss": 10.1486, "step": 21270 }, { "epoch": 0.9444083856186307, "grad_norm": 58.4940185546875, "learning_rate": 9.96310907301838e-06, "loss": 10.6721, "step": 21280 }, { "epoch": 0.9448521865517222, "grad_norm": 62.28818893432617, "learning_rate": 9.963091737056451e-06, "loss": 10.3957, "step": 21290 }, { "epoch": 0.9452959874848137, "grad_norm": 52.62106704711914, "learning_rate": 9.963074401094524e-06, "loss": 9.9156, "step": 21300 }, { "epoch": 0.9457397884179052, "grad_norm": 55.59827423095703, "learning_rate": 9.963057065132597e-06, "loss": 10.5472, "step": 21310 }, { "epoch": 0.9461835893509966, "grad_norm": 66.30583953857422, "learning_rate": 9.96303972917067e-06, "loss": 10.2904, "step": 21320 }, { "epoch": 0.946627390284088, "grad_norm": 61.947025299072266, "learning_rate": 9.963022393208741e-06, "loss": 10.238, "step": 21330 }, { "epoch": 0.9470711912171795, "grad_norm": 64.70133209228516, "learning_rate": 9.963005057246815e-06, "loss": 10.2199, "step": 21340 }, { "epoch": 0.947514992150271, "grad_norm": 60.59946823120117, "learning_rate": 9.962987721284888e-06, "loss": 10.3811, "step": 21350 }, { "epoch": 0.9479587930833625, "grad_norm": 57.58180618286133, "learning_rate": 9.962970385322959e-06, "loss": 10.4364, "step": 21360 }, { "epoch": 0.9484025940164539, "grad_norm": 63.964500427246094, "learning_rate": 9.962953049361032e-06, "loss": 10.1727, "step": 21370 }, { "epoch": 0.9488463949495454, "grad_norm": 66.83090209960938, "learning_rate": 9.962935713399105e-06, "loss": 10.4908, "step": 21380 }, { "epoch": 0.9492901958826369, "grad_norm": 58.45000457763672, "learning_rate": 9.962918377437177e-06, "loss": 10.3677, "step": 21390 }, { "epoch": 0.9497339968157283, "grad_norm": 62.023040771484375, "learning_rate": 9.96290104147525e-06, "loss": 10.0164, "step": 21400 }, { "epoch": 0.9501777977488197, "grad_norm": 58.94234848022461, "learning_rate": 9.962883705513323e-06, "loss": 10.4101, "step": 21410 }, { "epoch": 0.9506215986819112, "grad_norm": 58.807456970214844, "learning_rate": 9.962866369551396e-06, "loss": 9.882, "step": 21420 }, { "epoch": 0.9510653996150027, "grad_norm": 54.390098571777344, "learning_rate": 9.962849033589467e-06, "loss": 10.6928, "step": 21430 }, { "epoch": 0.9515092005480942, "grad_norm": 65.57588195800781, "learning_rate": 9.96283169762754e-06, "loss": 10.8391, "step": 21440 }, { "epoch": 0.9519530014811857, "grad_norm": 59.871700286865234, "learning_rate": 9.962814361665613e-06, "loss": 10.1765, "step": 21450 }, { "epoch": 0.952396802414277, "grad_norm": 61.5579948425293, "learning_rate": 9.962797025703685e-06, "loss": 10.1676, "step": 21460 }, { "epoch": 0.9528406033473685, "grad_norm": 50.21920394897461, "learning_rate": 9.962779689741758e-06, "loss": 10.3828, "step": 21470 }, { "epoch": 0.95328440428046, "grad_norm": 59.22177505493164, "learning_rate": 9.962762353779831e-06, "loss": 10.3205, "step": 21480 }, { "epoch": 0.9537282052135515, "grad_norm": 66.43260955810547, "learning_rate": 9.962745017817902e-06, "loss": 9.9529, "step": 21490 }, { "epoch": 0.9541720061466429, "grad_norm": 52.945499420166016, "learning_rate": 9.962727681855975e-06, "loss": 10.0895, "step": 21500 }, { "epoch": 0.9546158070797344, "grad_norm": 65.87628173828125, "learning_rate": 9.962710345894048e-06, "loss": 9.9705, "step": 21510 }, { "epoch": 0.9550596080128259, "grad_norm": 60.30337142944336, "learning_rate": 9.96269300993212e-06, "loss": 10.2607, "step": 21520 }, { "epoch": 0.9555034089459173, "grad_norm": 69.21615600585938, "learning_rate": 9.962675673970193e-06, "loss": 9.8337, "step": 21530 }, { "epoch": 0.9559472098790087, "grad_norm": 60.485984802246094, "learning_rate": 9.962658338008266e-06, "loss": 10.2629, "step": 21540 }, { "epoch": 0.9563910108121002, "grad_norm": 57.54688262939453, "learning_rate": 9.962641002046339e-06, "loss": 10.5513, "step": 21550 }, { "epoch": 0.9568348117451917, "grad_norm": 56.43334197998047, "learning_rate": 9.96262366608441e-06, "loss": 10.5496, "step": 21560 }, { "epoch": 0.9572786126782832, "grad_norm": 69.3460464477539, "learning_rate": 9.962606330122484e-06, "loss": 10.0796, "step": 21570 }, { "epoch": 0.9577224136113747, "grad_norm": 59.36885070800781, "learning_rate": 9.962588994160557e-06, "loss": 9.7917, "step": 21580 }, { "epoch": 0.958166214544466, "grad_norm": 50.145694732666016, "learning_rate": 9.962571658198628e-06, "loss": 10.1281, "step": 21590 }, { "epoch": 0.9586100154775575, "grad_norm": 63.263710021972656, "learning_rate": 9.962554322236701e-06, "loss": 10.6324, "step": 21600 }, { "epoch": 0.959053816410649, "grad_norm": 64.55142211914062, "learning_rate": 9.962536986274774e-06, "loss": 10.3349, "step": 21610 }, { "epoch": 0.9594976173437405, "grad_norm": 69.35453796386719, "learning_rate": 9.962519650312846e-06, "loss": 10.1361, "step": 21620 }, { "epoch": 0.9599414182768319, "grad_norm": 54.69525909423828, "learning_rate": 9.962502314350919e-06, "loss": 10.561, "step": 21630 }, { "epoch": 0.9603852192099234, "grad_norm": 58.769649505615234, "learning_rate": 9.962484978388992e-06, "loss": 9.6632, "step": 21640 }, { "epoch": 0.9608290201430149, "grad_norm": 62.73846435546875, "learning_rate": 9.962467642427063e-06, "loss": 10.144, "step": 21650 }, { "epoch": 0.9612728210761063, "grad_norm": 56.944557189941406, "learning_rate": 9.962450306465136e-06, "loss": 10.048, "step": 21660 }, { "epoch": 0.9617166220091977, "grad_norm": 51.97823715209961, "learning_rate": 9.96243297050321e-06, "loss": 9.9427, "step": 21670 }, { "epoch": 0.9621604229422892, "grad_norm": 60.300987243652344, "learning_rate": 9.96241563454128e-06, "loss": 10.6487, "step": 21680 }, { "epoch": 0.9626042238753807, "grad_norm": 61.2890739440918, "learning_rate": 9.962398298579354e-06, "loss": 10.0074, "step": 21690 }, { "epoch": 0.9630480248084722, "grad_norm": 53.57798767089844, "learning_rate": 9.962380962617427e-06, "loss": 10.3964, "step": 21700 }, { "epoch": 0.9634918257415637, "grad_norm": 63.061988830566406, "learning_rate": 9.9623636266555e-06, "loss": 10.4485, "step": 21710 }, { "epoch": 0.963935626674655, "grad_norm": 60.63272476196289, "learning_rate": 9.962346290693571e-06, "loss": 10.3628, "step": 21720 }, { "epoch": 0.9643794276077465, "grad_norm": 69.05794525146484, "learning_rate": 9.962328954731644e-06, "loss": 10.0831, "step": 21730 }, { "epoch": 0.964823228540838, "grad_norm": 63.956844329833984, "learning_rate": 9.962311618769717e-06, "loss": 10.7723, "step": 21740 }, { "epoch": 0.9652670294739295, "grad_norm": 58.707271575927734, "learning_rate": 9.962294282807789e-06, "loss": 9.9793, "step": 21750 }, { "epoch": 0.9657108304070209, "grad_norm": 60.982521057128906, "learning_rate": 9.962276946845862e-06, "loss": 10.037, "step": 21760 }, { "epoch": 0.9661546313401124, "grad_norm": 58.97859191894531, "learning_rate": 9.962259610883935e-06, "loss": 10.2802, "step": 21770 }, { "epoch": 0.9665984322732039, "grad_norm": 59.34490966796875, "learning_rate": 9.962242274922006e-06, "loss": 10.1092, "step": 21780 }, { "epoch": 0.9670422332062953, "grad_norm": 72.03850555419922, "learning_rate": 9.96222493896008e-06, "loss": 10.4551, "step": 21790 }, { "epoch": 0.9674860341393868, "grad_norm": 64.43966674804688, "learning_rate": 9.962207602998152e-06, "loss": 10.2547, "step": 21800 }, { "epoch": 0.9679298350724782, "grad_norm": 58.295806884765625, "learning_rate": 9.962190267036224e-06, "loss": 10.3897, "step": 21810 }, { "epoch": 0.9683736360055697, "grad_norm": 64.32368469238281, "learning_rate": 9.962172931074297e-06, "loss": 10.4014, "step": 21820 }, { "epoch": 0.9688174369386612, "grad_norm": 61.49608612060547, "learning_rate": 9.96215559511237e-06, "loss": 10.3173, "step": 21830 }, { "epoch": 0.9692612378717527, "grad_norm": 66.80955505371094, "learning_rate": 9.962138259150441e-06, "loss": 10.3526, "step": 21840 }, { "epoch": 0.969705038804844, "grad_norm": 60.56246566772461, "learning_rate": 9.962120923188514e-06, "loss": 9.8569, "step": 21850 }, { "epoch": 0.9701488397379355, "grad_norm": 57.73928451538086, "learning_rate": 9.962103587226588e-06, "loss": 10.3068, "step": 21860 }, { "epoch": 0.970592640671027, "grad_norm": 54.70594024658203, "learning_rate": 9.962086251264659e-06, "loss": 10.054, "step": 21870 }, { "epoch": 0.9710364416041185, "grad_norm": 54.87747573852539, "learning_rate": 9.962068915302732e-06, "loss": 9.9986, "step": 21880 }, { "epoch": 0.9714802425372099, "grad_norm": 60.417457580566406, "learning_rate": 9.962051579340805e-06, "loss": 10.3474, "step": 21890 }, { "epoch": 0.9719240434703014, "grad_norm": 68.63028717041016, "learning_rate": 9.962034243378876e-06, "loss": 10.0733, "step": 21900 }, { "epoch": 0.9723678444033929, "grad_norm": 63.004581451416016, "learning_rate": 9.96201690741695e-06, "loss": 10.58, "step": 21910 }, { "epoch": 0.9728116453364843, "grad_norm": 72.34359741210938, "learning_rate": 9.961999571455023e-06, "loss": 10.2606, "step": 21920 }, { "epoch": 0.9732554462695758, "grad_norm": 66.6717758178711, "learning_rate": 9.961982235493096e-06, "loss": 10.5124, "step": 21930 }, { "epoch": 0.9736992472026672, "grad_norm": 68.4973373413086, "learning_rate": 9.961964899531167e-06, "loss": 10.3562, "step": 21940 }, { "epoch": 0.9741430481357587, "grad_norm": 67.8966064453125, "learning_rate": 9.96194756356924e-06, "loss": 9.8984, "step": 21950 }, { "epoch": 0.9745868490688502, "grad_norm": 64.77039337158203, "learning_rate": 9.961930227607313e-06, "loss": 10.4107, "step": 21960 }, { "epoch": 0.9750306500019417, "grad_norm": 56.87838363647461, "learning_rate": 9.961912891645385e-06, "loss": 10.3442, "step": 21970 }, { "epoch": 0.975474450935033, "grad_norm": 63.49540710449219, "learning_rate": 9.961895555683458e-06, "loss": 9.7954, "step": 21980 }, { "epoch": 0.9759182518681245, "grad_norm": 66.3160171508789, "learning_rate": 9.96187821972153e-06, "loss": 9.961, "step": 21990 }, { "epoch": 0.976362052801216, "grad_norm": 59.505393981933594, "learning_rate": 9.961860883759602e-06, "loss": 10.3546, "step": 22000 }, { "epoch": 0.976362052801216, "eval_loss": 0.3203989863395691, "eval_runtime": 674.395, "eval_samples_per_second": 1800.712, "eval_steps_per_second": 56.273, "step": 22000 }, { "epoch": 0.9768058537343075, "grad_norm": 65.98320770263672, "learning_rate": 9.961843547797675e-06, "loss": 10.2495, "step": 22010 }, { "epoch": 0.977249654667399, "grad_norm": 60.26272964477539, "learning_rate": 9.961826211835748e-06, "loss": 10.185, "step": 22020 }, { "epoch": 0.9776934556004904, "grad_norm": 77.41650390625, "learning_rate": 9.96180887587382e-06, "loss": 10.2332, "step": 22030 }, { "epoch": 0.9781372565335819, "grad_norm": 67.0610580444336, "learning_rate": 9.961791539911893e-06, "loss": 9.9651, "step": 22040 }, { "epoch": 0.9785810574666733, "grad_norm": 62.729793548583984, "learning_rate": 9.961774203949966e-06, "loss": 10.2186, "step": 22050 }, { "epoch": 0.9790248583997648, "grad_norm": 59.35409927368164, "learning_rate": 9.961756867988037e-06, "loss": 9.6738, "step": 22060 }, { "epoch": 0.9794686593328562, "grad_norm": 63.04311752319336, "learning_rate": 9.96173953202611e-06, "loss": 10.0731, "step": 22070 }, { "epoch": 0.9799124602659477, "grad_norm": 69.83229064941406, "learning_rate": 9.961722196064183e-06, "loss": 10.3592, "step": 22080 }, { "epoch": 0.9803562611990392, "grad_norm": 71.35539245605469, "learning_rate": 9.961704860102255e-06, "loss": 10.2029, "step": 22090 }, { "epoch": 0.9808000621321307, "grad_norm": 57.54240798950195, "learning_rate": 9.961687524140328e-06, "loss": 10.6619, "step": 22100 }, { "epoch": 0.981243863065222, "grad_norm": 62.44277572631836, "learning_rate": 9.961670188178401e-06, "loss": 10.35, "step": 22110 }, { "epoch": 0.9816876639983135, "grad_norm": 61.99805450439453, "learning_rate": 9.961652852216472e-06, "loss": 10.4932, "step": 22120 }, { "epoch": 0.982131464931405, "grad_norm": 63.21669387817383, "learning_rate": 9.961635516254545e-06, "loss": 10.4017, "step": 22130 }, { "epoch": 0.9825752658644965, "grad_norm": 59.47304916381836, "learning_rate": 9.961618180292618e-06, "loss": 10.439, "step": 22140 }, { "epoch": 0.983019066797588, "grad_norm": 63.6852912902832, "learning_rate": 9.961600844330692e-06, "loss": 10.2629, "step": 22150 }, { "epoch": 0.9834628677306794, "grad_norm": 58.97916793823242, "learning_rate": 9.961583508368763e-06, "loss": 10.1938, "step": 22160 }, { "epoch": 0.9839066686637709, "grad_norm": 64.60242462158203, "learning_rate": 9.961566172406836e-06, "loss": 10.5744, "step": 22170 }, { "epoch": 0.9843504695968623, "grad_norm": 61.11840057373047, "learning_rate": 9.961548836444909e-06, "loss": 9.7976, "step": 22180 }, { "epoch": 0.9847942705299538, "grad_norm": 64.354248046875, "learning_rate": 9.96153150048298e-06, "loss": 10.3659, "step": 22190 }, { "epoch": 0.9852380714630452, "grad_norm": 64.9032974243164, "learning_rate": 9.961514164521054e-06, "loss": 10.3499, "step": 22200 }, { "epoch": 0.9856818723961367, "grad_norm": 58.52092742919922, "learning_rate": 9.961496828559127e-06, "loss": 10.0338, "step": 22210 }, { "epoch": 0.9861256733292282, "grad_norm": 63.69118881225586, "learning_rate": 9.961479492597198e-06, "loss": 10.3962, "step": 22220 }, { "epoch": 0.9865694742623197, "grad_norm": 69.64185333251953, "learning_rate": 9.961462156635271e-06, "loss": 9.8446, "step": 22230 }, { "epoch": 0.987013275195411, "grad_norm": 64.73435974121094, "learning_rate": 9.961444820673344e-06, "loss": 9.8991, "step": 22240 }, { "epoch": 0.9874570761285025, "grad_norm": 55.16053771972656, "learning_rate": 9.961427484711416e-06, "loss": 10.1616, "step": 22250 }, { "epoch": 0.987900877061594, "grad_norm": 63.042667388916016, "learning_rate": 9.961410148749489e-06, "loss": 10.7821, "step": 22260 }, { "epoch": 0.9883446779946855, "grad_norm": 67.79585266113281, "learning_rate": 9.961392812787562e-06, "loss": 10.337, "step": 22270 }, { "epoch": 0.988788478927777, "grad_norm": 56.07440185546875, "learning_rate": 9.961375476825633e-06, "loss": 9.5085, "step": 22280 }, { "epoch": 0.9892322798608684, "grad_norm": 70.412841796875, "learning_rate": 9.961358140863706e-06, "loss": 10.1263, "step": 22290 }, { "epoch": 0.9896760807939599, "grad_norm": 52.69704818725586, "learning_rate": 9.96134080490178e-06, "loss": 10.1837, "step": 22300 }, { "epoch": 0.9901198817270513, "grad_norm": 61.2188606262207, "learning_rate": 9.96132346893985e-06, "loss": 10.7708, "step": 22310 }, { "epoch": 0.9905636826601428, "grad_norm": 54.49897766113281, "learning_rate": 9.961306132977924e-06, "loss": 10.7213, "step": 22320 }, { "epoch": 0.9910074835932342, "grad_norm": 62.42861557006836, "learning_rate": 9.961288797015997e-06, "loss": 10.1813, "step": 22330 }, { "epoch": 0.9914512845263257, "grad_norm": 60.572418212890625, "learning_rate": 9.961271461054068e-06, "loss": 9.7815, "step": 22340 }, { "epoch": 0.9918950854594172, "grad_norm": 66.12911224365234, "learning_rate": 9.961254125092141e-06, "loss": 10.526, "step": 22350 }, { "epoch": 0.9923388863925087, "grad_norm": 68.29117584228516, "learning_rate": 9.961236789130214e-06, "loss": 10.4441, "step": 22360 }, { "epoch": 0.9927826873256002, "grad_norm": 62.79308319091797, "learning_rate": 9.961219453168287e-06, "loss": 9.7355, "step": 22370 }, { "epoch": 0.9932264882586915, "grad_norm": 69.58606719970703, "learning_rate": 9.961202117206359e-06, "loss": 10.3635, "step": 22380 }, { "epoch": 0.993670289191783, "grad_norm": 56.06214904785156, "learning_rate": 9.961184781244432e-06, "loss": 10.2798, "step": 22390 }, { "epoch": 0.9941140901248745, "grad_norm": 68.97488403320312, "learning_rate": 9.961167445282505e-06, "loss": 10.7732, "step": 22400 }, { "epoch": 0.994557891057966, "grad_norm": 56.97425079345703, "learning_rate": 9.961150109320576e-06, "loss": 10.1022, "step": 22410 }, { "epoch": 0.9950016919910574, "grad_norm": 56.741455078125, "learning_rate": 9.96113277335865e-06, "loss": 10.2925, "step": 22420 }, { "epoch": 0.9954454929241489, "grad_norm": 69.6789321899414, "learning_rate": 9.961115437396722e-06, "loss": 9.8957, "step": 22430 }, { "epoch": 0.9958892938572403, "grad_norm": 58.19102478027344, "learning_rate": 9.961098101434794e-06, "loss": 9.5713, "step": 22440 }, { "epoch": 0.9963330947903318, "grad_norm": 56.53800964355469, "learning_rate": 9.961080765472867e-06, "loss": 9.8579, "step": 22450 }, { "epoch": 0.9967768957234232, "grad_norm": 58.388755798339844, "learning_rate": 9.96106342951094e-06, "loss": 10.0617, "step": 22460 }, { "epoch": 0.9972206966565147, "grad_norm": 57.43764877319336, "learning_rate": 9.961046093549011e-06, "loss": 10.0687, "step": 22470 }, { "epoch": 0.9976644975896062, "grad_norm": 64.95555877685547, "learning_rate": 9.961028757587084e-06, "loss": 10.092, "step": 22480 }, { "epoch": 0.9981082985226977, "grad_norm": 57.88254928588867, "learning_rate": 9.961011421625158e-06, "loss": 10.4509, "step": 22490 }, { "epoch": 0.9985520994557892, "grad_norm": 63.7093391418457, "learning_rate": 9.960994085663229e-06, "loss": 10.5474, "step": 22500 } ], "logging_steps": 10, "max_steps": 22532, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.851877211308032e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }