{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.78184172591561, "eval_steps": 500, "global_step": 16000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004886510786972562, "grad_norm": 550.610107421875, "learning_rate": 4.396678065461651e-08, "loss": 8.0618, "step": 10 }, { "epoch": 0.0009773021573945123, "grad_norm": 547.8289794921875, "learning_rate": 9.281875915974597e-08, "loss": 8.0357, "step": 20 }, { "epoch": 0.0014659532360917686, "grad_norm": 534.9840087890625, "learning_rate": 1.4167073766487544e-07, "loss": 7.8875, "step": 30 }, { "epoch": 0.0019546043147890247, "grad_norm": 550.1026000976562, "learning_rate": 1.905227161700049e-07, "loss": 7.5113, "step": 40 }, { "epoch": 0.002443255393486281, "grad_norm": 519.316650390625, "learning_rate": 2.3937469467513437e-07, "loss": 6.2256, "step": 50 }, { "epoch": 0.0029319064721835372, "grad_norm": 198.8031768798828, "learning_rate": 2.8822667318026384e-07, "loss": 4.6185, "step": 60 }, { "epoch": 0.0034205575508807935, "grad_norm": 113.63694763183594, "learning_rate": 3.3707865168539325e-07, "loss": 2.404, "step": 70 }, { "epoch": 0.003909208629578049, "grad_norm": 77.9081802368164, "learning_rate": 3.859306301905227e-07, "loss": 1.9127, "step": 80 }, { "epoch": 0.004397859708275306, "grad_norm": 92.41219329833984, "learning_rate": 4.347826086956522e-07, "loss": 1.5194, "step": 90 }, { "epoch": 0.004886510786972562, "grad_norm": 20.305315017700195, "learning_rate": 4.836345872007817e-07, "loss": 1.3619, "step": 100 }, { "epoch": 0.005375161865669819, "grad_norm": 15.185184478759766, "learning_rate": 5.324865657059111e-07, "loss": 1.2711, "step": 110 }, { "epoch": 0.0058638129443670745, "grad_norm": 17.12596321105957, "learning_rate": 5.813385442110406e-07, "loss": 1.2171, "step": 120 }, { "epoch": 0.006352464023064331, "grad_norm": 24.50565528869629, "learning_rate": 6.3019052271617e-07, "loss": 1.1917, "step": 130 }, { "epoch": 0.006841115101761587, "grad_norm": 9.890480995178223, "learning_rate": 6.790425012212995e-07, "loss": 1.1793, "step": 140 }, { "epoch": 0.007329766180458844, "grad_norm": 16.53375816345215, "learning_rate": 7.278944797264289e-07, "loss": 1.1749, "step": 150 }, { "epoch": 0.007818417259156099, "grad_norm": 19.722103118896484, "learning_rate": 7.767464582315585e-07, "loss": 1.1696, "step": 160 }, { "epoch": 0.008307068337853355, "grad_norm": 21.687253952026367, "learning_rate": 8.255984367366879e-07, "loss": 1.1658, "step": 170 }, { "epoch": 0.008795719416550612, "grad_norm": 20.212892532348633, "learning_rate": 8.744504152418174e-07, "loss": 1.1672, "step": 180 }, { "epoch": 0.009284370495247869, "grad_norm": 14.680685043334961, "learning_rate": 9.233023937469468e-07, "loss": 1.164, "step": 190 }, { "epoch": 0.009773021573945124, "grad_norm": 15.129215240478516, "learning_rate": 9.721543722520762e-07, "loss": 1.1606, "step": 200 }, { "epoch": 0.01026167265264238, "grad_norm": 6.895666122436523, "learning_rate": 1.0210063507572057e-06, "loss": 1.161, "step": 210 }, { "epoch": 0.010750323731339637, "grad_norm": 6.139767646789551, "learning_rate": 1.0698583292623353e-06, "loss": 1.1602, "step": 220 }, { "epoch": 0.011238974810036894, "grad_norm": 25.940549850463867, "learning_rate": 1.1187103077674646e-06, "loss": 1.158, "step": 230 }, { "epoch": 0.011727625888734149, "grad_norm": 5.9631829261779785, "learning_rate": 1.167562286272594e-06, "loss": 1.1565, "step": 240 }, { "epoch": 0.012216276967431406, "grad_norm": 42.288856506347656, "learning_rate": 1.2164142647777236e-06, "loss": 1.1634, "step": 250 }, { "epoch": 0.012704928046128662, "grad_norm": 23.973031997680664, "learning_rate": 1.265266243282853e-06, "loss": 1.1622, "step": 260 }, { "epoch": 0.013193579124825917, "grad_norm": 32.71512985229492, "learning_rate": 1.3141182217879824e-06, "loss": 1.1632, "step": 270 }, { "epoch": 0.013682230203523174, "grad_norm": 45.931095123291016, "learning_rate": 1.362970200293112e-06, "loss": 1.1611, "step": 280 }, { "epoch": 0.01417088128222043, "grad_norm": 31.298593521118164, "learning_rate": 1.4118221787982415e-06, "loss": 1.1609, "step": 290 }, { "epoch": 0.014659532360917688, "grad_norm": 37.475921630859375, "learning_rate": 1.4606741573033708e-06, "loss": 1.159, "step": 300 }, { "epoch": 0.015148183439614942, "grad_norm": 56.90618896484375, "learning_rate": 1.5095261358085003e-06, "loss": 1.1611, "step": 310 }, { "epoch": 0.015636834518312197, "grad_norm": 26.84503746032715, "learning_rate": 1.5583781143136298e-06, "loss": 1.1605, "step": 320 }, { "epoch": 0.016125485597009454, "grad_norm": 31.706214904785156, "learning_rate": 1.6072300928187593e-06, "loss": 1.1636, "step": 330 }, { "epoch": 0.01661413667570671, "grad_norm": 20.083066940307617, "learning_rate": 1.6560820713238887e-06, "loss": 1.155, "step": 340 }, { "epoch": 0.017102787754403968, "grad_norm": 20.37936782836914, "learning_rate": 1.7049340498290182e-06, "loss": 1.1544, "step": 350 }, { "epoch": 0.017591438833101224, "grad_norm": 29.238149642944336, "learning_rate": 1.7537860283341477e-06, "loss": 1.1526, "step": 360 }, { "epoch": 0.01808008991179848, "grad_norm": 24.459911346435547, "learning_rate": 1.802638006839277e-06, "loss": 1.1542, "step": 370 }, { "epoch": 0.018568740990495738, "grad_norm": 25.11469841003418, "learning_rate": 1.8514899853444065e-06, "loss": 1.153, "step": 380 }, { "epoch": 0.019057392069192994, "grad_norm": 19.211380004882812, "learning_rate": 1.900341963849536e-06, "loss": 1.1524, "step": 390 }, { "epoch": 0.019546043147890248, "grad_norm": 29.28157615661621, "learning_rate": 1.9491939423546656e-06, "loss": 1.1507, "step": 400 }, { "epoch": 0.020034694226587504, "grad_norm": 18.40865707397461, "learning_rate": 1.998045920859795e-06, "loss": 1.1514, "step": 410 }, { "epoch": 0.02052334530528476, "grad_norm": 32.68934631347656, "learning_rate": 2.046897899364924e-06, "loss": 1.1509, "step": 420 }, { "epoch": 0.021011996383982018, "grad_norm": 28.276269912719727, "learning_rate": 2.0957498778700537e-06, "loss": 1.1508, "step": 430 }, { "epoch": 0.021500647462679275, "grad_norm": 29.66724967956543, "learning_rate": 2.1446018563751832e-06, "loss": 1.1507, "step": 440 }, { "epoch": 0.02198929854137653, "grad_norm": 33.39693069458008, "learning_rate": 2.1934538348803127e-06, "loss": 1.1505, "step": 450 }, { "epoch": 0.022477949620073788, "grad_norm": 28.482940673828125, "learning_rate": 2.2423058133854423e-06, "loss": 1.1488, "step": 460 }, { "epoch": 0.02296660069877104, "grad_norm": 22.53483009338379, "learning_rate": 2.2911577918905718e-06, "loss": 1.1495, "step": 470 }, { "epoch": 0.023455251777468298, "grad_norm": 20.745651245117188, "learning_rate": 2.3400097703957013e-06, "loss": 1.1477, "step": 480 }, { "epoch": 0.023943902856165555, "grad_norm": 27.499927520751953, "learning_rate": 2.388861748900831e-06, "loss": 1.1491, "step": 490 }, { "epoch": 0.02443255393486281, "grad_norm": 17.32890510559082, "learning_rate": 2.43771372740596e-06, "loss": 1.1483, "step": 500 }, { "epoch": 0.02443255393486281, "eval_loss": 1.1245596408843994, "eval_runtime": 728.0762, "eval_samples_per_second": 242.98, "eval_steps_per_second": 0.475, "step": 500 }, { "epoch": 0.024921205013560068, "grad_norm": 28.78767967224121, "learning_rate": 2.4865657059110894e-06, "loss": 1.1466, "step": 510 }, { "epoch": 0.025409856092257325, "grad_norm": 19.717103958129883, "learning_rate": 2.5354176844162194e-06, "loss": 1.1464, "step": 520 }, { "epoch": 0.02589850717095458, "grad_norm": 27.598007202148438, "learning_rate": 2.584269662921349e-06, "loss": 1.1465, "step": 530 }, { "epoch": 0.026387158249651835, "grad_norm": 23.63872528076172, "learning_rate": 2.633121641426478e-06, "loss": 1.1453, "step": 540 }, { "epoch": 0.02687580932834909, "grad_norm": 26.8532772064209, "learning_rate": 2.6819736199316075e-06, "loss": 1.1466, "step": 550 }, { "epoch": 0.027364460407046348, "grad_norm": 22.594478607177734, "learning_rate": 2.730825598436737e-06, "loss": 1.146, "step": 560 }, { "epoch": 0.027853111485743605, "grad_norm": 22.817705154418945, "learning_rate": 2.7796775769418666e-06, "loss": 1.1437, "step": 570 }, { "epoch": 0.02834176256444086, "grad_norm": 7.5399250984191895, "learning_rate": 2.828529555446996e-06, "loss": 1.1443, "step": 580 }, { "epoch": 0.02883041364313812, "grad_norm": 34.241981506347656, "learning_rate": 2.8773815339521256e-06, "loss": 1.1723, "step": 590 }, { "epoch": 0.029319064721835375, "grad_norm": 41.769996643066406, "learning_rate": 2.926233512457255e-06, "loss": 1.1566, "step": 600 }, { "epoch": 0.02980771580053263, "grad_norm": 60.579036712646484, "learning_rate": 2.9750854909623842e-06, "loss": 1.1581, "step": 610 }, { "epoch": 0.030296366879229885, "grad_norm": 9.520317077636719, "learning_rate": 3.0239374694675137e-06, "loss": 1.1521, "step": 620 }, { "epoch": 0.03078501795792714, "grad_norm": 48.57497787475586, "learning_rate": 3.0727894479726433e-06, "loss": 1.1721, "step": 630 }, { "epoch": 0.031273669036624395, "grad_norm": 44.00090026855469, "learning_rate": 3.1216414264777728e-06, "loss": 1.1517, "step": 640 }, { "epoch": 0.03176232011532165, "grad_norm": 25.236425399780273, "learning_rate": 3.1704934049829023e-06, "loss": 1.1657, "step": 650 }, { "epoch": 0.03225097119401891, "grad_norm": 26.25858497619629, "learning_rate": 3.219345383488032e-06, "loss": 1.146, "step": 660 }, { "epoch": 0.032739622272716165, "grad_norm": 33.179771423339844, "learning_rate": 3.2681973619931613e-06, "loss": 1.1452, "step": 670 }, { "epoch": 0.03322827335141342, "grad_norm": 26.868507385253906, "learning_rate": 3.3170493404982904e-06, "loss": 1.1399, "step": 680 }, { "epoch": 0.03371692443011068, "grad_norm": 14.93895149230957, "learning_rate": 3.36590131900342e-06, "loss": 1.142, "step": 690 }, { "epoch": 0.034205575508807935, "grad_norm": 10.044151306152344, "learning_rate": 3.4147532975085495e-06, "loss": 1.1391, "step": 700 }, { "epoch": 0.03469422658750519, "grad_norm": 22.18167495727539, "learning_rate": 3.463605276013679e-06, "loss": 1.1439, "step": 710 }, { "epoch": 0.03518287766620245, "grad_norm": 17.276782989501953, "learning_rate": 3.5124572545188085e-06, "loss": 1.1402, "step": 720 }, { "epoch": 0.035671528744899705, "grad_norm": 22.945816040039062, "learning_rate": 3.561309233023938e-06, "loss": 1.1409, "step": 730 }, { "epoch": 0.03616017982359696, "grad_norm": 28.2205810546875, "learning_rate": 3.6101612115290676e-06, "loss": 1.1425, "step": 740 }, { "epoch": 0.03664883090229422, "grad_norm": 29.494741439819336, "learning_rate": 3.6590131900341966e-06, "loss": 1.1423, "step": 750 }, { "epoch": 0.037137481980991476, "grad_norm": 12.51252269744873, "learning_rate": 3.707865168539326e-06, "loss": 1.14, "step": 760 }, { "epoch": 0.03762613305968873, "grad_norm": 19.594589233398438, "learning_rate": 3.7567171470444557e-06, "loss": 1.1472, "step": 770 }, { "epoch": 0.03811478413838599, "grad_norm": 8.805150032043457, "learning_rate": 3.805569125549585e-06, "loss": 1.143, "step": 780 }, { "epoch": 0.03860343521708324, "grad_norm": 16.11551856994629, "learning_rate": 3.854421104054714e-06, "loss": 1.1394, "step": 790 }, { "epoch": 0.039092086295780495, "grad_norm": 30.43114471435547, "learning_rate": 3.903273082559844e-06, "loss": 1.1412, "step": 800 }, { "epoch": 0.03958073737447775, "grad_norm": 24.658550262451172, "learning_rate": 3.952125061064973e-06, "loss": 1.1379, "step": 810 }, { "epoch": 0.04006938845317501, "grad_norm": 30.698740005493164, "learning_rate": 4.000977039570103e-06, "loss": 1.1402, "step": 820 }, { "epoch": 0.040558039531872266, "grad_norm": 15.285526275634766, "learning_rate": 4.049829018075232e-06, "loss": 1.1397, "step": 830 }, { "epoch": 0.04104669061056952, "grad_norm": 16.959575653076172, "learning_rate": 4.098680996580362e-06, "loss": 1.143, "step": 840 }, { "epoch": 0.04153534168926678, "grad_norm": 9.172962188720703, "learning_rate": 4.1475329750854914e-06, "loss": 1.1394, "step": 850 }, { "epoch": 0.042023992767964036, "grad_norm": 34.94649887084961, "learning_rate": 4.196384953590621e-06, "loss": 1.1586, "step": 860 }, { "epoch": 0.04251264384666129, "grad_norm": 31.494056701660156, "learning_rate": 4.2452369320957505e-06, "loss": 1.1561, "step": 870 }, { "epoch": 0.04300129492535855, "grad_norm": 30.298629760742188, "learning_rate": 4.29408891060088e-06, "loss": 1.1579, "step": 880 }, { "epoch": 0.043489946004055806, "grad_norm": 8.591865539550781, "learning_rate": 4.3429408891060095e-06, "loss": 1.1441, "step": 890 }, { "epoch": 0.04397859708275306, "grad_norm": 22.960233688354492, "learning_rate": 4.391792867611139e-06, "loss": 1.1444, "step": 900 }, { "epoch": 0.04446724816145032, "grad_norm": 14.041954040527344, "learning_rate": 4.4406448461162685e-06, "loss": 1.141, "step": 910 }, { "epoch": 0.044955899240147576, "grad_norm": 25.523542404174805, "learning_rate": 4.489496824621398e-06, "loss": 1.1335, "step": 920 }, { "epoch": 0.045444550318844826, "grad_norm": 12.317065238952637, "learning_rate": 4.538348803126527e-06, "loss": 1.1412, "step": 930 }, { "epoch": 0.04593320139754208, "grad_norm": 6.889744758605957, "learning_rate": 4.587200781631656e-06, "loss": 1.1431, "step": 940 }, { "epoch": 0.04642185247623934, "grad_norm": 14.626124382019043, "learning_rate": 4.636052760136786e-06, "loss": 1.1369, "step": 950 }, { "epoch": 0.046910503554936596, "grad_norm": 8.889772415161133, "learning_rate": 4.684904738641915e-06, "loss": 1.1327, "step": 960 }, { "epoch": 0.04739915463363385, "grad_norm": 22.604360580444336, "learning_rate": 4.733756717147045e-06, "loss": 1.1311, "step": 970 }, { "epoch": 0.04788780571233111, "grad_norm": 18.373239517211914, "learning_rate": 4.782608695652174e-06, "loss": 1.1329, "step": 980 }, { "epoch": 0.048376456791028366, "grad_norm": 18.741851806640625, "learning_rate": 4.831460674157304e-06, "loss": 1.1275, "step": 990 }, { "epoch": 0.04886510786972562, "grad_norm": 21.531051635742188, "learning_rate": 4.880312652662433e-06, "loss": 1.1242, "step": 1000 }, { "epoch": 0.04886510786972562, "eval_loss": 1.0948448181152344, "eval_runtime": 728.3165, "eval_samples_per_second": 242.9, "eval_steps_per_second": 0.475, "step": 1000 }, { "epoch": 0.04935375894842288, "grad_norm": 9.384544372558594, "learning_rate": 4.929164631167563e-06, "loss": 1.1484, "step": 1010 }, { "epoch": 0.049842410027120136, "grad_norm": 25.287551879882812, "learning_rate": 4.978016609672692e-06, "loss": 1.1365, "step": 1020 }, { "epoch": 0.05033106110581739, "grad_norm": 25.104299545288086, "learning_rate": 5.026868588177821e-06, "loss": 1.1073, "step": 1030 }, { "epoch": 0.05081971218451465, "grad_norm": 5.135197639465332, "learning_rate": 5.0757205666829515e-06, "loss": 1.0908, "step": 1040 }, { "epoch": 0.051308363263211906, "grad_norm": 10.835426330566406, "learning_rate": 5.12457254518808e-06, "loss": 1.0344, "step": 1050 }, { "epoch": 0.05179701434190916, "grad_norm": 17.45260238647461, "learning_rate": 5.1734245236932105e-06, "loss": 0.9916, "step": 1060 }, { "epoch": 0.05228566542060641, "grad_norm": 18.409074783325195, "learning_rate": 5.222276502198339e-06, "loss": 0.9616, "step": 1070 }, { "epoch": 0.05277431649930367, "grad_norm": 13.753133773803711, "learning_rate": 5.271128480703469e-06, "loss": 0.9379, "step": 1080 }, { "epoch": 0.053262967578000926, "grad_norm": 16.086511611938477, "learning_rate": 5.319980459208598e-06, "loss": 0.922, "step": 1090 }, { "epoch": 0.05375161865669818, "grad_norm": 14.6001558303833, "learning_rate": 5.368832437713728e-06, "loss": 0.9061, "step": 1100 }, { "epoch": 0.05424026973539544, "grad_norm": 22.474435806274414, "learning_rate": 5.417684416218857e-06, "loss": 0.9022, "step": 1110 }, { "epoch": 0.054728920814092696, "grad_norm": 22.234281539916992, "learning_rate": 5.466536394723987e-06, "loss": 0.9091, "step": 1120 }, { "epoch": 0.05521757189278995, "grad_norm": 10.945754051208496, "learning_rate": 5.5153883732291154e-06, "loss": 0.903, "step": 1130 }, { "epoch": 0.05570622297148721, "grad_norm": 12.38178539276123, "learning_rate": 5.564240351734246e-06, "loss": 0.882, "step": 1140 }, { "epoch": 0.056194874050184467, "grad_norm": 18.168428421020508, "learning_rate": 5.6130923302393745e-06, "loss": 0.8671, "step": 1150 }, { "epoch": 0.05668352512888172, "grad_norm": 13.480072975158691, "learning_rate": 5.661944308744505e-06, "loss": 0.8604, "step": 1160 }, { "epoch": 0.05717217620757898, "grad_norm": 15.529900550842285, "learning_rate": 5.7107962872496335e-06, "loss": 0.8467, "step": 1170 }, { "epoch": 0.05766082728627624, "grad_norm": 12.60476016998291, "learning_rate": 5.759648265754764e-06, "loss": 0.8376, "step": 1180 }, { "epoch": 0.05814947836497349, "grad_norm": 12.737000465393066, "learning_rate": 5.8085002442598926e-06, "loss": 0.8262, "step": 1190 }, { "epoch": 0.05863812944367075, "grad_norm": 11.14971923828125, "learning_rate": 5.857352222765023e-06, "loss": 0.8164, "step": 1200 }, { "epoch": 0.059126780522368, "grad_norm": 13.185476303100586, "learning_rate": 5.906204201270152e-06, "loss": 0.8107, "step": 1210 }, { "epoch": 0.05961543160106526, "grad_norm": 19.2025203704834, "learning_rate": 5.955056179775281e-06, "loss": 0.8026, "step": 1220 }, { "epoch": 0.06010408267976251, "grad_norm": 15.930268287658691, "learning_rate": 6.003908158280411e-06, "loss": 0.8046, "step": 1230 }, { "epoch": 0.06059273375845977, "grad_norm": 9.219900131225586, "learning_rate": 6.05276013678554e-06, "loss": 0.7913, "step": 1240 }, { "epoch": 0.06108138483715703, "grad_norm": 9.282882690429688, "learning_rate": 6.10161211529067e-06, "loss": 0.7798, "step": 1250 }, { "epoch": 0.06157003591585428, "grad_norm": 10.684017181396484, "learning_rate": 6.150464093795799e-06, "loss": 0.7746, "step": 1260 }, { "epoch": 0.06205868699455154, "grad_norm": 19.030454635620117, "learning_rate": 6.199316072300928e-06, "loss": 0.7789, "step": 1270 }, { "epoch": 0.06254733807324879, "grad_norm": 14.472164154052734, "learning_rate": 6.248168050806058e-06, "loss": 0.7645, "step": 1280 }, { "epoch": 0.06303598915194605, "grad_norm": 15.92104721069336, "learning_rate": 6.297020029311187e-06, "loss": 0.7601, "step": 1290 }, { "epoch": 0.0635246402306433, "grad_norm": 12.93683910369873, "learning_rate": 6.345872007816317e-06, "loss": 0.7508, "step": 1300 }, { "epoch": 0.06401329130934057, "grad_norm": 12.283439636230469, "learning_rate": 6.394723986321446e-06, "loss": 0.7441, "step": 1310 }, { "epoch": 0.06450194238803782, "grad_norm": 11.30448055267334, "learning_rate": 6.443575964826576e-06, "loss": 0.7359, "step": 1320 }, { "epoch": 0.06499059346673508, "grad_norm": 10.10558795928955, "learning_rate": 6.492427943331705e-06, "loss": 0.7312, "step": 1330 }, { "epoch": 0.06547924454543233, "grad_norm": 10.84056282043457, "learning_rate": 6.541279921836835e-06, "loss": 0.7257, "step": 1340 }, { "epoch": 0.0659678956241296, "grad_norm": 11.601236343383789, "learning_rate": 6.590131900341964e-06, "loss": 0.7205, "step": 1350 }, { "epoch": 0.06645654670282684, "grad_norm": 9.640713691711426, "learning_rate": 6.6389838788470936e-06, "loss": 0.7179, "step": 1360 }, { "epoch": 0.06694519778152411, "grad_norm": 7.962968826293945, "learning_rate": 6.687835857352223e-06, "loss": 0.7218, "step": 1370 }, { "epoch": 0.06743384886022136, "grad_norm": 6.22469425201416, "learning_rate": 6.736687835857353e-06, "loss": 0.7158, "step": 1380 }, { "epoch": 0.06792249993891862, "grad_norm": 11.041025161743164, "learning_rate": 6.785539814362482e-06, "loss": 0.7075, "step": 1390 }, { "epoch": 0.06841115101761587, "grad_norm": 11.427011489868164, "learning_rate": 6.834391792867612e-06, "loss": 0.7097, "step": 1400 }, { "epoch": 0.06889980209631313, "grad_norm": 5.472978115081787, "learning_rate": 6.88324377137274e-06, "loss": 0.7058, "step": 1410 }, { "epoch": 0.06938845317501038, "grad_norm": 11.191500663757324, "learning_rate": 6.932095749877871e-06, "loss": 0.7033, "step": 1420 }, { "epoch": 0.06987710425370763, "grad_norm": 9.202252388000488, "learning_rate": 6.980947728382999e-06, "loss": 0.7019, "step": 1430 }, { "epoch": 0.0703657553324049, "grad_norm": 5.8239216804504395, "learning_rate": 7.02979970688813e-06, "loss": 0.6908, "step": 1440 }, { "epoch": 0.07085440641110215, "grad_norm": 6.5890092849731445, "learning_rate": 7.078651685393258e-06, "loss": 0.6821, "step": 1450 }, { "epoch": 0.07134305748979941, "grad_norm": 5.046870231628418, "learning_rate": 7.127503663898389e-06, "loss": 0.6782, "step": 1460 }, { "epoch": 0.07183170856849666, "grad_norm": 6.238111972808838, "learning_rate": 7.1763556424035174e-06, "loss": 0.6597, "step": 1470 }, { "epoch": 0.07232035964719392, "grad_norm": 14.37743091583252, "learning_rate": 7.225207620908648e-06, "loss": 0.6472, "step": 1480 }, { "epoch": 0.07280901072589117, "grad_norm": 8.147233963012695, "learning_rate": 7.2740595994137765e-06, "loss": 0.6435, "step": 1490 }, { "epoch": 0.07329766180458844, "grad_norm": 10.538481712341309, "learning_rate": 7.322911577918906e-06, "loss": 0.6375, "step": 1500 }, { "epoch": 0.07329766180458844, "eval_loss": 0.6336132884025574, "eval_runtime": 729.8371, "eval_samples_per_second": 242.394, "eval_steps_per_second": 0.474, "step": 1500 }, { "epoch": 0.07378631288328569, "grad_norm": 8.202781677246094, "learning_rate": 7.3717635564240355e-06, "loss": 0.627, "step": 1510 }, { "epoch": 0.07427496396198295, "grad_norm": 9.16813850402832, "learning_rate": 7.420615534929165e-06, "loss": 0.6136, "step": 1520 }, { "epoch": 0.0747636150406802, "grad_norm": 4.204853057861328, "learning_rate": 7.4694675134342946e-06, "loss": 0.6092, "step": 1530 }, { "epoch": 0.07525226611937746, "grad_norm": 7.2187652587890625, "learning_rate": 7.518319491939424e-06, "loss": 0.619, "step": 1540 }, { "epoch": 0.07574091719807471, "grad_norm": 6.75137996673584, "learning_rate": 7.567171470444553e-06, "loss": 0.6183, "step": 1550 }, { "epoch": 0.07622956827677198, "grad_norm": 12.58353042602539, "learning_rate": 7.616023448949683e-06, "loss": 0.6053, "step": 1560 }, { "epoch": 0.07671821935546923, "grad_norm": 5.846193313598633, "learning_rate": 7.664875427454813e-06, "loss": 0.607, "step": 1570 }, { "epoch": 0.07720687043416648, "grad_norm": 7.444247722625732, "learning_rate": 7.713727405959941e-06, "loss": 0.5934, "step": 1580 }, { "epoch": 0.07769552151286374, "grad_norm": 3.659825563430786, "learning_rate": 7.762579384465072e-06, "loss": 0.5938, "step": 1590 }, { "epoch": 0.07818417259156099, "grad_norm": 6.078113079071045, "learning_rate": 7.8114313629702e-06, "loss": 0.5942, "step": 1600 }, { "epoch": 0.07867282367025825, "grad_norm": 7.572592735290527, "learning_rate": 7.86028334147533e-06, "loss": 0.6032, "step": 1610 }, { "epoch": 0.0791614747489555, "grad_norm": 6.511207103729248, "learning_rate": 7.90913531998046e-06, "loss": 0.5873, "step": 1620 }, { "epoch": 0.07965012582765277, "grad_norm": 6.170757293701172, "learning_rate": 7.957987298485588e-06, "loss": 0.5804, "step": 1630 }, { "epoch": 0.08013877690635002, "grad_norm": 14.552532196044922, "learning_rate": 8.006839276990718e-06, "loss": 0.5753, "step": 1640 }, { "epoch": 0.08062742798504728, "grad_norm": 8.183059692382812, "learning_rate": 8.055691255495847e-06, "loss": 0.5739, "step": 1650 }, { "epoch": 0.08111607906374453, "grad_norm": 4.893775463104248, "learning_rate": 8.104543234000977e-06, "loss": 0.5722, "step": 1660 }, { "epoch": 0.0816047301424418, "grad_norm": 9.298670768737793, "learning_rate": 8.153395212506106e-06, "loss": 0.5696, "step": 1670 }, { "epoch": 0.08209338122113904, "grad_norm": 5.700584888458252, "learning_rate": 8.202247191011237e-06, "loss": 0.568, "step": 1680 }, { "epoch": 0.08258203229983631, "grad_norm": 14.690134048461914, "learning_rate": 8.251099169516365e-06, "loss": 0.5739, "step": 1690 }, { "epoch": 0.08307068337853356, "grad_norm": 12.68682861328125, "learning_rate": 8.299951148021496e-06, "loss": 0.5801, "step": 1700 }, { "epoch": 0.08355933445723081, "grad_norm": 8.979551315307617, "learning_rate": 8.348803126526624e-06, "loss": 0.5791, "step": 1710 }, { "epoch": 0.08404798553592807, "grad_norm": 5.448888301849365, "learning_rate": 8.397655105031755e-06, "loss": 0.5657, "step": 1720 }, { "epoch": 0.08453663661462532, "grad_norm": 8.006872177124023, "learning_rate": 8.446507083536883e-06, "loss": 0.5484, "step": 1730 }, { "epoch": 0.08502528769332258, "grad_norm": 6.7078046798706055, "learning_rate": 8.495359062042014e-06, "loss": 0.5555, "step": 1740 }, { "epoch": 0.08551393877201983, "grad_norm": 8.614073753356934, "learning_rate": 8.544211040547142e-06, "loss": 0.5606, "step": 1750 }, { "epoch": 0.0860025898507171, "grad_norm": 4.551246643066406, "learning_rate": 8.593063019052273e-06, "loss": 0.5544, "step": 1760 }, { "epoch": 0.08649124092941435, "grad_norm": 3.444021463394165, "learning_rate": 8.641914997557401e-06, "loss": 0.5411, "step": 1770 }, { "epoch": 0.08697989200811161, "grad_norm": 17.660511016845703, "learning_rate": 8.690766976062532e-06, "loss": 0.5427, "step": 1780 }, { "epoch": 0.08746854308680886, "grad_norm": 7.721867561340332, "learning_rate": 8.73961895456766e-06, "loss": 0.5526, "step": 1790 }, { "epoch": 0.08795719416550613, "grad_norm": 3.451046943664551, "learning_rate": 8.78847093307279e-06, "loss": 0.5425, "step": 1800 }, { "epoch": 0.08844584524420337, "grad_norm": 4.078919887542725, "learning_rate": 8.83732291157792e-06, "loss": 0.5543, "step": 1810 }, { "epoch": 0.08893449632290064, "grad_norm": 4.645016193389893, "learning_rate": 8.88617489008305e-06, "loss": 0.5463, "step": 1820 }, { "epoch": 0.08942314740159789, "grad_norm": 8.30947208404541, "learning_rate": 8.935026868588178e-06, "loss": 0.5452, "step": 1830 }, { "epoch": 0.08991179848029515, "grad_norm": 5.685572147369385, "learning_rate": 8.983878847093309e-06, "loss": 0.5369, "step": 1840 }, { "epoch": 0.0904004495589924, "grad_norm": 9.45528793334961, "learning_rate": 9.032730825598438e-06, "loss": 0.5299, "step": 1850 }, { "epoch": 0.09088910063768965, "grad_norm": 10.99970817565918, "learning_rate": 9.081582804103566e-06, "loss": 0.5287, "step": 1860 }, { "epoch": 0.09137775171638692, "grad_norm": 6.199814796447754, "learning_rate": 9.130434782608697e-06, "loss": 0.5397, "step": 1870 }, { "epoch": 0.09186640279508416, "grad_norm": 5.611557483673096, "learning_rate": 9.179286761113825e-06, "loss": 0.5472, "step": 1880 }, { "epoch": 0.09235505387378143, "grad_norm": 4.567397594451904, "learning_rate": 9.228138739618956e-06, "loss": 0.5353, "step": 1890 }, { "epoch": 0.09284370495247868, "grad_norm": 17.8961238861084, "learning_rate": 9.276990718124084e-06, "loss": 0.5263, "step": 1900 }, { "epoch": 0.09333235603117594, "grad_norm": 8.548867225646973, "learning_rate": 9.325842696629213e-06, "loss": 0.5301, "step": 1910 }, { "epoch": 0.09382100710987319, "grad_norm": 5.053003787994385, "learning_rate": 9.374694675134343e-06, "loss": 0.5316, "step": 1920 }, { "epoch": 0.09430965818857046, "grad_norm": 10.809515953063965, "learning_rate": 9.423546653639472e-06, "loss": 0.5255, "step": 1930 }, { "epoch": 0.0947983092672677, "grad_norm": 4.784992218017578, "learning_rate": 9.472398632144602e-06, "loss": 0.5232, "step": 1940 }, { "epoch": 0.09528696034596497, "grad_norm": 6.658916473388672, "learning_rate": 9.521250610649731e-06, "loss": 0.5225, "step": 1950 }, { "epoch": 0.09577561142466222, "grad_norm": 5.238591194152832, "learning_rate": 9.570102589154861e-06, "loss": 0.52, "step": 1960 }, { "epoch": 0.09626426250335948, "grad_norm": 6.568732261657715, "learning_rate": 9.61895456765999e-06, "loss": 0.5117, "step": 1970 }, { "epoch": 0.09675291358205673, "grad_norm": 11.915630340576172, "learning_rate": 9.66780654616512e-06, "loss": 0.5113, "step": 1980 }, { "epoch": 0.09724156466075398, "grad_norm": 3.4283180236816406, "learning_rate": 9.716658524670249e-06, "loss": 0.5206, "step": 1990 }, { "epoch": 0.09773021573945125, "grad_norm": 7.299953937530518, "learning_rate": 9.76551050317538e-06, "loss": 0.5319, "step": 2000 }, { "epoch": 0.09773021573945125, "eval_loss": 0.5161277055740356, "eval_runtime": 728.8014, "eval_samples_per_second": 242.738, "eval_steps_per_second": 0.475, "step": 2000 }, { "epoch": 0.0982188668181485, "grad_norm": 4.911329746246338, "learning_rate": 9.814362481680508e-06, "loss": 0.5179, "step": 2010 }, { "epoch": 0.09870751789684576, "grad_norm": 3.644986152648926, "learning_rate": 9.863214460185639e-06, "loss": 0.5148, "step": 2020 }, { "epoch": 0.09919616897554301, "grad_norm": 5.680597305297852, "learning_rate": 9.912066438690767e-06, "loss": 0.5119, "step": 2030 }, { "epoch": 0.09968482005424027, "grad_norm": 6.847180366516113, "learning_rate": 9.960918417195898e-06, "loss": 0.5135, "step": 2040 }, { "epoch": 0.10017347113293752, "grad_norm": 4.022679328918457, "learning_rate": 9.999999709052384e-06, "loss": 0.5307, "step": 2050 }, { "epoch": 0.10066212221163479, "grad_norm": 8.008437156677246, "learning_rate": 9.999989525889357e-06, "loss": 0.5135, "step": 2060 }, { "epoch": 0.10115077329033204, "grad_norm": 3.9152987003326416, "learning_rate": 9.99996479537936e-06, "loss": 0.5098, "step": 2070 }, { "epoch": 0.1016394243690293, "grad_norm": 4.81342887878418, "learning_rate": 9.999925517594343e-06, "loss": 0.5229, "step": 2080 }, { "epoch": 0.10212807544772655, "grad_norm": 4.663543224334717, "learning_rate": 9.999871692648587e-06, "loss": 0.5198, "step": 2090 }, { "epoch": 0.10261672652642381, "grad_norm": 3.905458927154541, "learning_rate": 9.999803320698692e-06, "loss": 0.5074, "step": 2100 }, { "epoch": 0.10310537760512106, "grad_norm": 7.694464206695557, "learning_rate": 9.999720401943584e-06, "loss": 0.503, "step": 2110 }, { "epoch": 0.10359402868381833, "grad_norm": 4.2866668701171875, "learning_rate": 9.999622936624515e-06, "loss": 0.5052, "step": 2120 }, { "epoch": 0.10408267976251558, "grad_norm": 7.022489070892334, "learning_rate": 9.999510925025058e-06, "loss": 0.5087, "step": 2130 }, { "epoch": 0.10457133084121283, "grad_norm": 2.201606273651123, "learning_rate": 9.999384367471108e-06, "loss": 0.5051, "step": 2140 }, { "epoch": 0.10505998191991009, "grad_norm": 4.468674659729004, "learning_rate": 9.99924326433088e-06, "loss": 0.5123, "step": 2150 }, { "epoch": 0.10554863299860734, "grad_norm": 3.214961528778076, "learning_rate": 9.999087616014909e-06, "loss": 0.5045, "step": 2160 }, { "epoch": 0.1060372840773046, "grad_norm": 8.839011192321777, "learning_rate": 9.998917422976053e-06, "loss": 0.5057, "step": 2170 }, { "epoch": 0.10652593515600185, "grad_norm": 3.3649775981903076, "learning_rate": 9.998732685709482e-06, "loss": 0.5026, "step": 2180 }, { "epoch": 0.10701458623469912, "grad_norm": 5.231264591217041, "learning_rate": 9.998533404752686e-06, "loss": 0.4967, "step": 2190 }, { "epoch": 0.10750323731339637, "grad_norm": 10.444920539855957, "learning_rate": 9.998319580685467e-06, "loss": 0.4978, "step": 2200 }, { "epoch": 0.10799188839209363, "grad_norm": 3.976793050765991, "learning_rate": 9.998091214129943e-06, "loss": 0.5012, "step": 2210 }, { "epoch": 0.10848053947079088, "grad_norm": 4.761758327484131, "learning_rate": 9.997848305750538e-06, "loss": 0.4948, "step": 2220 }, { "epoch": 0.10896919054948814, "grad_norm": 4.317152976989746, "learning_rate": 9.997590856253988e-06, "loss": 0.4991, "step": 2230 }, { "epoch": 0.10945784162818539, "grad_norm": 3.9865562915802, "learning_rate": 9.99731886638934e-06, "loss": 0.4973, "step": 2240 }, { "epoch": 0.10994649270688266, "grad_norm": 3.0519254207611084, "learning_rate": 9.997032336947938e-06, "loss": 0.4968, "step": 2250 }, { "epoch": 0.1104351437855799, "grad_norm": 3.462034225463867, "learning_rate": 9.996731268763434e-06, "loss": 0.4908, "step": 2260 }, { "epoch": 0.11092379486427716, "grad_norm": 4.285225868225098, "learning_rate": 9.996415662711779e-06, "loss": 0.4906, "step": 2270 }, { "epoch": 0.11141244594297442, "grad_norm": 2.549806833267212, "learning_rate": 9.996085519711218e-06, "loss": 0.4934, "step": 2280 }, { "epoch": 0.11190109702167167, "grad_norm": 6.287642478942871, "learning_rate": 9.995740840722297e-06, "loss": 0.4969, "step": 2290 }, { "epoch": 0.11238974810036893, "grad_norm": 6.043119430541992, "learning_rate": 9.99538162674785e-06, "loss": 0.4959, "step": 2300 }, { "epoch": 0.11287839917906618, "grad_norm": 3.221782922744751, "learning_rate": 9.995007878833001e-06, "loss": 0.4895, "step": 2310 }, { "epoch": 0.11336705025776345, "grad_norm": 7.820531368255615, "learning_rate": 9.994619598065162e-06, "loss": 0.4921, "step": 2320 }, { "epoch": 0.1138557013364607, "grad_norm": 1.8136892318725586, "learning_rate": 9.994216785574024e-06, "loss": 0.4893, "step": 2330 }, { "epoch": 0.11434435241515796, "grad_norm": 2.453530788421631, "learning_rate": 9.993799442531562e-06, "loss": 0.4874, "step": 2340 }, { "epoch": 0.11483300349385521, "grad_norm": 2.470960855484009, "learning_rate": 9.993367570152024e-06, "loss": 0.4876, "step": 2350 }, { "epoch": 0.11532165457255247, "grad_norm": 5.889760971069336, "learning_rate": 9.992921169691934e-06, "loss": 0.485, "step": 2360 }, { "epoch": 0.11581030565124972, "grad_norm": 1.6044597625732422, "learning_rate": 9.992460242450081e-06, "loss": 0.4857, "step": 2370 }, { "epoch": 0.11629895672994699, "grad_norm": 3.4553425312042236, "learning_rate": 9.991984789767521e-06, "loss": 0.4894, "step": 2380 }, { "epoch": 0.11678760780864424, "grad_norm": 4.581337928771973, "learning_rate": 9.991494813027576e-06, "loss": 0.4915, "step": 2390 }, { "epoch": 0.1172762588873415, "grad_norm": 3.9853124618530273, "learning_rate": 9.990990313655817e-06, "loss": 0.4885, "step": 2400 }, { "epoch": 0.11776490996603875, "grad_norm": 2.2269527912139893, "learning_rate": 9.990471293120074e-06, "loss": 0.4868, "step": 2410 }, { "epoch": 0.118253561044736, "grad_norm": 5.388997554779053, "learning_rate": 9.989937752930426e-06, "loss": 0.4958, "step": 2420 }, { "epoch": 0.11874221212343326, "grad_norm": 4.705722332000732, "learning_rate": 9.989389694639194e-06, "loss": 0.4916, "step": 2430 }, { "epoch": 0.11923086320213051, "grad_norm": 3.4011592864990234, "learning_rate": 9.988827119840937e-06, "loss": 0.4879, "step": 2440 }, { "epoch": 0.11971951428082778, "grad_norm": 4.242159366607666, "learning_rate": 9.98825003017246e-06, "loss": 0.4856, "step": 2450 }, { "epoch": 0.12020816535952503, "grad_norm": 3.563094139099121, "learning_rate": 9.987658427312785e-06, "loss": 0.4838, "step": 2460 }, { "epoch": 0.12069681643822229, "grad_norm": 3.6437556743621826, "learning_rate": 9.987052312983168e-06, "loss": 0.4803, "step": 2470 }, { "epoch": 0.12118546751691954, "grad_norm": 7.271683216094971, "learning_rate": 9.986431688947083e-06, "loss": 0.4855, "step": 2480 }, { "epoch": 0.1216741185956168, "grad_norm": 4.0858941078186035, "learning_rate": 9.98579655701022e-06, "loss": 0.4878, "step": 2490 }, { "epoch": 0.12216276967431405, "grad_norm": 4.186237335205078, "learning_rate": 9.985146919020483e-06, "loss": 0.4849, "step": 2500 }, { "epoch": 0.12216276967431405, "eval_loss": 0.46980607509613037, "eval_runtime": 728.0838, "eval_samples_per_second": 242.978, "eval_steps_per_second": 0.475, "step": 2500 }, { "epoch": 0.12265142075301132, "grad_norm": 4.360340595245361, "learning_rate": 9.984482776867975e-06, "loss": 0.4824, "step": 2510 }, { "epoch": 0.12314007183170857, "grad_norm": 2.920182228088379, "learning_rate": 9.983804132485003e-06, "loss": 0.4813, "step": 2520 }, { "epoch": 0.12362872291040583, "grad_norm": 2.6488723754882812, "learning_rate": 9.983110987846063e-06, "loss": 0.4811, "step": 2530 }, { "epoch": 0.12411737398910308, "grad_norm": 2.2960548400878906, "learning_rate": 9.982403344967847e-06, "loss": 0.4755, "step": 2540 }, { "epoch": 0.12460602506780034, "grad_norm": 2.8793044090270996, "learning_rate": 9.98168120590922e-06, "loss": 0.4792, "step": 2550 }, { "epoch": 0.12509467614649758, "grad_norm": 2.4910120964050293, "learning_rate": 9.980944572771231e-06, "loss": 0.4839, "step": 2560 }, { "epoch": 0.12558332722519486, "grad_norm": 6.9705891609191895, "learning_rate": 9.980193447697095e-06, "loss": 0.4792, "step": 2570 }, { "epoch": 0.1260719783038921, "grad_norm": 2.401073694229126, "learning_rate": 9.979427832872191e-06, "loss": 0.4788, "step": 2580 }, { "epoch": 0.12656062938258936, "grad_norm": 2.653182029724121, "learning_rate": 9.97864773052406e-06, "loss": 0.4804, "step": 2590 }, { "epoch": 0.1270492804612866, "grad_norm": 2.8506484031677246, "learning_rate": 9.977853142922386e-06, "loss": 0.4769, "step": 2600 }, { "epoch": 0.12753793153998388, "grad_norm": 3.2540268898010254, "learning_rate": 9.977044072379006e-06, "loss": 0.4797, "step": 2610 }, { "epoch": 0.12802658261868113, "grad_norm": 6.425643444061279, "learning_rate": 9.976220521247888e-06, "loss": 0.4872, "step": 2620 }, { "epoch": 0.12851523369737838, "grad_norm": 3.4844772815704346, "learning_rate": 9.975382491925137e-06, "loss": 0.4775, "step": 2630 }, { "epoch": 0.12900388477607563, "grad_norm": 2.7126948833465576, "learning_rate": 9.974529986848976e-06, "loss": 0.4795, "step": 2640 }, { "epoch": 0.1294925358547729, "grad_norm": 3.378321409225464, "learning_rate": 9.973663008499748e-06, "loss": 0.4851, "step": 2650 }, { "epoch": 0.12998118693347016, "grad_norm": 2.3212387561798096, "learning_rate": 9.972781559399906e-06, "loss": 0.4765, "step": 2660 }, { "epoch": 0.1304698380121674, "grad_norm": 3.0284295082092285, "learning_rate": 9.971885642114006e-06, "loss": 0.4779, "step": 2670 }, { "epoch": 0.13095848909086466, "grad_norm": 2.1346194744110107, "learning_rate": 9.970975259248696e-06, "loss": 0.4765, "step": 2680 }, { "epoch": 0.13144714016956194, "grad_norm": 2.0011963844299316, "learning_rate": 9.97005041345271e-06, "loss": 0.4813, "step": 2690 }, { "epoch": 0.1319357912482592, "grad_norm": 3.866771936416626, "learning_rate": 9.969111107416867e-06, "loss": 0.4766, "step": 2700 }, { "epoch": 0.13242444232695644, "grad_norm": 6.982947826385498, "learning_rate": 9.968157343874056e-06, "loss": 0.4773, "step": 2710 }, { "epoch": 0.1329130934056537, "grad_norm": 4.293519973754883, "learning_rate": 9.967189125599228e-06, "loss": 0.4818, "step": 2720 }, { "epoch": 0.13340174448435094, "grad_norm": 3.3985178470611572, "learning_rate": 9.966206455409386e-06, "loss": 0.4778, "step": 2730 }, { "epoch": 0.13389039556304821, "grad_norm": 1.5569087266921997, "learning_rate": 9.96520933616359e-06, "loss": 0.4737, "step": 2740 }, { "epoch": 0.13437904664174546, "grad_norm": 4.966946125030518, "learning_rate": 9.964197770762933e-06, "loss": 0.4762, "step": 2750 }, { "epoch": 0.13486769772044271, "grad_norm": 2.4373340606689453, "learning_rate": 9.96317176215054e-06, "loss": 0.4764, "step": 2760 }, { "epoch": 0.13535634879913996, "grad_norm": 4.127823352813721, "learning_rate": 9.962131313311555e-06, "loss": 0.4753, "step": 2770 }, { "epoch": 0.13584499987783724, "grad_norm": 2.2819466590881348, "learning_rate": 9.96107642727314e-06, "loss": 0.475, "step": 2780 }, { "epoch": 0.1363336509565345, "grad_norm": 5.689523696899414, "learning_rate": 9.960007107104462e-06, "loss": 0.4748, "step": 2790 }, { "epoch": 0.13682230203523174, "grad_norm": 2.3338582515716553, "learning_rate": 9.958923355916682e-06, "loss": 0.4774, "step": 2800 }, { "epoch": 0.137310953113929, "grad_norm": 5.458847522735596, "learning_rate": 9.95782517686294e-06, "loss": 0.474, "step": 2810 }, { "epoch": 0.13779960419262627, "grad_norm": 1.634664535522461, "learning_rate": 9.956712573138371e-06, "loss": 0.4737, "step": 2820 }, { "epoch": 0.13828825527132352, "grad_norm": 1.757805585861206, "learning_rate": 9.955585547980065e-06, "loss": 0.4713, "step": 2830 }, { "epoch": 0.13877690635002077, "grad_norm": 1.5585452318191528, "learning_rate": 9.954444104667071e-06, "loss": 0.4734, "step": 2840 }, { "epoch": 0.13926555742871802, "grad_norm": 8.348752975463867, "learning_rate": 9.953288246520393e-06, "loss": 0.4754, "step": 2850 }, { "epoch": 0.13975420850741527, "grad_norm": 2.4966542720794678, "learning_rate": 9.95211797690297e-06, "loss": 0.4719, "step": 2860 }, { "epoch": 0.14024285958611254, "grad_norm": 2.205169439315796, "learning_rate": 9.950933299219676e-06, "loss": 0.4705, "step": 2870 }, { "epoch": 0.1407315106648098, "grad_norm": 2.2777152061462402, "learning_rate": 9.949734216917301e-06, "loss": 0.4687, "step": 2880 }, { "epoch": 0.14122016174350704, "grad_norm": 1.1067817211151123, "learning_rate": 9.948520733484543e-06, "loss": 0.4673, "step": 2890 }, { "epoch": 0.1417088128222043, "grad_norm": 3.3773841857910156, "learning_rate": 9.947292852452003e-06, "loss": 0.4707, "step": 2900 }, { "epoch": 0.14219746390090157, "grad_norm": 1.1769728660583496, "learning_rate": 9.946050577392173e-06, "loss": 0.4703, "step": 2910 }, { "epoch": 0.14268611497959882, "grad_norm": 7.464486122131348, "learning_rate": 9.94479391191942e-06, "loss": 0.4723, "step": 2920 }, { "epoch": 0.14317476605829607, "grad_norm": 2.232747793197632, "learning_rate": 9.94352285968998e-06, "loss": 0.4735, "step": 2930 }, { "epoch": 0.14366341713699332, "grad_norm": 3.54618239402771, "learning_rate": 9.942237424401952e-06, "loss": 0.4695, "step": 2940 }, { "epoch": 0.1441520682156906, "grad_norm": 1.840293526649475, "learning_rate": 9.940937609795276e-06, "loss": 0.471, "step": 2950 }, { "epoch": 0.14464071929438785, "grad_norm": 3.1132638454437256, "learning_rate": 9.939623419651732e-06, "loss": 0.47, "step": 2960 }, { "epoch": 0.1451293703730851, "grad_norm": 1.120263695716858, "learning_rate": 9.93829485779492e-06, "loss": 0.47, "step": 2970 }, { "epoch": 0.14561802145178235, "grad_norm": 4.9828057289123535, "learning_rate": 9.936951928090266e-06, "loss": 0.4731, "step": 2980 }, { "epoch": 0.1461066725304796, "grad_norm": 2.0490236282348633, "learning_rate": 9.935594634444985e-06, "loss": 0.4707, "step": 2990 }, { "epoch": 0.14659532360917688, "grad_norm": 3.313997268676758, "learning_rate": 9.93422298080809e-06, "loss": 0.4675, "step": 3000 }, { "epoch": 0.14659532360917688, "eval_loss": 0.45171666145324707, "eval_runtime": 727.5975, "eval_samples_per_second": 243.14, "eval_steps_per_second": 0.476, "step": 3000 }, { "epoch": 0.14708397468787412, "grad_norm": 3.7955098152160645, "learning_rate": 9.932836971170375e-06, "loss": 0.4759, "step": 3010 }, { "epoch": 0.14757262576657137, "grad_norm": 0.955259382724762, "learning_rate": 9.931436609564402e-06, "loss": 0.4676, "step": 3020 }, { "epoch": 0.14806127684526862, "grad_norm": 1.6290405988693237, "learning_rate": 9.930021900064486e-06, "loss": 0.47, "step": 3030 }, { "epoch": 0.1485499279239659, "grad_norm": 4.106773376464844, "learning_rate": 9.928592846786693e-06, "loss": 0.4693, "step": 3040 }, { "epoch": 0.14903857900266315, "grad_norm": 1.7998560667037964, "learning_rate": 9.927149453888814e-06, "loss": 0.4679, "step": 3050 }, { "epoch": 0.1495272300813604, "grad_norm": 3.9935462474823, "learning_rate": 9.92569172557037e-06, "loss": 0.4675, "step": 3060 }, { "epoch": 0.15001588116005765, "grad_norm": 1.6421153545379639, "learning_rate": 9.924219666072584e-06, "loss": 0.469, "step": 3070 }, { "epoch": 0.15050453223875493, "grad_norm": 6.065347671508789, "learning_rate": 9.922733279678376e-06, "loss": 0.478, "step": 3080 }, { "epoch": 0.15099318331745218, "grad_norm": 4.049252986907959, "learning_rate": 9.921232570712351e-06, "loss": 0.4734, "step": 3090 }, { "epoch": 0.15148183439614943, "grad_norm": 3.9484283924102783, "learning_rate": 9.919717543540786e-06, "loss": 0.4702, "step": 3100 }, { "epoch": 0.15197048547484668, "grad_norm": 3.8022537231445312, "learning_rate": 9.918188202571615e-06, "loss": 0.4674, "step": 3110 }, { "epoch": 0.15245913655354396, "grad_norm": 3.4525346755981445, "learning_rate": 9.916644552254417e-06, "loss": 0.4724, "step": 3120 }, { "epoch": 0.1529477876322412, "grad_norm": 1.305325984954834, "learning_rate": 9.915086597080407e-06, "loss": 0.468, "step": 3130 }, { "epoch": 0.15343643871093846, "grad_norm": 2.5436055660247803, "learning_rate": 9.913514341582415e-06, "loss": 0.4706, "step": 3140 }, { "epoch": 0.1539250897896357, "grad_norm": 2.798241376876831, "learning_rate": 9.911927790334882e-06, "loss": 0.4695, "step": 3150 }, { "epoch": 0.15441374086833295, "grad_norm": 2.0094220638275146, "learning_rate": 9.910326947953838e-06, "loss": 0.4694, "step": 3160 }, { "epoch": 0.15490239194703023, "grad_norm": 2.610715389251709, "learning_rate": 9.908711819096897e-06, "loss": 0.4668, "step": 3170 }, { "epoch": 0.15539104302572748, "grad_norm": 4.6221232414245605, "learning_rate": 9.907082408463234e-06, "loss": 0.4679, "step": 3180 }, { "epoch": 0.15587969410442473, "grad_norm": 5.538655757904053, "learning_rate": 9.905438720793582e-06, "loss": 0.474, "step": 3190 }, { "epoch": 0.15636834518312198, "grad_norm": 2.6583926677703857, "learning_rate": 9.903780760870208e-06, "loss": 0.475, "step": 3200 }, { "epoch": 0.15685699626181926, "grad_norm": 4.67283821105957, "learning_rate": 9.902108533516907e-06, "loss": 0.4693, "step": 3210 }, { "epoch": 0.1573456473405165, "grad_norm": 2.7513134479522705, "learning_rate": 9.900422043598982e-06, "loss": 0.4675, "step": 3220 }, { "epoch": 0.15783429841921376, "grad_norm": 1.7802903652191162, "learning_rate": 9.898721296023234e-06, "loss": 0.466, "step": 3230 }, { "epoch": 0.158322949497911, "grad_norm": 2.868180751800537, "learning_rate": 9.89700629573795e-06, "loss": 0.4652, "step": 3240 }, { "epoch": 0.15881160057660829, "grad_norm": 2.2115590572357178, "learning_rate": 9.895277047732879e-06, "loss": 0.4649, "step": 3250 }, { "epoch": 0.15930025165530554, "grad_norm": 2.7699434757232666, "learning_rate": 9.893533557039223e-06, "loss": 0.466, "step": 3260 }, { "epoch": 0.15978890273400279, "grad_norm": 2.4520747661590576, "learning_rate": 9.891775828729628e-06, "loss": 0.4639, "step": 3270 }, { "epoch": 0.16027755381270004, "grad_norm": 2.2992360591888428, "learning_rate": 9.890003867918162e-06, "loss": 0.4643, "step": 3280 }, { "epoch": 0.16076620489139729, "grad_norm": 2.04976224899292, "learning_rate": 9.888217679760303e-06, "loss": 0.4649, "step": 3290 }, { "epoch": 0.16125485597009456, "grad_norm": 1.9434853792190552, "learning_rate": 9.886417269452918e-06, "loss": 0.4665, "step": 3300 }, { "epoch": 0.1617435070487918, "grad_norm": 2.6264779567718506, "learning_rate": 9.884602642234257e-06, "loss": 0.4647, "step": 3310 }, { "epoch": 0.16223215812748906, "grad_norm": 3.206934690475464, "learning_rate": 9.882773803383934e-06, "loss": 0.4675, "step": 3320 }, { "epoch": 0.1627208092061863, "grad_norm": 7.612506866455078, "learning_rate": 9.880930758222912e-06, "loss": 0.4728, "step": 3330 }, { "epoch": 0.1632094602848836, "grad_norm": 1.3091853857040405, "learning_rate": 9.879073512113487e-06, "loss": 0.4691, "step": 3340 }, { "epoch": 0.16369811136358084, "grad_norm": 3.0943753719329834, "learning_rate": 9.877202070459268e-06, "loss": 0.4657, "step": 3350 }, { "epoch": 0.1641867624422781, "grad_norm": 1.4435592889785767, "learning_rate": 9.87531643870517e-06, "loss": 0.465, "step": 3360 }, { "epoch": 0.16467541352097534, "grad_norm": 1.4803426265716553, "learning_rate": 9.87341662233739e-06, "loss": 0.4637, "step": 3370 }, { "epoch": 0.16516406459967262, "grad_norm": 0.7361840605735779, "learning_rate": 9.871502626883403e-06, "loss": 0.463, "step": 3380 }, { "epoch": 0.16565271567836987, "grad_norm": 36.265968322753906, "learning_rate": 9.869574457911925e-06, "loss": 0.4701, "step": 3390 }, { "epoch": 0.16614136675706712, "grad_norm": 1.7011586427688599, "learning_rate": 9.86763212103292e-06, "loss": 0.4701, "step": 3400 }, { "epoch": 0.16663001783576437, "grad_norm": 2.6717042922973633, "learning_rate": 9.865675621897571e-06, "loss": 0.4644, "step": 3410 }, { "epoch": 0.16711866891446162, "grad_norm": 2.1945064067840576, "learning_rate": 9.86370496619826e-06, "loss": 0.4641, "step": 3420 }, { "epoch": 0.1676073199931589, "grad_norm": 1.7178106307983398, "learning_rate": 9.861720159668566e-06, "loss": 0.4628, "step": 3430 }, { "epoch": 0.16809597107185614, "grad_norm": 1.945646047592163, "learning_rate": 9.85972120808323e-06, "loss": 0.4623, "step": 3440 }, { "epoch": 0.1685846221505534, "grad_norm": 1.8980379104614258, "learning_rate": 9.857708117258158e-06, "loss": 0.4621, "step": 3450 }, { "epoch": 0.16907327322925064, "grad_norm": 1.8674242496490479, "learning_rate": 9.855680893050384e-06, "loss": 0.4621, "step": 3460 }, { "epoch": 0.16956192430794792, "grad_norm": 1.6167813539505005, "learning_rate": 9.853639541358069e-06, "loss": 0.4629, "step": 3470 }, { "epoch": 0.17005057538664517, "grad_norm": 2.2286250591278076, "learning_rate": 9.851584068120477e-06, "loss": 0.4634, "step": 3480 }, { "epoch": 0.17053922646534242, "grad_norm": 1.4811843633651733, "learning_rate": 9.849514479317955e-06, "loss": 0.4614, "step": 3490 }, { "epoch": 0.17102787754403967, "grad_norm": 3.2708358764648438, "learning_rate": 9.84743078097192e-06, "loss": 0.4616, "step": 3500 }, { "epoch": 0.17102787754403967, "eval_loss": 0.4388451874256134, "eval_runtime": 729.7082, "eval_samples_per_second": 242.437, "eval_steps_per_second": 0.474, "step": 3500 }, { "epoch": 0.17151652862273695, "grad_norm": 2.3546407222747803, "learning_rate": 9.845332979144845e-06, "loss": 0.4629, "step": 3510 }, { "epoch": 0.1720051797014342, "grad_norm": 2.703920841217041, "learning_rate": 9.84322107994023e-06, "loss": 0.4624, "step": 3520 }, { "epoch": 0.17249383078013145, "grad_norm": 2.2422356605529785, "learning_rate": 9.841095089502595e-06, "loss": 0.4625, "step": 3530 }, { "epoch": 0.1729824818588287, "grad_norm": 1.0636337995529175, "learning_rate": 9.838955014017455e-06, "loss": 0.46, "step": 3540 }, { "epoch": 0.17347113293752595, "grad_norm": 3.9872353076934814, "learning_rate": 9.836800859711311e-06, "loss": 0.4601, "step": 3550 }, { "epoch": 0.17395978401622322, "grad_norm": 1.2745929956436157, "learning_rate": 9.83463263285162e-06, "loss": 0.4628, "step": 3560 }, { "epoch": 0.17444843509492047, "grad_norm": 2.2762491703033447, "learning_rate": 9.832450339746785e-06, "loss": 0.4622, "step": 3570 }, { "epoch": 0.17493708617361772, "grad_norm": 1.6486016511917114, "learning_rate": 9.830253986746134e-06, "loss": 0.4699, "step": 3580 }, { "epoch": 0.17542573725231497, "grad_norm": 1.5666919946670532, "learning_rate": 9.8280435802399e-06, "loss": 0.4646, "step": 3590 }, { "epoch": 0.17591438833101225, "grad_norm": 1.8892680406570435, "learning_rate": 9.825819126659214e-06, "loss": 0.4646, "step": 3600 }, { "epoch": 0.1764030394097095, "grad_norm": 2.9722862243652344, "learning_rate": 9.823580632476062e-06, "loss": 0.4598, "step": 3610 }, { "epoch": 0.17689169048840675, "grad_norm": 2.4820001125335693, "learning_rate": 9.82132810420329e-06, "loss": 0.4629, "step": 3620 }, { "epoch": 0.177380341567104, "grad_norm": 2.4330859184265137, "learning_rate": 9.819061548394574e-06, "loss": 0.4611, "step": 3630 }, { "epoch": 0.17786899264580128, "grad_norm": 3.4170515537261963, "learning_rate": 9.816780971644403e-06, "loss": 0.4647, "step": 3640 }, { "epoch": 0.17835764372449853, "grad_norm": 1.6359800100326538, "learning_rate": 9.814486380588058e-06, "loss": 0.4629, "step": 3650 }, { "epoch": 0.17884629480319578, "grad_norm": 1.80881929397583, "learning_rate": 9.812177781901597e-06, "loss": 0.4607, "step": 3660 }, { "epoch": 0.17933494588189303, "grad_norm": 1.8495383262634277, "learning_rate": 9.80985518230183e-06, "loss": 0.4598, "step": 3670 }, { "epoch": 0.1798235969605903, "grad_norm": 2.4074761867523193, "learning_rate": 9.807518588546305e-06, "loss": 0.4609, "step": 3680 }, { "epoch": 0.18031224803928755, "grad_norm": 3.074289321899414, "learning_rate": 9.805168007433283e-06, "loss": 0.4599, "step": 3690 }, { "epoch": 0.1808008991179848, "grad_norm": 3.443209648132324, "learning_rate": 9.802803445801723e-06, "loss": 0.4589, "step": 3700 }, { "epoch": 0.18128955019668205, "grad_norm": 1.7589000463485718, "learning_rate": 9.800424910531256e-06, "loss": 0.4608, "step": 3710 }, { "epoch": 0.1817782012753793, "grad_norm": 1.6772186756134033, "learning_rate": 9.798032408542177e-06, "loss": 0.4614, "step": 3720 }, { "epoch": 0.18226685235407658, "grad_norm": 2.250244617462158, "learning_rate": 9.79562594679541e-06, "loss": 0.4601, "step": 3730 }, { "epoch": 0.18275550343277383, "grad_norm": 1.436660647392273, "learning_rate": 9.793205532292496e-06, "loss": 0.459, "step": 3740 }, { "epoch": 0.18324415451147108, "grad_norm": 2.040019989013672, "learning_rate": 9.79077117207557e-06, "loss": 0.4691, "step": 3750 }, { "epoch": 0.18373280559016833, "grad_norm": 2.091820240020752, "learning_rate": 9.788322873227347e-06, "loss": 0.4624, "step": 3760 }, { "epoch": 0.1842214566688656, "grad_norm": 2.1219372749328613, "learning_rate": 9.78586064287109e-06, "loss": 0.4614, "step": 3770 }, { "epoch": 0.18471010774756286, "grad_norm": 1.5753206014633179, "learning_rate": 9.783384488170598e-06, "loss": 0.4635, "step": 3780 }, { "epoch": 0.1851987588262601, "grad_norm": 2.6877732276916504, "learning_rate": 9.780894416330182e-06, "loss": 0.4626, "step": 3790 }, { "epoch": 0.18568740990495736, "grad_norm": 1.7835508584976196, "learning_rate": 9.778390434594647e-06, "loss": 0.461, "step": 3800 }, { "epoch": 0.18617606098365463, "grad_norm": 2.014145851135254, "learning_rate": 9.775872550249266e-06, "loss": 0.4595, "step": 3810 }, { "epoch": 0.18666471206235188, "grad_norm": 1.9438420534133911, "learning_rate": 9.77334077061976e-06, "loss": 0.459, "step": 3820 }, { "epoch": 0.18715336314104913, "grad_norm": 1.6419105529785156, "learning_rate": 9.770795103072281e-06, "loss": 0.4572, "step": 3830 }, { "epoch": 0.18764201421974638, "grad_norm": 1.0788559913635254, "learning_rate": 9.768235555013385e-06, "loss": 0.4582, "step": 3840 }, { "epoch": 0.18813066529844363, "grad_norm": 1.149911642074585, "learning_rate": 9.765662133890017e-06, "loss": 0.4573, "step": 3850 }, { "epoch": 0.1886193163771409, "grad_norm": 1.5427783727645874, "learning_rate": 9.763074847189483e-06, "loss": 0.4637, "step": 3860 }, { "epoch": 0.18910796745583816, "grad_norm": 2.3992674350738525, "learning_rate": 9.760473702439426e-06, "loss": 0.4629, "step": 3870 }, { "epoch": 0.1895966185345354, "grad_norm": 2.0136971473693848, "learning_rate": 9.757858707207815e-06, "loss": 0.4584, "step": 3880 }, { "epoch": 0.19008526961323266, "grad_norm": 0.9144098162651062, "learning_rate": 9.755229869102916e-06, "loss": 0.4597, "step": 3890 }, { "epoch": 0.19057392069192994, "grad_norm": 4.437480449676514, "learning_rate": 9.752587195773268e-06, "loss": 0.4584, "step": 3900 }, { "epoch": 0.1910625717706272, "grad_norm": 2.2352585792541504, "learning_rate": 9.749930694907666e-06, "loss": 0.4584, "step": 3910 }, { "epoch": 0.19155122284932444, "grad_norm": 1.6085118055343628, "learning_rate": 9.74726037423513e-06, "loss": 0.4598, "step": 3920 }, { "epoch": 0.1920398739280217, "grad_norm": 0.8404253721237183, "learning_rate": 9.744576241524895e-06, "loss": 0.4571, "step": 3930 }, { "epoch": 0.19252852500671896, "grad_norm": 1.5468897819519043, "learning_rate": 9.741878304586379e-06, "loss": 0.4586, "step": 3940 }, { "epoch": 0.19301717608541621, "grad_norm": 3.8875999450683594, "learning_rate": 9.739166571269166e-06, "loss": 0.4601, "step": 3950 }, { "epoch": 0.19350582716411346, "grad_norm": 2.8351383209228516, "learning_rate": 9.736441049462973e-06, "loss": 0.4598, "step": 3960 }, { "epoch": 0.19399447824281071, "grad_norm": 2.2148053646087646, "learning_rate": 9.733701747097641e-06, "loss": 0.4604, "step": 3970 }, { "epoch": 0.19448312932150796, "grad_norm": 2.287990093231201, "learning_rate": 9.730948672143105e-06, "loss": 0.4576, "step": 3980 }, { "epoch": 0.19497178040020524, "grad_norm": 1.138305902481079, "learning_rate": 9.728181832609366e-06, "loss": 0.458, "step": 3990 }, { "epoch": 0.1954604314789025, "grad_norm": 0.8007479906082153, "learning_rate": 9.725401236546476e-06, "loss": 0.4593, "step": 4000 }, { "epoch": 0.1954604314789025, "eval_loss": 0.43675804138183594, "eval_runtime": 728.6855, "eval_samples_per_second": 242.777, "eval_steps_per_second": 0.475, "step": 4000 }, { "epoch": 0.19594908255759974, "grad_norm": 3.4049859046936035, "learning_rate": 9.722606892044516e-06, "loss": 0.4573, "step": 4010 }, { "epoch": 0.196437733636297, "grad_norm": 1.6767579317092896, "learning_rate": 9.719798807233555e-06, "loss": 0.461, "step": 4020 }, { "epoch": 0.19692638471499427, "grad_norm": 1.4466297626495361, "learning_rate": 9.716976990283654e-06, "loss": 0.4629, "step": 4030 }, { "epoch": 0.19741503579369152, "grad_norm": 8.470952033996582, "learning_rate": 9.714141449404815e-06, "loss": 0.4857, "step": 4040 }, { "epoch": 0.19790368687238877, "grad_norm": 1.1967353820800781, "learning_rate": 9.711292192846979e-06, "loss": 0.4613, "step": 4050 }, { "epoch": 0.19839233795108602, "grad_norm": 1.025749921798706, "learning_rate": 9.708429228899984e-06, "loss": 0.4579, "step": 4060 }, { "epoch": 0.1988809890297833, "grad_norm": 2.6049964427948, "learning_rate": 9.705552565893557e-06, "loss": 0.46, "step": 4070 }, { "epoch": 0.19936964010848054, "grad_norm": 1.4117764234542847, "learning_rate": 9.702662212197277e-06, "loss": 0.4598, "step": 4080 }, { "epoch": 0.1998582911871778, "grad_norm": 1.602464199066162, "learning_rate": 9.699758176220558e-06, "loss": 0.4579, "step": 4090 }, { "epoch": 0.20034694226587504, "grad_norm": 2.6832380294799805, "learning_rate": 9.696840466412619e-06, "loss": 0.4582, "step": 4100 }, { "epoch": 0.20083559334457232, "grad_norm": 1.2473195791244507, "learning_rate": 9.693909091262467e-06, "loss": 0.457, "step": 4110 }, { "epoch": 0.20132424442326957, "grad_norm": 1.5877009630203247, "learning_rate": 9.690964059298866e-06, "loss": 0.4565, "step": 4120 }, { "epoch": 0.20181289550196682, "grad_norm": 2.6137261390686035, "learning_rate": 9.688005379090315e-06, "loss": 0.4566, "step": 4130 }, { "epoch": 0.20230154658066407, "grad_norm": 2.4244110584259033, "learning_rate": 9.68503305924502e-06, "loss": 0.4567, "step": 4140 }, { "epoch": 0.20279019765936132, "grad_norm": 2.0475914478302, "learning_rate": 9.682047108410875e-06, "loss": 0.458, "step": 4150 }, { "epoch": 0.2032788487380586, "grad_norm": 0.8052435517311096, "learning_rate": 9.679047535275427e-06, "loss": 0.4567, "step": 4160 }, { "epoch": 0.20376749981675585, "grad_norm": 3.230631113052368, "learning_rate": 9.676034348565865e-06, "loss": 0.4569, "step": 4170 }, { "epoch": 0.2042561508954531, "grad_norm": 2.166372776031494, "learning_rate": 9.673007557048981e-06, "loss": 0.4564, "step": 4180 }, { "epoch": 0.20474480197415035, "grad_norm": 1.2645494937896729, "learning_rate": 9.669967169531148e-06, "loss": 0.4547, "step": 4190 }, { "epoch": 0.20523345305284763, "grad_norm": 2.206819772720337, "learning_rate": 9.666913194858301e-06, "loss": 0.4563, "step": 4200 }, { "epoch": 0.20572210413154488, "grad_norm": 0.8847692608833313, "learning_rate": 9.663845641915901e-06, "loss": 0.4581, "step": 4210 }, { "epoch": 0.20621075521024212, "grad_norm": 2.756206512451172, "learning_rate": 9.660764519628925e-06, "loss": 0.458, "step": 4220 }, { "epoch": 0.20669940628893937, "grad_norm": 8.863795280456543, "learning_rate": 9.657669836961816e-06, "loss": 0.458, "step": 4230 }, { "epoch": 0.20718805736763665, "grad_norm": 1.3385523557662964, "learning_rate": 9.654561602918481e-06, "loss": 0.4597, "step": 4240 }, { "epoch": 0.2076767084463339, "grad_norm": 0.9093275666236877, "learning_rate": 9.651439826542252e-06, "loss": 0.4561, "step": 4250 }, { "epoch": 0.20816535952503115, "grad_norm": 5.106472492218018, "learning_rate": 9.648304516915856e-06, "loss": 0.457, "step": 4260 }, { "epoch": 0.2086540106037284, "grad_norm": 1.5481184720993042, "learning_rate": 9.645155683161405e-06, "loss": 0.4607, "step": 4270 }, { "epoch": 0.20914266168242565, "grad_norm": 1.1324431896209717, "learning_rate": 9.641993334440349e-06, "loss": 0.4578, "step": 4280 }, { "epoch": 0.20963131276112293, "grad_norm": 2.1077959537506104, "learning_rate": 9.638817479953466e-06, "loss": 0.4551, "step": 4290 }, { "epoch": 0.21011996383982018, "grad_norm": 1.6443114280700684, "learning_rate": 9.635628128940827e-06, "loss": 0.4564, "step": 4300 }, { "epoch": 0.21060861491851743, "grad_norm": 6.366663455963135, "learning_rate": 9.632425290681771e-06, "loss": 0.455, "step": 4310 }, { "epoch": 0.21109726599721468, "grad_norm": 1.1640760898590088, "learning_rate": 9.629208974494876e-06, "loss": 0.4568, "step": 4320 }, { "epoch": 0.21158591707591196, "grad_norm": 1.8168015480041504, "learning_rate": 9.625979189737935e-06, "loss": 0.4551, "step": 4330 }, { "epoch": 0.2120745681546092, "grad_norm": 1.9787592887878418, "learning_rate": 9.62273594580793e-06, "loss": 0.4578, "step": 4340 }, { "epoch": 0.21256321923330646, "grad_norm": 1.0346274375915527, "learning_rate": 9.619479252141e-06, "loss": 0.4559, "step": 4350 }, { "epoch": 0.2130518703120037, "grad_norm": 1.2450488805770874, "learning_rate": 9.61620911821241e-06, "loss": 0.454, "step": 4360 }, { "epoch": 0.21354052139070098, "grad_norm": 6.733485221862793, "learning_rate": 9.61292555353654e-06, "loss": 0.4844, "step": 4370 }, { "epoch": 0.21402917246939823, "grad_norm": 3.55191707611084, "learning_rate": 9.609628567666838e-06, "loss": 0.4783, "step": 4380 }, { "epoch": 0.21451782354809548, "grad_norm": 1.3721413612365723, "learning_rate": 9.606318170195805e-06, "loss": 0.4614, "step": 4390 }, { "epoch": 0.21500647462679273, "grad_norm": 1.7321209907531738, "learning_rate": 9.602994370754962e-06, "loss": 0.457, "step": 4400 }, { "epoch": 0.21549512570548998, "grad_norm": 4.538996696472168, "learning_rate": 9.599657179014821e-06, "loss": 0.4573, "step": 4410 }, { "epoch": 0.21598377678418726, "grad_norm": 1.3797237873077393, "learning_rate": 9.596306604684859e-06, "loss": 0.4569, "step": 4420 }, { "epoch": 0.2164724278628845, "grad_norm": 1.5394078493118286, "learning_rate": 9.59294265751349e-06, "loss": 0.454, "step": 4430 }, { "epoch": 0.21696107894158176, "grad_norm": 5.713876247406006, "learning_rate": 9.589565347288036e-06, "loss": 0.4559, "step": 4440 }, { "epoch": 0.217449730020279, "grad_norm": 1.5455598831176758, "learning_rate": 9.5861746838347e-06, "loss": 0.4556, "step": 4450 }, { "epoch": 0.21793838109897629, "grad_norm": 1.5440407991409302, "learning_rate": 9.58277067701853e-06, "loss": 0.4548, "step": 4460 }, { "epoch": 0.21842703217767354, "grad_norm": 2.495877981185913, "learning_rate": 9.579353336743406e-06, "loss": 0.4551, "step": 4470 }, { "epoch": 0.21891568325637079, "grad_norm": 1.8251252174377441, "learning_rate": 9.575922672951992e-06, "loss": 0.4543, "step": 4480 }, { "epoch": 0.21940433433506804, "grad_norm": 1.808957815170288, "learning_rate": 9.572478695625722e-06, "loss": 0.4533, "step": 4490 }, { "epoch": 0.2198929854137653, "grad_norm": 1.994743824005127, "learning_rate": 9.56902141478476e-06, "loss": 0.4536, "step": 4500 }, { "epoch": 0.2198929854137653, "eval_loss": 0.42861247062683105, "eval_runtime": 729.5786, "eval_samples_per_second": 242.48, "eval_steps_per_second": 0.474, "step": 4500 }, { "epoch": 0.22038163649246256, "grad_norm": 1.9364984035491943, "learning_rate": 9.565550840487987e-06, "loss": 0.4548, "step": 4510 }, { "epoch": 0.2208702875711598, "grad_norm": 2.0791361331939697, "learning_rate": 9.562066982832945e-06, "loss": 0.4546, "step": 4520 }, { "epoch": 0.22135893864985706, "grad_norm": 1.759068489074707, "learning_rate": 9.55856985195584e-06, "loss": 0.455, "step": 4530 }, { "epoch": 0.2218475897285543, "grad_norm": 1.7903980016708374, "learning_rate": 9.555059458031485e-06, "loss": 0.4536, "step": 4540 }, { "epoch": 0.2223362408072516, "grad_norm": 1.3520255088806152, "learning_rate": 9.551535811273285e-06, "loss": 0.4521, "step": 4550 }, { "epoch": 0.22282489188594884, "grad_norm": 1.4286073446273804, "learning_rate": 9.547998921933203e-06, "loss": 0.4541, "step": 4560 }, { "epoch": 0.2233135429646461, "grad_norm": 1.2026102542877197, "learning_rate": 9.544448800301736e-06, "loss": 0.4531, "step": 4570 }, { "epoch": 0.22380219404334334, "grad_norm": 3.257838010787964, "learning_rate": 9.54088545670787e-06, "loss": 0.4614, "step": 4580 }, { "epoch": 0.22429084512204062, "grad_norm": 1.2527670860290527, "learning_rate": 9.537308901519073e-06, "loss": 0.4606, "step": 4590 }, { "epoch": 0.22477949620073787, "grad_norm": 4.201780319213867, "learning_rate": 9.533719145141239e-06, "loss": 0.4577, "step": 4600 }, { "epoch": 0.22526814727943512, "grad_norm": 1.9157164096832275, "learning_rate": 9.530116198018677e-06, "loss": 0.4566, "step": 4610 }, { "epoch": 0.22575679835813237, "grad_norm": 1.9841718673706055, "learning_rate": 9.526500070634075e-06, "loss": 0.4561, "step": 4620 }, { "epoch": 0.22624544943682964, "grad_norm": 1.592416524887085, "learning_rate": 9.522870773508466e-06, "loss": 0.4538, "step": 4630 }, { "epoch": 0.2267341005155269, "grad_norm": 1.8579721450805664, "learning_rate": 9.519228317201201e-06, "loss": 0.4565, "step": 4640 }, { "epoch": 0.22722275159422414, "grad_norm": 0.8354905247688293, "learning_rate": 9.51557271230992e-06, "loss": 0.4535, "step": 4650 }, { "epoch": 0.2277114026729214, "grad_norm": 1.6300770044326782, "learning_rate": 9.51190396947051e-06, "loss": 0.4555, "step": 4660 }, { "epoch": 0.22820005375161867, "grad_norm": 2.1904120445251465, "learning_rate": 9.508222099357094e-06, "loss": 0.455, "step": 4670 }, { "epoch": 0.22868870483031592, "grad_norm": 2.8534996509552, "learning_rate": 9.504527112681978e-06, "loss": 0.4551, "step": 4680 }, { "epoch": 0.22917735590901317, "grad_norm": 1.0719540119171143, "learning_rate": 9.50081902019564e-06, "loss": 0.4531, "step": 4690 }, { "epoch": 0.22966600698771042, "grad_norm": 1.4179500341415405, "learning_rate": 9.497097832686682e-06, "loss": 0.4531, "step": 4700 }, { "epoch": 0.23015465806640767, "grad_norm": 3.2865960597991943, "learning_rate": 9.493363560981808e-06, "loss": 0.4531, "step": 4710 }, { "epoch": 0.23064330914510495, "grad_norm": 1.4232662916183472, "learning_rate": 9.489616215945788e-06, "loss": 0.4542, "step": 4720 }, { "epoch": 0.2311319602238022, "grad_norm": 1.7004929780960083, "learning_rate": 9.485855808481434e-06, "loss": 0.4537, "step": 4730 }, { "epoch": 0.23162061130249945, "grad_norm": 1.8315871953964233, "learning_rate": 9.482082349529558e-06, "loss": 0.4544, "step": 4740 }, { "epoch": 0.2321092623811967, "grad_norm": 1.7571625709533691, "learning_rate": 9.478295850068945e-06, "loss": 0.4528, "step": 4750 }, { "epoch": 0.23259791345989397, "grad_norm": 2.470423936843872, "learning_rate": 9.474496321116324e-06, "loss": 0.4523, "step": 4760 }, { "epoch": 0.23308656453859122, "grad_norm": 1.8932669162750244, "learning_rate": 9.470683773726331e-06, "loss": 0.4543, "step": 4770 }, { "epoch": 0.23357521561728847, "grad_norm": 0.8342353105545044, "learning_rate": 9.466858218991477e-06, "loss": 0.4537, "step": 4780 }, { "epoch": 0.23406386669598572, "grad_norm": 5.9539055824279785, "learning_rate": 9.463019668042123e-06, "loss": 0.4672, "step": 4790 }, { "epoch": 0.234552517774683, "grad_norm": 1.770120620727539, "learning_rate": 9.459168132046438e-06, "loss": 0.4571, "step": 4800 }, { "epoch": 0.23504116885338025, "grad_norm": 1.4648096561431885, "learning_rate": 9.455303622210371e-06, "loss": 0.4557, "step": 4810 }, { "epoch": 0.2355298199320775, "grad_norm": 1.6342428922653198, "learning_rate": 9.451426149777617e-06, "loss": 0.4531, "step": 4820 }, { "epoch": 0.23601847101077475, "grad_norm": 4.125144958496094, "learning_rate": 9.447535726029593e-06, "loss": 0.4532, "step": 4830 }, { "epoch": 0.236507122089472, "grad_norm": 1.3270002603530884, "learning_rate": 9.443632362285385e-06, "loss": 0.4571, "step": 4840 }, { "epoch": 0.23699577316816928, "grad_norm": 1.3583444356918335, "learning_rate": 9.439716069901735e-06, "loss": 0.4553, "step": 4850 }, { "epoch": 0.23748442424686653, "grad_norm": 1.0656195878982544, "learning_rate": 9.435786860273003e-06, "loss": 0.4501, "step": 4860 }, { "epoch": 0.23797307532556378, "grad_norm": 1.0093538761138916, "learning_rate": 9.431844744831126e-06, "loss": 0.4525, "step": 4870 }, { "epoch": 0.23846172640426103, "grad_norm": 1.5308141708374023, "learning_rate": 9.427889735045593e-06, "loss": 0.4533, "step": 4880 }, { "epoch": 0.2389503774829583, "grad_norm": 1.855191707611084, "learning_rate": 9.423921842423406e-06, "loss": 0.454, "step": 4890 }, { "epoch": 0.23943902856165555, "grad_norm": 1.1727728843688965, "learning_rate": 9.419941078509054e-06, "loss": 0.4523, "step": 4900 }, { "epoch": 0.2399276796403528, "grad_norm": 0.5924420356750488, "learning_rate": 9.415947454884471e-06, "loss": 0.4522, "step": 4910 }, { "epoch": 0.24041633071905005, "grad_norm": 2.744570732116699, "learning_rate": 9.411940983169006e-06, "loss": 0.4529, "step": 4920 }, { "epoch": 0.24090498179774733, "grad_norm": 1.7564640045166016, "learning_rate": 9.407921675019393e-06, "loss": 0.4532, "step": 4930 }, { "epoch": 0.24139363287644458, "grad_norm": 1.4309080839157104, "learning_rate": 9.403889542129707e-06, "loss": 0.4533, "step": 4940 }, { "epoch": 0.24188228395514183, "grad_norm": 1.6379081010818481, "learning_rate": 9.399844596231343e-06, "loss": 0.4515, "step": 4950 }, { "epoch": 0.24237093503383908, "grad_norm": 0.7086363434791565, "learning_rate": 9.39578684909297e-06, "loss": 0.4527, "step": 4960 }, { "epoch": 0.24285958611253633, "grad_norm": 0.987898051738739, "learning_rate": 9.391716312520503e-06, "loss": 0.453, "step": 4970 }, { "epoch": 0.2433482371912336, "grad_norm": 1.1785643100738525, "learning_rate": 9.387632998357073e-06, "loss": 0.4532, "step": 4980 }, { "epoch": 0.24383688826993086, "grad_norm": 2.234311819076538, "learning_rate": 9.383536918482976e-06, "loss": 0.4541, "step": 4990 }, { "epoch": 0.2443255393486281, "grad_norm": 0.9595701098442078, "learning_rate": 9.37942808481566e-06, "loss": 0.4532, "step": 5000 }, { "epoch": 0.2443255393486281, "eval_loss": 0.42559880018234253, "eval_runtime": 729.4388, "eval_samples_per_second": 242.526, "eval_steps_per_second": 0.474, "step": 5000 }, { "epoch": 0.24481419042732536, "grad_norm": 1.8065398931503296, "learning_rate": 9.375306509309676e-06, "loss": 0.4532, "step": 5010 }, { "epoch": 0.24530284150602263, "grad_norm": 1.7283066511154175, "learning_rate": 9.371172203956646e-06, "loss": 0.4534, "step": 5020 }, { "epoch": 0.24579149258471988, "grad_norm": 1.2136019468307495, "learning_rate": 9.367025180785229e-06, "loss": 0.4536, "step": 5030 }, { "epoch": 0.24628014366341713, "grad_norm": 0.9906538724899292, "learning_rate": 9.36286545186109e-06, "loss": 0.4536, "step": 5040 }, { "epoch": 0.24676879474211438, "grad_norm": 1.390766978263855, "learning_rate": 9.358693029286855e-06, "loss": 0.4514, "step": 5050 }, { "epoch": 0.24725744582081166, "grad_norm": 1.2268085479736328, "learning_rate": 9.354507925202088e-06, "loss": 0.4516, "step": 5060 }, { "epoch": 0.2477460968995089, "grad_norm": 2.122887372970581, "learning_rate": 9.350310151783244e-06, "loss": 0.4491, "step": 5070 }, { "epoch": 0.24823474797820616, "grad_norm": 1.7110397815704346, "learning_rate": 9.346099721243646e-06, "loss": 0.4522, "step": 5080 }, { "epoch": 0.2487233990569034, "grad_norm": 0.9016057252883911, "learning_rate": 9.341876645833434e-06, "loss": 0.4515, "step": 5090 }, { "epoch": 0.2492120501356007, "grad_norm": 1.6355116367340088, "learning_rate": 9.337640937839544e-06, "loss": 0.4545, "step": 5100 }, { "epoch": 0.24970070121429794, "grad_norm": 0.9655557870864868, "learning_rate": 9.333392609585667e-06, "loss": 0.455, "step": 5110 }, { "epoch": 0.25018935229299516, "grad_norm": 1.8593004941940308, "learning_rate": 9.329131673432208e-06, "loss": 0.4522, "step": 5120 }, { "epoch": 0.25067800337169244, "grad_norm": 1.764728307723999, "learning_rate": 9.324858141776254e-06, "loss": 0.4541, "step": 5130 }, { "epoch": 0.2511666544503897, "grad_norm": 0.7554106116294861, "learning_rate": 9.320572027051544e-06, "loss": 0.4566, "step": 5140 }, { "epoch": 0.25165530552908694, "grad_norm": 2.081536054611206, "learning_rate": 9.316273341728423e-06, "loss": 0.4518, "step": 5150 }, { "epoch": 0.2521439566077842, "grad_norm": 0.8827464580535889, "learning_rate": 9.311962098313809e-06, "loss": 0.4502, "step": 5160 }, { "epoch": 0.2526326076864815, "grad_norm": 0.9885526895523071, "learning_rate": 9.307638309351162e-06, "loss": 0.4533, "step": 5170 }, { "epoch": 0.2531212587651787, "grad_norm": 1.7395132780075073, "learning_rate": 9.303301987420436e-06, "loss": 0.4516, "step": 5180 }, { "epoch": 0.253609909843876, "grad_norm": 0.679470419883728, "learning_rate": 9.298953145138057e-06, "loss": 0.4514, "step": 5190 }, { "epoch": 0.2540985609225732, "grad_norm": 1.4864360094070435, "learning_rate": 9.294591795156873e-06, "loss": 0.4502, "step": 5200 }, { "epoch": 0.2545872120012705, "grad_norm": 1.3630485534667969, "learning_rate": 9.290217950166125e-06, "loss": 0.4508, "step": 5210 }, { "epoch": 0.25507586307996777, "grad_norm": 1.3194668292999268, "learning_rate": 9.285831622891409e-06, "loss": 0.4511, "step": 5220 }, { "epoch": 0.255564514158665, "grad_norm": 1.602607250213623, "learning_rate": 9.281432826094635e-06, "loss": 0.4523, "step": 5230 }, { "epoch": 0.25605316523736227, "grad_norm": 0.8694007992744446, "learning_rate": 9.277021572573996e-06, "loss": 0.4522, "step": 5240 }, { "epoch": 0.2565418163160595, "grad_norm": 0.9949777722358704, "learning_rate": 9.272597875163925e-06, "loss": 0.4532, "step": 5250 }, { "epoch": 0.25703046739475677, "grad_norm": 1.0901665687561035, "learning_rate": 9.268161746735063e-06, "loss": 0.4509, "step": 5260 }, { "epoch": 0.25751911847345405, "grad_norm": 0.87218177318573, "learning_rate": 9.263713200194212e-06, "loss": 0.4506, "step": 5270 }, { "epoch": 0.25800776955215127, "grad_norm": 2.6825311183929443, "learning_rate": 9.259252248484317e-06, "loss": 0.4508, "step": 5280 }, { "epoch": 0.25849642063084854, "grad_norm": 1.7756139039993286, "learning_rate": 9.2547789045844e-06, "loss": 0.4524, "step": 5290 }, { "epoch": 0.2589850717095458, "grad_norm": 1.459425449371338, "learning_rate": 9.250293181509551e-06, "loss": 0.4525, "step": 5300 }, { "epoch": 0.25947372278824304, "grad_norm": 0.5840021371841431, "learning_rate": 9.245795092310867e-06, "loss": 0.4508, "step": 5310 }, { "epoch": 0.2599623738669403, "grad_norm": 1.1396574974060059, "learning_rate": 9.241284650075432e-06, "loss": 0.4498, "step": 5320 }, { "epoch": 0.26045102494563754, "grad_norm": 2.9981930255889893, "learning_rate": 9.236761867926264e-06, "loss": 0.4538, "step": 5330 }, { "epoch": 0.2609396760243348, "grad_norm": 1.627025842666626, "learning_rate": 9.23222675902229e-06, "loss": 0.4542, "step": 5340 }, { "epoch": 0.2614283271030321, "grad_norm": 2.1768600940704346, "learning_rate": 9.227679336558295e-06, "loss": 0.4514, "step": 5350 }, { "epoch": 0.2619169781817293, "grad_norm": 0.6379441618919373, "learning_rate": 9.223119613764895e-06, "loss": 0.4504, "step": 5360 }, { "epoch": 0.2624056292604266, "grad_norm": 1.7971820831298828, "learning_rate": 9.21854760390849e-06, "loss": 0.4503, "step": 5370 }, { "epoch": 0.2628942803391239, "grad_norm": 2.099776029586792, "learning_rate": 9.213963320291232e-06, "loss": 0.4509, "step": 5380 }, { "epoch": 0.2633829314178211, "grad_norm": 1.0017653703689575, "learning_rate": 9.209366776250984e-06, "loss": 0.4504, "step": 5390 }, { "epoch": 0.2638715824965184, "grad_norm": 1.0879532098770142, "learning_rate": 9.204757985161274e-06, "loss": 0.4501, "step": 5400 }, { "epoch": 0.2643602335752156, "grad_norm": 1.28214693069458, "learning_rate": 9.20013696043127e-06, "loss": 0.4483, "step": 5410 }, { "epoch": 0.2648488846539129, "grad_norm": 2.457913398742676, "learning_rate": 9.195503715505729e-06, "loss": 0.4517, "step": 5420 }, { "epoch": 0.26533753573261015, "grad_norm": 0.9251576662063599, "learning_rate": 9.190858263864963e-06, "loss": 0.4515, "step": 5430 }, { "epoch": 0.2658261868113074, "grad_norm": 1.5031663179397583, "learning_rate": 9.1862006190248e-06, "loss": 0.4499, "step": 5440 }, { "epoch": 0.26631483789000465, "grad_norm": 1.5385842323303223, "learning_rate": 9.181530794536544e-06, "loss": 0.4497, "step": 5450 }, { "epoch": 0.2668034889687019, "grad_norm": 1.0565071105957031, "learning_rate": 9.176848803986934e-06, "loss": 0.451, "step": 5460 }, { "epoch": 0.26729214004739915, "grad_norm": 0.9009528756141663, "learning_rate": 9.172154660998108e-06, "loss": 0.4507, "step": 5470 }, { "epoch": 0.26778079112609643, "grad_norm": 0.7359398007392883, "learning_rate": 9.167448379227558e-06, "loss": 0.4493, "step": 5480 }, { "epoch": 0.26826944220479365, "grad_norm": 4.481854438781738, "learning_rate": 9.162729972368098e-06, "loss": 0.4516, "step": 5490 }, { "epoch": 0.26875809328349093, "grad_norm": 1.0901057720184326, "learning_rate": 9.157999454147814e-06, "loss": 0.4518, "step": 5500 }, { "epoch": 0.26875809328349093, "eval_loss": 0.4273635745048523, "eval_runtime": 728.6534, "eval_samples_per_second": 242.788, "eval_steps_per_second": 0.475, "step": 5500 }, { "epoch": 0.2692467443621882, "grad_norm": 1.3341872692108154, "learning_rate": 9.153256838330035e-06, "loss": 0.4499, "step": 5510 }, { "epoch": 0.26973539544088543, "grad_norm": 1.7751141786575317, "learning_rate": 9.148502138713286e-06, "loss": 0.4491, "step": 5520 }, { "epoch": 0.2702240465195827, "grad_norm": 1.0976356267929077, "learning_rate": 9.143735369131249e-06, "loss": 0.4496, "step": 5530 }, { "epoch": 0.2707126975982799, "grad_norm": 2.7799429893493652, "learning_rate": 9.13895654345272e-06, "loss": 0.4501, "step": 5540 }, { "epoch": 0.2712013486769772, "grad_norm": 1.4997122287750244, "learning_rate": 9.134165675581579e-06, "loss": 0.4494, "step": 5550 }, { "epoch": 0.2716899997556745, "grad_norm": 1.3157509565353394, "learning_rate": 9.129362779456737e-06, "loss": 0.4505, "step": 5560 }, { "epoch": 0.2721786508343717, "grad_norm": 2.182624101638794, "learning_rate": 9.124547869052103e-06, "loss": 0.4499, "step": 5570 }, { "epoch": 0.272667301913069, "grad_norm": 0.6629562377929688, "learning_rate": 9.11972095837654e-06, "loss": 0.4501, "step": 5580 }, { "epoch": 0.2731559529917662, "grad_norm": 0.7715067863464355, "learning_rate": 9.114882061473827e-06, "loss": 0.4496, "step": 5590 }, { "epoch": 0.2736446040704635, "grad_norm": 1.0679346323013306, "learning_rate": 9.110031192422613e-06, "loss": 0.4488, "step": 5600 }, { "epoch": 0.27413325514916076, "grad_norm": 2.0973806381225586, "learning_rate": 9.105168365336389e-06, "loss": 0.4505, "step": 5610 }, { "epoch": 0.274621906227858, "grad_norm": 1.7515530586242676, "learning_rate": 9.100293594363425e-06, "loss": 0.4498, "step": 5620 }, { "epoch": 0.27511055730655526, "grad_norm": 1.3219352960586548, "learning_rate": 9.095406893686752e-06, "loss": 0.45, "step": 5630 }, { "epoch": 0.27559920838525254, "grad_norm": 1.7914499044418335, "learning_rate": 9.090508277524103e-06, "loss": 0.4506, "step": 5640 }, { "epoch": 0.27608785946394976, "grad_norm": 1.048553228378296, "learning_rate": 9.085597760127884e-06, "loss": 0.4479, "step": 5650 }, { "epoch": 0.27657651054264704, "grad_norm": 0.9424349069595337, "learning_rate": 9.080675355785123e-06, "loss": 0.4479, "step": 5660 }, { "epoch": 0.27706516162134426, "grad_norm": 2.2007129192352295, "learning_rate": 9.075741078817435e-06, "loss": 0.4517, "step": 5670 }, { "epoch": 0.27755381270004154, "grad_norm": 1.4200412034988403, "learning_rate": 9.070794943580978e-06, "loss": 0.4503, "step": 5680 }, { "epoch": 0.2780424637787388, "grad_norm": 3.359553575515747, "learning_rate": 9.065836964466412e-06, "loss": 0.4504, "step": 5690 }, { "epoch": 0.27853111485743604, "grad_norm": 1.0638636350631714, "learning_rate": 9.060867155898856e-06, "loss": 0.4503, "step": 5700 }, { "epoch": 0.2790197659361333, "grad_norm": 1.592399001121521, "learning_rate": 9.055885532337847e-06, "loss": 0.4485, "step": 5710 }, { "epoch": 0.27950841701483053, "grad_norm": 0.6336447596549988, "learning_rate": 9.050892108277292e-06, "loss": 0.4486, "step": 5720 }, { "epoch": 0.2799970680935278, "grad_norm": 2.1107187271118164, "learning_rate": 9.045886898245441e-06, "loss": 0.451, "step": 5730 }, { "epoch": 0.2804857191722251, "grad_norm": 1.656101107597351, "learning_rate": 9.040869916804827e-06, "loss": 0.4494, "step": 5740 }, { "epoch": 0.2809743702509223, "grad_norm": 1.3328661918640137, "learning_rate": 9.035841178552236e-06, "loss": 0.4492, "step": 5750 }, { "epoch": 0.2814630213296196, "grad_norm": 0.48556625843048096, "learning_rate": 9.030800698118658e-06, "loss": 0.4494, "step": 5760 }, { "epoch": 0.28195167240831687, "grad_norm": 2.595662832260132, "learning_rate": 9.025748490169248e-06, "loss": 0.4498, "step": 5770 }, { "epoch": 0.2824403234870141, "grad_norm": 0.8997907042503357, "learning_rate": 9.02068456940328e-06, "loss": 0.4482, "step": 5780 }, { "epoch": 0.28292897456571137, "grad_norm": 1.9101444482803345, "learning_rate": 9.01560895055411e-06, "loss": 0.4495, "step": 5790 }, { "epoch": 0.2834176256444086, "grad_norm": 0.7567463517189026, "learning_rate": 9.010521648389122e-06, "loss": 0.4501, "step": 5800 }, { "epoch": 0.28390627672310587, "grad_norm": 3.035726547241211, "learning_rate": 9.005422677709701e-06, "loss": 0.4499, "step": 5810 }, { "epoch": 0.28439492780180314, "grad_norm": 1.5301775932312012, "learning_rate": 9.000312053351175e-06, "loss": 0.4484, "step": 5820 }, { "epoch": 0.28488357888050037, "grad_norm": 1.8312554359436035, "learning_rate": 8.995189790182782e-06, "loss": 0.4486, "step": 5830 }, { "epoch": 0.28537222995919764, "grad_norm": 1.362288236618042, "learning_rate": 8.99005590310762e-06, "loss": 0.4497, "step": 5840 }, { "epoch": 0.28586088103789487, "grad_norm": 1.4402492046356201, "learning_rate": 8.984910407062608e-06, "loss": 0.4496, "step": 5850 }, { "epoch": 0.28634953211659214, "grad_norm": 0.9459155201911926, "learning_rate": 8.97975331701844e-06, "loss": 0.4485, "step": 5860 }, { "epoch": 0.2868381831952894, "grad_norm": 1.4187127351760864, "learning_rate": 8.974584647979546e-06, "loss": 0.449, "step": 5870 }, { "epoch": 0.28732683427398664, "grad_norm": 2.6295182704925537, "learning_rate": 8.969404414984035e-06, "loss": 0.4493, "step": 5880 }, { "epoch": 0.2878154853526839, "grad_norm": 1.6124824285507202, "learning_rate": 8.964212633103674e-06, "loss": 0.4496, "step": 5890 }, { "epoch": 0.2883041364313812, "grad_norm": 0.6683453321456909, "learning_rate": 8.959009317443825e-06, "loss": 0.4484, "step": 5900 }, { "epoch": 0.2887927875100784, "grad_norm": 1.6014492511749268, "learning_rate": 8.953794483143406e-06, "loss": 0.4483, "step": 5910 }, { "epoch": 0.2892814385887757, "grad_norm": 1.014033317565918, "learning_rate": 8.948568145374849e-06, "loss": 0.449, "step": 5920 }, { "epoch": 0.2897700896674729, "grad_norm": 1.4940074682235718, "learning_rate": 8.943330319344055e-06, "loss": 0.4496, "step": 5930 }, { "epoch": 0.2902587407461702, "grad_norm": 0.863261342048645, "learning_rate": 8.938081020290352e-06, "loss": 0.4495, "step": 5940 }, { "epoch": 0.2907473918248675, "grad_norm": 1.5809766054153442, "learning_rate": 8.932820263486447e-06, "loss": 0.4493, "step": 5950 }, { "epoch": 0.2912360429035647, "grad_norm": 0.7684280276298523, "learning_rate": 8.927548064238383e-06, "loss": 0.4492, "step": 5960 }, { "epoch": 0.291724693982262, "grad_norm": 2.1927716732025146, "learning_rate": 8.922264437885492e-06, "loss": 0.451, "step": 5970 }, { "epoch": 0.2922133450609592, "grad_norm": 1.0817362070083618, "learning_rate": 8.916969399800359e-06, "loss": 0.4506, "step": 5980 }, { "epoch": 0.2927019961396565, "grad_norm": 0.7948960661888123, "learning_rate": 8.911662965388765e-06, "loss": 0.4499, "step": 5990 }, { "epoch": 0.29319064721835375, "grad_norm": 0.9926490187644958, "learning_rate": 8.906345150089652e-06, "loss": 0.4486, "step": 6000 }, { "epoch": 0.29319064721835375, "eval_loss": 0.4233919382095337, "eval_runtime": 728.6738, "eval_samples_per_second": 242.781, "eval_steps_per_second": 0.475, "step": 6000 }, { "epoch": 0.293679298297051, "grad_norm": 1.070708155632019, "learning_rate": 8.901015969375074e-06, "loss": 0.4497, "step": 6010 }, { "epoch": 0.29416794937574825, "grad_norm": 1.2505017518997192, "learning_rate": 8.89567543875015e-06, "loss": 0.4479, "step": 6020 }, { "epoch": 0.2946566004544455, "grad_norm": 0.6546292304992676, "learning_rate": 8.890323573753023e-06, "loss": 0.4495, "step": 6030 }, { "epoch": 0.29514525153314275, "grad_norm": 5.781423091888428, "learning_rate": 8.884960389954813e-06, "loss": 0.4478, "step": 6040 }, { "epoch": 0.29563390261184, "grad_norm": 1.123044490814209, "learning_rate": 8.879585902959573e-06, "loss": 0.4493, "step": 6050 }, { "epoch": 0.29612255369053725, "grad_norm": 1.8155452013015747, "learning_rate": 8.874200128404242e-06, "loss": 0.4504, "step": 6060 }, { "epoch": 0.2966112047692345, "grad_norm": 1.4578708410263062, "learning_rate": 8.868803081958597e-06, "loss": 0.4503, "step": 6070 }, { "epoch": 0.2970998558479318, "grad_norm": 1.241621971130371, "learning_rate": 8.863394779325212e-06, "loss": 0.4495, "step": 6080 }, { "epoch": 0.297588506926629, "grad_norm": 0.9442185759544373, "learning_rate": 8.857975236239412e-06, "loss": 0.4484, "step": 6090 }, { "epoch": 0.2980771580053263, "grad_norm": 1.3439468145370483, "learning_rate": 8.852544468469224e-06, "loss": 0.4488, "step": 6100 }, { "epoch": 0.2985658090840235, "grad_norm": 2.7450032234191895, "learning_rate": 8.847102491815336e-06, "loss": 0.4488, "step": 6110 }, { "epoch": 0.2990544601627208, "grad_norm": 1.1001813411712646, "learning_rate": 8.841649322111044e-06, "loss": 0.4501, "step": 6120 }, { "epoch": 0.2995431112414181, "grad_norm": 0.6491206884384155, "learning_rate": 8.836184975222212e-06, "loss": 0.4474, "step": 6130 }, { "epoch": 0.3000317623201153, "grad_norm": 0.40915462374687195, "learning_rate": 8.830709467047223e-06, "loss": 0.4486, "step": 6140 }, { "epoch": 0.3005204133988126, "grad_norm": 0.9558333158493042, "learning_rate": 8.825222813516933e-06, "loss": 0.4468, "step": 6150 }, { "epoch": 0.30100906447750986, "grad_norm": 1.2985563278198242, "learning_rate": 8.819725030594626e-06, "loss": 0.4484, "step": 6160 }, { "epoch": 0.3014977155562071, "grad_norm": 1.1261284351348877, "learning_rate": 8.81421613427597e-06, "loss": 0.4493, "step": 6170 }, { "epoch": 0.30198636663490436, "grad_norm": 1.677819848060608, "learning_rate": 8.80869614058896e-06, "loss": 0.4476, "step": 6180 }, { "epoch": 0.3024750177136016, "grad_norm": 1.6651966571807861, "learning_rate": 8.803165065593884e-06, "loss": 0.4473, "step": 6190 }, { "epoch": 0.30296366879229886, "grad_norm": 0.8978771567344666, "learning_rate": 8.797622925383267e-06, "loss": 0.4478, "step": 6200 }, { "epoch": 0.30345231987099613, "grad_norm": 0.6011471748352051, "learning_rate": 8.792069736081835e-06, "loss": 0.4478, "step": 6210 }, { "epoch": 0.30394097094969336, "grad_norm": 3.1353821754455566, "learning_rate": 8.78650551384645e-06, "loss": 0.4515, "step": 6220 }, { "epoch": 0.30442962202839063, "grad_norm": 1.1291117668151855, "learning_rate": 8.780930274866084e-06, "loss": 0.4498, "step": 6230 }, { "epoch": 0.3049182731070879, "grad_norm": 0.6393253803253174, "learning_rate": 8.775344035361758e-06, "loss": 0.4489, "step": 6240 }, { "epoch": 0.30540692418578513, "grad_norm": 1.493739366531372, "learning_rate": 8.7697468115865e-06, "loss": 0.4498, "step": 6250 }, { "epoch": 0.3058955752644824, "grad_norm": 1.8243303298950195, "learning_rate": 8.76413861982529e-06, "loss": 0.4492, "step": 6260 }, { "epoch": 0.30638422634317963, "grad_norm": 0.7140172719955444, "learning_rate": 8.758519476395029e-06, "loss": 0.4478, "step": 6270 }, { "epoch": 0.3068728774218769, "grad_norm": 0.9651872515678406, "learning_rate": 8.752889397644478e-06, "loss": 0.4484, "step": 6280 }, { "epoch": 0.3073615285005742, "grad_norm": 0.4499496817588806, "learning_rate": 8.747248399954212e-06, "loss": 0.4475, "step": 6290 }, { "epoch": 0.3078501795792714, "grad_norm": 1.09201180934906, "learning_rate": 8.741596499736573e-06, "loss": 0.4491, "step": 6300 }, { "epoch": 0.3083388306579687, "grad_norm": 0.835132360458374, "learning_rate": 8.735933713435627e-06, "loss": 0.4479, "step": 6310 }, { "epoch": 0.3088274817366659, "grad_norm": 0.7163196802139282, "learning_rate": 8.730260057527116e-06, "loss": 0.4484, "step": 6320 }, { "epoch": 0.3093161328153632, "grad_norm": 1.1830068826675415, "learning_rate": 8.724575548518397e-06, "loss": 0.4475, "step": 6330 }, { "epoch": 0.30980478389406046, "grad_norm": 1.2740248441696167, "learning_rate": 8.718880202948414e-06, "loss": 0.447, "step": 6340 }, { "epoch": 0.3102934349727577, "grad_norm": 1.1490364074707031, "learning_rate": 8.713174037387633e-06, "loss": 0.447, "step": 6350 }, { "epoch": 0.31078208605145496, "grad_norm": 1.9249966144561768, "learning_rate": 8.707457068438004e-06, "loss": 0.4477, "step": 6360 }, { "epoch": 0.31127073713015224, "grad_norm": 1.1233280897140503, "learning_rate": 8.701729312732907e-06, "loss": 0.45, "step": 6370 }, { "epoch": 0.31175938820884946, "grad_norm": 0.5614790916442871, "learning_rate": 8.695990786937109e-06, "loss": 0.447, "step": 6380 }, { "epoch": 0.31224803928754674, "grad_norm": 0.8090300559997559, "learning_rate": 8.690241507746706e-06, "loss": 0.4493, "step": 6390 }, { "epoch": 0.31273669036624396, "grad_norm": 0.9170634746551514, "learning_rate": 8.68448149188909e-06, "loss": 0.4479, "step": 6400 }, { "epoch": 0.31322534144494124, "grad_norm": 0.8162520527839661, "learning_rate": 8.67871075612288e-06, "loss": 0.4473, "step": 6410 }, { "epoch": 0.3137139925236385, "grad_norm": 2.09964656829834, "learning_rate": 8.672929317237897e-06, "loss": 0.4466, "step": 6420 }, { "epoch": 0.31420264360233574, "grad_norm": 1.2079427242279053, "learning_rate": 8.667137192055093e-06, "loss": 0.4483, "step": 6430 }, { "epoch": 0.314691294681033, "grad_norm": 0.8319594860076904, "learning_rate": 8.661334397426511e-06, "loss": 0.4457, "step": 6440 }, { "epoch": 0.31517994575973024, "grad_norm": 1.2110413312911987, "learning_rate": 8.655520950235243e-06, "loss": 0.449, "step": 6450 }, { "epoch": 0.3156685968384275, "grad_norm": 1.1097526550292969, "learning_rate": 8.649696867395372e-06, "loss": 0.4482, "step": 6460 }, { "epoch": 0.3161572479171248, "grad_norm": 0.4162759482860565, "learning_rate": 8.643862165851922e-06, "loss": 0.4465, "step": 6470 }, { "epoch": 0.316645898995822, "grad_norm": 0.8267191052436829, "learning_rate": 8.638016862580814e-06, "loss": 0.4469, "step": 6480 }, { "epoch": 0.3171345500745193, "grad_norm": 1.518624186515808, "learning_rate": 8.632160974588817e-06, "loss": 0.4482, "step": 6490 }, { "epoch": 0.31762320115321657, "grad_norm": 0.7973819375038147, "learning_rate": 8.62629451891349e-06, "loss": 0.448, "step": 6500 }, { "epoch": 0.31762320115321657, "eval_loss": 0.4212668538093567, "eval_runtime": 728.4104, "eval_samples_per_second": 242.869, "eval_steps_per_second": 0.475, "step": 6500 }, { "epoch": 0.3181118522319138, "grad_norm": 1.7393572330474854, "learning_rate": 8.620417512623145e-06, "loss": 0.4462, "step": 6510 }, { "epoch": 0.31860050331061107, "grad_norm": 0.8156083226203918, "learning_rate": 8.614529972816787e-06, "loss": 0.4478, "step": 6520 }, { "epoch": 0.3190891543893083, "grad_norm": 0.6622930765151978, "learning_rate": 8.608631916624069e-06, "loss": 0.4468, "step": 6530 }, { "epoch": 0.31957780546800557, "grad_norm": 1.1308300495147705, "learning_rate": 8.602723361205241e-06, "loss": 0.4467, "step": 6540 }, { "epoch": 0.32006645654670285, "grad_norm": 0.8318139314651489, "learning_rate": 8.596804323751098e-06, "loss": 0.4471, "step": 6550 }, { "epoch": 0.32055510762540007, "grad_norm": 0.5246617794036865, "learning_rate": 8.590874821482937e-06, "loss": 0.446, "step": 6560 }, { "epoch": 0.32104375870409735, "grad_norm": 0.8752800226211548, "learning_rate": 8.584934871652498e-06, "loss": 0.4468, "step": 6570 }, { "epoch": 0.32153240978279457, "grad_norm": 1.248165249824524, "learning_rate": 8.57898449154192e-06, "loss": 0.448, "step": 6580 }, { "epoch": 0.32202106086149185, "grad_norm": 1.0610485076904297, "learning_rate": 8.573023698463689e-06, "loss": 0.4468, "step": 6590 }, { "epoch": 0.3225097119401891, "grad_norm": 3.7733728885650635, "learning_rate": 8.567052509760586e-06, "loss": 0.4538, "step": 6600 }, { "epoch": 0.32299836301888635, "grad_norm": 3.644801616668701, "learning_rate": 8.561070942805636e-06, "loss": 0.449, "step": 6610 }, { "epoch": 0.3234870140975836, "grad_norm": 0.774163544178009, "learning_rate": 8.555079015002063e-06, "loss": 0.4471, "step": 6620 }, { "epoch": 0.3239756651762809, "grad_norm": 1.7043198347091675, "learning_rate": 8.549076743783236e-06, "loss": 0.4474, "step": 6630 }, { "epoch": 0.3244643162549781, "grad_norm": 1.1995218992233276, "learning_rate": 8.543064146612612e-06, "loss": 0.4477, "step": 6640 }, { "epoch": 0.3249529673336754, "grad_norm": 1.5275466442108154, "learning_rate": 8.5370412409837e-06, "loss": 0.448, "step": 6650 }, { "epoch": 0.3254416184123726, "grad_norm": 0.8573246002197266, "learning_rate": 8.53100804441999e-06, "loss": 0.4474, "step": 6660 }, { "epoch": 0.3259302694910699, "grad_norm": 1.1308470964431763, "learning_rate": 8.524964574474925e-06, "loss": 0.4466, "step": 6670 }, { "epoch": 0.3264189205697672, "grad_norm": 1.240512728691101, "learning_rate": 8.51891084873183e-06, "loss": 0.4463, "step": 6680 }, { "epoch": 0.3269075716484644, "grad_norm": 2.6846487522125244, "learning_rate": 8.512846884803874e-06, "loss": 0.4476, "step": 6690 }, { "epoch": 0.3273962227271617, "grad_norm": 0.7580792307853699, "learning_rate": 8.506772700334008e-06, "loss": 0.4463, "step": 6700 }, { "epoch": 0.3278848738058589, "grad_norm": 0.49652209877967834, "learning_rate": 8.500688312994925e-06, "loss": 0.4471, "step": 6710 }, { "epoch": 0.3283735248845562, "grad_norm": 2.0272531509399414, "learning_rate": 8.494593740489e-06, "loss": 0.4465, "step": 6720 }, { "epoch": 0.32886217596325346, "grad_norm": 1.3837034702301025, "learning_rate": 8.488489000548244e-06, "loss": 0.4493, "step": 6730 }, { "epoch": 0.3293508270419507, "grad_norm": 1.1367080211639404, "learning_rate": 8.482374110934246e-06, "loss": 0.4474, "step": 6740 }, { "epoch": 0.32983947812064796, "grad_norm": 1.121301531791687, "learning_rate": 8.476249089438129e-06, "loss": 0.4459, "step": 6750 }, { "epoch": 0.33032812919934523, "grad_norm": 0.9756953120231628, "learning_rate": 8.470113953880493e-06, "loss": 0.4468, "step": 6760 }, { "epoch": 0.33081678027804245, "grad_norm": 1.3827910423278809, "learning_rate": 8.463968722111362e-06, "loss": 0.4473, "step": 6770 }, { "epoch": 0.33130543135673973, "grad_norm": 0.6767109632492065, "learning_rate": 8.45781341201014e-06, "loss": 0.447, "step": 6780 }, { "epoch": 0.33179408243543695, "grad_norm": 1.0480477809906006, "learning_rate": 8.451648041485551e-06, "loss": 0.4469, "step": 6790 }, { "epoch": 0.33228273351413423, "grad_norm": 1.5709936618804932, "learning_rate": 8.445472628475588e-06, "loss": 0.4471, "step": 6800 }, { "epoch": 0.3327713845928315, "grad_norm": 1.5795131921768188, "learning_rate": 8.439287190947464e-06, "loss": 0.447, "step": 6810 }, { "epoch": 0.33326003567152873, "grad_norm": 1.1700830459594727, "learning_rate": 8.433091746897559e-06, "loss": 0.4455, "step": 6820 }, { "epoch": 0.333748686750226, "grad_norm": 1.7184573411941528, "learning_rate": 8.426886314351363e-06, "loss": 0.4458, "step": 6830 }, { "epoch": 0.33423733782892323, "grad_norm": 0.4313448667526245, "learning_rate": 8.420670911363433e-06, "loss": 0.447, "step": 6840 }, { "epoch": 0.3347259889076205, "grad_norm": 1.0812926292419434, "learning_rate": 8.41444555601733e-06, "loss": 0.4456, "step": 6850 }, { "epoch": 0.3352146399863178, "grad_norm": 1.1345865726470947, "learning_rate": 8.40821026642557e-06, "loss": 0.447, "step": 6860 }, { "epoch": 0.335703291065015, "grad_norm": 0.6373735070228577, "learning_rate": 8.401965060729582e-06, "loss": 0.4451, "step": 6870 }, { "epoch": 0.3361919421437123, "grad_norm": 6.616238594055176, "learning_rate": 8.395709957099633e-06, "loss": 0.4475, "step": 6880 }, { "epoch": 0.33668059322240956, "grad_norm": 0.9826495051383972, "learning_rate": 8.389444973734797e-06, "loss": 0.4486, "step": 6890 }, { "epoch": 0.3371692443011068, "grad_norm": 1.7973625659942627, "learning_rate": 8.383170128862887e-06, "loss": 0.4473, "step": 6900 }, { "epoch": 0.33765789537980406, "grad_norm": 0.9026411175727844, "learning_rate": 8.376885440740414e-06, "loss": 0.4472, "step": 6910 }, { "epoch": 0.3381465464585013, "grad_norm": 0.9952638149261475, "learning_rate": 8.37059092765252e-06, "loss": 0.4461, "step": 6920 }, { "epoch": 0.33863519753719856, "grad_norm": 2.210338830947876, "learning_rate": 8.364286607912938e-06, "loss": 0.4487, "step": 6930 }, { "epoch": 0.33912384861589584, "grad_norm": 1.286643385887146, "learning_rate": 8.357972499863933e-06, "loss": 0.4469, "step": 6940 }, { "epoch": 0.33961249969459306, "grad_norm": 1.2331130504608154, "learning_rate": 8.351648621876248e-06, "loss": 0.4479, "step": 6950 }, { "epoch": 0.34010115077329034, "grad_norm": 0.7784949541091919, "learning_rate": 8.345314992349047e-06, "loss": 0.4468, "step": 6960 }, { "epoch": 0.34058980185198756, "grad_norm": 3.558990955352783, "learning_rate": 8.338971629709873e-06, "loss": 0.4455, "step": 6970 }, { "epoch": 0.34107845293068484, "grad_norm": 0.712576150894165, "learning_rate": 8.332618552414585e-06, "loss": 0.4461, "step": 6980 }, { "epoch": 0.3415671040093821, "grad_norm": 1.1077570915222168, "learning_rate": 8.326255778947303e-06, "loss": 0.4453, "step": 6990 }, { "epoch": 0.34205575508807934, "grad_norm": 1.3067269325256348, "learning_rate": 8.319883327820363e-06, "loss": 0.4462, "step": 7000 }, { "epoch": 0.34205575508807934, "eval_loss": 0.4191921055316925, "eval_runtime": 728.4719, "eval_samples_per_second": 242.848, "eval_steps_per_second": 0.475, "step": 7000 }, { "epoch": 0.3425444061667766, "grad_norm": 1.001678705215454, "learning_rate": 8.313501217574253e-06, "loss": 0.4465, "step": 7010 }, { "epoch": 0.3430330572454739, "grad_norm": 0.7304960489273071, "learning_rate": 8.307109466777567e-06, "loss": 0.4458, "step": 7020 }, { "epoch": 0.3435217083241711, "grad_norm": 0.7707636952400208, "learning_rate": 8.30070809402695e-06, "loss": 0.4441, "step": 7030 }, { "epoch": 0.3440103594028684, "grad_norm": 0.9046769142150879, "learning_rate": 8.294297117947035e-06, "loss": 0.4445, "step": 7040 }, { "epoch": 0.3444990104815656, "grad_norm": 0.8245752453804016, "learning_rate": 8.287876557190402e-06, "loss": 0.444, "step": 7050 }, { "epoch": 0.3449876615602629, "grad_norm": 1.746430516242981, "learning_rate": 8.281446430437516e-06, "loss": 0.4469, "step": 7060 }, { "epoch": 0.34547631263896017, "grad_norm": 1.3313848972320557, "learning_rate": 8.27500675639667e-06, "loss": 0.4473, "step": 7070 }, { "epoch": 0.3459649637176574, "grad_norm": 1.182501196861267, "learning_rate": 8.26855755380394e-06, "loss": 0.4453, "step": 7080 }, { "epoch": 0.34645361479635467, "grad_norm": 2.6568055152893066, "learning_rate": 8.262098841423126e-06, "loss": 0.4462, "step": 7090 }, { "epoch": 0.3469422658750519, "grad_norm": 1.4778715372085571, "learning_rate": 8.255630638045685e-06, "loss": 0.4463, "step": 7100 }, { "epoch": 0.34743091695374917, "grad_norm": 1.463995099067688, "learning_rate": 8.249152962490705e-06, "loss": 0.4468, "step": 7110 }, { "epoch": 0.34791956803244645, "grad_norm": 0.9242321848869324, "learning_rate": 8.242665833604818e-06, "loss": 0.446, "step": 7120 }, { "epoch": 0.34840821911114367, "grad_norm": 0.8648793697357178, "learning_rate": 8.236169270262168e-06, "loss": 0.4447, "step": 7130 }, { "epoch": 0.34889687018984095, "grad_norm": 0.7932630777359009, "learning_rate": 8.229663291364349e-06, "loss": 0.4458, "step": 7140 }, { "epoch": 0.3493855212685382, "grad_norm": 2.303868055343628, "learning_rate": 8.223147915840347e-06, "loss": 0.446, "step": 7150 }, { "epoch": 0.34987417234723545, "grad_norm": 0.47625330090522766, "learning_rate": 8.216623162646487e-06, "loss": 0.4469, "step": 7160 }, { "epoch": 0.3503628234259327, "grad_norm": 0.5169132947921753, "learning_rate": 8.210089050766374e-06, "loss": 0.4461, "step": 7170 }, { "epoch": 0.35085147450462995, "grad_norm": 1.1093195676803589, "learning_rate": 8.203545599210851e-06, "loss": 0.4457, "step": 7180 }, { "epoch": 0.3513401255833272, "grad_norm": 1.9182569980621338, "learning_rate": 8.19699282701793e-06, "loss": 0.4453, "step": 7190 }, { "epoch": 0.3518287766620245, "grad_norm": 0.5894930958747864, "learning_rate": 8.190430753252742e-06, "loss": 0.4462, "step": 7200 }, { "epoch": 0.3523174277407217, "grad_norm": 1.633952260017395, "learning_rate": 8.183859397007476e-06, "loss": 0.4446, "step": 7210 }, { "epoch": 0.352806078819419, "grad_norm": 1.9727741479873657, "learning_rate": 8.177278777401332e-06, "loss": 0.448, "step": 7220 }, { "epoch": 0.3532947298981163, "grad_norm": 1.4541544914245605, "learning_rate": 8.170688913580465e-06, "loss": 0.4474, "step": 7230 }, { "epoch": 0.3537833809768135, "grad_norm": 2.3945956230163574, "learning_rate": 8.16408982471792e-06, "loss": 0.4456, "step": 7240 }, { "epoch": 0.3542720320555108, "grad_norm": 0.821062445640564, "learning_rate": 8.157481530013586e-06, "loss": 0.4459, "step": 7250 }, { "epoch": 0.354760683134208, "grad_norm": 0.6615464687347412, "learning_rate": 8.150864048694132e-06, "loss": 0.4458, "step": 7260 }, { "epoch": 0.3552493342129053, "grad_norm": 0.6758638620376587, "learning_rate": 8.14423740001296e-06, "loss": 0.4441, "step": 7270 }, { "epoch": 0.35573798529160255, "grad_norm": 1.2416491508483887, "learning_rate": 8.137601603250139e-06, "loss": 0.4454, "step": 7280 }, { "epoch": 0.3562266363702998, "grad_norm": 0.828959584236145, "learning_rate": 8.13095667771236e-06, "loss": 0.4444, "step": 7290 }, { "epoch": 0.35671528744899705, "grad_norm": 0.5700317025184631, "learning_rate": 8.124302642732871e-06, "loss": 0.4459, "step": 7300 }, { "epoch": 0.3572039385276943, "grad_norm": 0.6910264492034912, "learning_rate": 8.117639517671421e-06, "loss": 0.4446, "step": 7310 }, { "epoch": 0.35769258960639155, "grad_norm": 1.0732626914978027, "learning_rate": 8.11096732191421e-06, "loss": 0.4457, "step": 7320 }, { "epoch": 0.35818124068508883, "grad_norm": 0.9882492423057556, "learning_rate": 8.10428607487383e-06, "loss": 0.445, "step": 7330 }, { "epoch": 0.35866989176378605, "grad_norm": 0.5441588163375854, "learning_rate": 8.097595795989203e-06, "loss": 0.4453, "step": 7340 }, { "epoch": 0.35915854284248333, "grad_norm": 0.8513416647911072, "learning_rate": 8.090896504725534e-06, "loss": 0.4455, "step": 7350 }, { "epoch": 0.3596471939211806, "grad_norm": 0.5936821103096008, "learning_rate": 8.084188220574244e-06, "loss": 0.444, "step": 7360 }, { "epoch": 0.36013584499987783, "grad_norm": 4.0613017082214355, "learning_rate": 8.077470963052922e-06, "loss": 0.447, "step": 7370 }, { "epoch": 0.3606244960785751, "grad_norm": 0.7625659704208374, "learning_rate": 8.070744751705267e-06, "loss": 0.4463, "step": 7380 }, { "epoch": 0.36111314715727233, "grad_norm": 0.8564379811286926, "learning_rate": 8.064009606101023e-06, "loss": 0.4452, "step": 7390 }, { "epoch": 0.3616017982359696, "grad_norm": 0.671668291091919, "learning_rate": 8.05726554583593e-06, "loss": 0.4458, "step": 7400 }, { "epoch": 0.3620904493146669, "grad_norm": 1.2709118127822876, "learning_rate": 8.050512590531669e-06, "loss": 0.4454, "step": 7410 }, { "epoch": 0.3625791003933641, "grad_norm": 0.7745212912559509, "learning_rate": 8.043750759835795e-06, "loss": 0.446, "step": 7420 }, { "epoch": 0.3630677514720614, "grad_norm": 0.7901990413665771, "learning_rate": 8.036980073421693e-06, "loss": 0.4444, "step": 7430 }, { "epoch": 0.3635564025507586, "grad_norm": 1.0258527994155884, "learning_rate": 8.030200550988505e-06, "loss": 0.4437, "step": 7440 }, { "epoch": 0.3640450536294559, "grad_norm": 1.6445204019546509, "learning_rate": 8.023412212261088e-06, "loss": 0.444, "step": 7450 }, { "epoch": 0.36453370470815316, "grad_norm": 1.1179972887039185, "learning_rate": 8.016615076989947e-06, "loss": 0.4449, "step": 7460 }, { "epoch": 0.3650223557868504, "grad_norm": 0.4461180567741394, "learning_rate": 8.009809164951176e-06, "loss": 0.4446, "step": 7470 }, { "epoch": 0.36551100686554766, "grad_norm": 0.6667689681053162, "learning_rate": 8.002994495946415e-06, "loss": 0.4443, "step": 7480 }, { "epoch": 0.36599965794424494, "grad_norm": 0.691374659538269, "learning_rate": 7.996171089802774e-06, "loss": 0.4445, "step": 7490 }, { "epoch": 0.36648830902294216, "grad_norm": 1.3462163209915161, "learning_rate": 7.989338966372787e-06, "loss": 0.4431, "step": 7500 }, { "epoch": 0.36648830902294216, "eval_loss": 0.4194032549858093, "eval_runtime": 728.4338, "eval_samples_per_second": 242.861, "eval_steps_per_second": 0.475, "step": 7500 }, { "epoch": 0.36697696010163944, "grad_norm": 1.0293834209442139, "learning_rate": 7.982498145534348e-06, "loss": 0.4454, "step": 7510 }, { "epoch": 0.36746561118033666, "grad_norm": 1.0880999565124512, "learning_rate": 7.97564864719066e-06, "loss": 0.4435, "step": 7520 }, { "epoch": 0.36795426225903394, "grad_norm": 3.1764519214630127, "learning_rate": 7.968790491270165e-06, "loss": 0.4451, "step": 7530 }, { "epoch": 0.3684429133377312, "grad_norm": 0.6520982980728149, "learning_rate": 7.961923697726506e-06, "loss": 0.4464, "step": 7540 }, { "epoch": 0.36893156441642844, "grad_norm": 1.566203236579895, "learning_rate": 7.955048286538448e-06, "loss": 0.4455, "step": 7550 }, { "epoch": 0.3694202154951257, "grad_norm": 1.396600365638733, "learning_rate": 7.948164277709831e-06, "loss": 0.4466, "step": 7560 }, { "epoch": 0.36990886657382294, "grad_norm": 39.281192779541016, "learning_rate": 7.941271691269511e-06, "loss": 0.4899, "step": 7570 }, { "epoch": 0.3703975176525202, "grad_norm": 2.0359652042388916, "learning_rate": 7.934370547271297e-06, "loss": 0.4587, "step": 7580 }, { "epoch": 0.3708861687312175, "grad_norm": 0.7175349593162537, "learning_rate": 7.9274608657939e-06, "loss": 0.4484, "step": 7590 }, { "epoch": 0.3713748198099147, "grad_norm": 1.1124777793884277, "learning_rate": 7.920542666940871e-06, "loss": 0.4465, "step": 7600 }, { "epoch": 0.371863470888612, "grad_norm": 1.0177866220474243, "learning_rate": 7.913615970840535e-06, "loss": 0.4447, "step": 7610 }, { "epoch": 0.37235212196730927, "grad_norm": 0.7671780586242676, "learning_rate": 7.90668079764595e-06, "loss": 0.4455, "step": 7620 }, { "epoch": 0.3728407730460065, "grad_norm": 1.171650767326355, "learning_rate": 7.899737167534827e-06, "loss": 0.4456, "step": 7630 }, { "epoch": 0.37332942412470377, "grad_norm": 0.5443609356880188, "learning_rate": 7.892785100709492e-06, "loss": 0.4461, "step": 7640 }, { "epoch": 0.373818075203401, "grad_norm": 1.2549580335617065, "learning_rate": 7.885824617396812e-06, "loss": 0.4451, "step": 7650 }, { "epoch": 0.37430672628209827, "grad_norm": 0.7662185430526733, "learning_rate": 7.878855737848139e-06, "loss": 0.4446, "step": 7660 }, { "epoch": 0.37479537736079555, "grad_norm": 1.3419959545135498, "learning_rate": 7.871878482339264e-06, "loss": 0.4468, "step": 7670 }, { "epoch": 0.37528402843949277, "grad_norm": 1.2521858215332031, "learning_rate": 7.864892871170335e-06, "loss": 0.4451, "step": 7680 }, { "epoch": 0.37577267951819004, "grad_norm": 2.5343024730682373, "learning_rate": 7.857898924665817e-06, "loss": 0.4458, "step": 7690 }, { "epoch": 0.37626133059688727, "grad_norm": 0.9986534118652344, "learning_rate": 7.85089666317443e-06, "loss": 0.4451, "step": 7700 }, { "epoch": 0.37674998167558454, "grad_norm": 0.8709741830825806, "learning_rate": 7.843886107069077e-06, "loss": 0.4439, "step": 7710 }, { "epoch": 0.3772386327542818, "grad_norm": 0.8361919522285461, "learning_rate": 7.836867276746805e-06, "loss": 0.4444, "step": 7720 }, { "epoch": 0.37772728383297904, "grad_norm": 1.1930742263793945, "learning_rate": 7.829840192628723e-06, "loss": 0.4461, "step": 7730 }, { "epoch": 0.3782159349116763, "grad_norm": 1.6097028255462646, "learning_rate": 7.822804875159962e-06, "loss": 0.4444, "step": 7740 }, { "epoch": 0.3787045859903736, "grad_norm": 0.6868306994438171, "learning_rate": 7.815761344809609e-06, "loss": 0.4457, "step": 7750 }, { "epoch": 0.3791932370690708, "grad_norm": 0.5000033974647522, "learning_rate": 7.808709622070639e-06, "loss": 0.4449, "step": 7760 }, { "epoch": 0.3796818881477681, "grad_norm": 0.3964043855667114, "learning_rate": 7.801649727459868e-06, "loss": 0.4439, "step": 7770 }, { "epoch": 0.3801705392264653, "grad_norm": 1.3012721538543701, "learning_rate": 7.794581681517886e-06, "loss": 0.4454, "step": 7780 }, { "epoch": 0.3806591903051626, "grad_norm": 0.6892145276069641, "learning_rate": 7.787505504808997e-06, "loss": 0.4456, "step": 7790 }, { "epoch": 0.3811478413838599, "grad_norm": 0.48608964681625366, "learning_rate": 7.780421217921169e-06, "loss": 0.4439, "step": 7800 }, { "epoch": 0.3816364924625571, "grad_norm": 0.7753750085830688, "learning_rate": 7.773328841465958e-06, "loss": 0.4438, "step": 7810 }, { "epoch": 0.3821251435412544, "grad_norm": 0.5739250183105469, "learning_rate": 7.766228396078458e-06, "loss": 0.4444, "step": 7820 }, { "epoch": 0.3826137946199516, "grad_norm": 0.6620212197303772, "learning_rate": 7.759119902417244e-06, "loss": 0.445, "step": 7830 }, { "epoch": 0.3831024456986489, "grad_norm": 0.5474065542221069, "learning_rate": 7.7520033811643e-06, "loss": 0.4436, "step": 7840 }, { "epoch": 0.38359109677734615, "grad_norm": 1.7903695106506348, "learning_rate": 7.744878853024976e-06, "loss": 0.444, "step": 7850 }, { "epoch": 0.3840797478560434, "grad_norm": 0.9528830051422119, "learning_rate": 7.737746338727908e-06, "loss": 0.4436, "step": 7860 }, { "epoch": 0.38456839893474065, "grad_norm": 0.9075807332992554, "learning_rate": 7.730605859024971e-06, "loss": 0.4433, "step": 7870 }, { "epoch": 0.38505705001343793, "grad_norm": 1.1544967889785767, "learning_rate": 7.723457434691216e-06, "loss": 0.4456, "step": 7880 }, { "epoch": 0.38554570109213515, "grad_norm": 1.7026115655899048, "learning_rate": 7.71630108652481e-06, "loss": 0.4458, "step": 7890 }, { "epoch": 0.38603435217083243, "grad_norm": 0.6825501918792725, "learning_rate": 7.709136835346973e-06, "loss": 0.4447, "step": 7900 }, { "epoch": 0.38652300324952965, "grad_norm": 1.6804189682006836, "learning_rate": 7.701964702001916e-06, "loss": 0.4446, "step": 7910 }, { "epoch": 0.38701165432822693, "grad_norm": 3.464137077331543, "learning_rate": 7.694784707356786e-06, "loss": 0.4467, "step": 7920 }, { "epoch": 0.3875003054069242, "grad_norm": 0.6467346549034119, "learning_rate": 7.687596872301603e-06, "loss": 0.4446, "step": 7930 }, { "epoch": 0.38798895648562143, "grad_norm": 1.6307556629180908, "learning_rate": 7.680401217749194e-06, "loss": 0.4454, "step": 7940 }, { "epoch": 0.3884776075643187, "grad_norm": 1.3172680139541626, "learning_rate": 7.67319776463514e-06, "loss": 0.447, "step": 7950 }, { "epoch": 0.3889662586430159, "grad_norm": 0.94371497631073, "learning_rate": 7.665986533917715e-06, "loss": 0.4443, "step": 7960 }, { "epoch": 0.3894549097217132, "grad_norm": 1.032759666442871, "learning_rate": 7.658767546577815e-06, "loss": 0.4435, "step": 7970 }, { "epoch": 0.3899435608004105, "grad_norm": 0.6555205583572388, "learning_rate": 7.651540823618906e-06, "loss": 0.4456, "step": 7980 }, { "epoch": 0.3904322118791077, "grad_norm": 0.8276070952415466, "learning_rate": 7.644306386066964e-06, "loss": 0.4437, "step": 7990 }, { "epoch": 0.390920862957805, "grad_norm": 0.9051567912101746, "learning_rate": 7.637064254970404e-06, "loss": 0.4439, "step": 8000 }, { "epoch": 0.390920862957805, "eval_loss": 0.41898027062416077, "eval_runtime": 729.9138, "eval_samples_per_second": 242.368, "eval_steps_per_second": 0.474, "step": 8000 }, { "epoch": 0.39140951403650226, "grad_norm": 0.7855016589164734, "learning_rate": 7.629814451400034e-06, "loss": 0.4434, "step": 8010 }, { "epoch": 0.3918981651151995, "grad_norm": 1.8473398685455322, "learning_rate": 7.622556996448973e-06, "loss": 0.4441, "step": 8020 }, { "epoch": 0.39238681619389676, "grad_norm": 1.2307816743850708, "learning_rate": 7.615291911232614e-06, "loss": 0.4426, "step": 8030 }, { "epoch": 0.392875467272594, "grad_norm": 0.9610106945037842, "learning_rate": 7.6080192168885436e-06, "loss": 0.4439, "step": 8040 }, { "epoch": 0.39336411835129126, "grad_norm": 0.8011897206306458, "learning_rate": 7.600738934576484e-06, "loss": 0.4424, "step": 8050 }, { "epoch": 0.39385276942998854, "grad_norm": 0.9333787560462952, "learning_rate": 7.593451085478243e-06, "loss": 0.443, "step": 8060 }, { "epoch": 0.39434142050868576, "grad_norm": 0.5144811868667603, "learning_rate": 7.586155690797636e-06, "loss": 0.4446, "step": 8070 }, { "epoch": 0.39483007158738304, "grad_norm": 1.6834224462509155, "learning_rate": 7.578852771760437e-06, "loss": 0.4443, "step": 8080 }, { "epoch": 0.39531872266608026, "grad_norm": 1.0620421171188354, "learning_rate": 7.571542349614307e-06, "loss": 0.4436, "step": 8090 }, { "epoch": 0.39580737374477754, "grad_norm": 0.8550513386726379, "learning_rate": 7.564224445628741e-06, "loss": 0.4439, "step": 8100 }, { "epoch": 0.3962960248234748, "grad_norm": 0.5044734477996826, "learning_rate": 7.556899081095004e-06, "loss": 0.4446, "step": 8110 }, { "epoch": 0.39678467590217203, "grad_norm": 0.8119836449623108, "learning_rate": 7.549566277326061e-06, "loss": 0.4438, "step": 8120 }, { "epoch": 0.3972733269808693, "grad_norm": 10.883358001708984, "learning_rate": 7.542226055656527e-06, "loss": 0.4461, "step": 8130 }, { "epoch": 0.3977619780595666, "grad_norm": 1.7727267742156982, "learning_rate": 7.534878437442597e-06, "loss": 0.4482, "step": 8140 }, { "epoch": 0.3982506291382638, "grad_norm": 1.0288087129592896, "learning_rate": 7.527523444061984e-06, "loss": 0.4443, "step": 8150 }, { "epoch": 0.3987392802169611, "grad_norm": 1.184952974319458, "learning_rate": 7.520161096913863e-06, "loss": 0.4466, "step": 8160 }, { "epoch": 0.3992279312956583, "grad_norm": 0.9457073211669922, "learning_rate": 7.512791417418802e-06, "loss": 0.4454, "step": 8170 }, { "epoch": 0.3997165823743556, "grad_norm": 0.771334171295166, "learning_rate": 7.505414427018704e-06, "loss": 0.445, "step": 8180 }, { "epoch": 0.40020523345305287, "grad_norm": 1.0723953247070312, "learning_rate": 7.4980301471767404e-06, "loss": 0.4449, "step": 8190 }, { "epoch": 0.4006938845317501, "grad_norm": 0.9210856556892395, "learning_rate": 7.490638599377291e-06, "loss": 0.4432, "step": 8200 }, { "epoch": 0.40118253561044737, "grad_norm": 0.8094615340232849, "learning_rate": 7.483239805125886e-06, "loss": 0.4443, "step": 8210 }, { "epoch": 0.40167118668914464, "grad_norm": 1.3815480470657349, "learning_rate": 7.475833785949134e-06, "loss": 0.4431, "step": 8220 }, { "epoch": 0.40215983776784187, "grad_norm": 1.4028229713439941, "learning_rate": 7.468420563394667e-06, "loss": 0.4449, "step": 8230 }, { "epoch": 0.40264848884653914, "grad_norm": 0.7880713939666748, "learning_rate": 7.461000159031073e-06, "loss": 0.4444, "step": 8240 }, { "epoch": 0.40313713992523637, "grad_norm": 0.573472797870636, "learning_rate": 7.45357259444784e-06, "loss": 0.4432, "step": 8250 }, { "epoch": 0.40362579100393364, "grad_norm": 1.1918740272521973, "learning_rate": 7.4461378912552806e-06, "loss": 0.4428, "step": 8260 }, { "epoch": 0.4041144420826309, "grad_norm": 0.6638442277908325, "learning_rate": 7.438696071084483e-06, "loss": 0.4447, "step": 8270 }, { "epoch": 0.40460309316132814, "grad_norm": 1.2030208110809326, "learning_rate": 7.431247155587243e-06, "loss": 0.4436, "step": 8280 }, { "epoch": 0.4050917442400254, "grad_norm": 0.3726930320262909, "learning_rate": 7.423791166435997e-06, "loss": 0.4433, "step": 8290 }, { "epoch": 0.40558039531872264, "grad_norm": 0.8080679178237915, "learning_rate": 7.4163281253237604e-06, "loss": 0.4437, "step": 8300 }, { "epoch": 0.4060690463974199, "grad_norm": 0.7469872832298279, "learning_rate": 7.40885805396407e-06, "loss": 0.4427, "step": 8310 }, { "epoch": 0.4065576974761172, "grad_norm": 1.38739812374115, "learning_rate": 7.4013809740909135e-06, "loss": 0.443, "step": 8320 }, { "epoch": 0.4070463485548144, "grad_norm": 0.823733389377594, "learning_rate": 7.393896907458674e-06, "loss": 0.4427, "step": 8330 }, { "epoch": 0.4075349996335117, "grad_norm": 0.47151875495910645, "learning_rate": 7.3864058758420595e-06, "loss": 0.445, "step": 8340 }, { "epoch": 0.408023650712209, "grad_norm": 0.34016215801239014, "learning_rate": 7.378907901036042e-06, "loss": 0.4437, "step": 8350 }, { "epoch": 0.4085123017909062, "grad_norm": 0.9797572493553162, "learning_rate": 7.3714030048557935e-06, "loss": 0.4431, "step": 8360 }, { "epoch": 0.4090009528696035, "grad_norm": 0.8803391456604004, "learning_rate": 7.363891209136631e-06, "loss": 0.4431, "step": 8370 }, { "epoch": 0.4094896039483007, "grad_norm": 0.9852266907691956, "learning_rate": 7.356372535733934e-06, "loss": 0.443, "step": 8380 }, { "epoch": 0.409978255026998, "grad_norm": 1.409609317779541, "learning_rate": 7.348847006523103e-06, "loss": 0.4447, "step": 8390 }, { "epoch": 0.41046690610569525, "grad_norm": 0.47717586159706116, "learning_rate": 7.341314643399479e-06, "loss": 0.4443, "step": 8400 }, { "epoch": 0.4109555571843925, "grad_norm": 0.3413306176662445, "learning_rate": 7.333775468278285e-06, "loss": 0.443, "step": 8410 }, { "epoch": 0.41144420826308975, "grad_norm": 0.5356876254081726, "learning_rate": 7.326229503094573e-06, "loss": 0.4429, "step": 8420 }, { "epoch": 0.41193285934178697, "grad_norm": 0.5036433339118958, "learning_rate": 7.318676769803137e-06, "loss": 0.4441, "step": 8430 }, { "epoch": 0.41242151042048425, "grad_norm": 0.9086324572563171, "learning_rate": 7.311117290378473e-06, "loss": 0.4431, "step": 8440 }, { "epoch": 0.4129101614991815, "grad_norm": 0.827485203742981, "learning_rate": 7.303551086814702e-06, "loss": 0.4428, "step": 8450 }, { "epoch": 0.41339881257787875, "grad_norm": 1.1920230388641357, "learning_rate": 7.295978181125503e-06, "loss": 0.445, "step": 8460 }, { "epoch": 0.413887463656576, "grad_norm": 0.9056548476219177, "learning_rate": 7.2883985953440636e-06, "loss": 0.4442, "step": 8470 }, { "epoch": 0.4143761147352733, "grad_norm": 0.5254775881767273, "learning_rate": 7.280812351523003e-06, "loss": 0.4432, "step": 8480 }, { "epoch": 0.4148647658139705, "grad_norm": 0.6151171922683716, "learning_rate": 7.27321947173431e-06, "loss": 0.4442, "step": 8490 }, { "epoch": 0.4153534168926678, "grad_norm": 0.3920780420303345, "learning_rate": 7.265619978069281e-06, "loss": 0.4432, "step": 8500 }, { "epoch": 0.4153534168926678, "eval_loss": 0.41748544573783875, "eval_runtime": 729.7588, "eval_samples_per_second": 242.42, "eval_steps_per_second": 0.474, "step": 8500 }, { "epoch": 0.415842067971365, "grad_norm": 0.5901490449905396, "learning_rate": 7.25801389263846e-06, "loss": 0.4442, "step": 8510 }, { "epoch": 0.4163307190500623, "grad_norm": 0.5799441337585449, "learning_rate": 7.2504012375715645e-06, "loss": 0.4427, "step": 8520 }, { "epoch": 0.4168193701287596, "grad_norm": 0.9592375755310059, "learning_rate": 7.242782035017428e-06, "loss": 0.4439, "step": 8530 }, { "epoch": 0.4173080212074568, "grad_norm": 0.6781924962997437, "learning_rate": 7.235156307143933e-06, "loss": 0.4429, "step": 8540 }, { "epoch": 0.4177966722861541, "grad_norm": 0.37766560912132263, "learning_rate": 7.2275240761379464e-06, "loss": 0.4422, "step": 8550 }, { "epoch": 0.4182853233648513, "grad_norm": 1.2287683486938477, "learning_rate": 7.2198853642052615e-06, "loss": 0.4426, "step": 8560 }, { "epoch": 0.4187739744435486, "grad_norm": 0.9670842289924622, "learning_rate": 7.212240193570519e-06, "loss": 0.4434, "step": 8570 }, { "epoch": 0.41926262552224586, "grad_norm": 0.5393080115318298, "learning_rate": 7.204588586477157e-06, "loss": 0.4433, "step": 8580 }, { "epoch": 0.4197512766009431, "grad_norm": 0.5459208488464355, "learning_rate": 7.196930565187341e-06, "loss": 0.4433, "step": 8590 }, { "epoch": 0.42023992767964036, "grad_norm": 0.8376490473747253, "learning_rate": 7.189266151981893e-06, "loss": 0.4424, "step": 8600 }, { "epoch": 0.42072857875833763, "grad_norm": 3.4486372470855713, "learning_rate": 7.181595369160237e-06, "loss": 0.4425, "step": 8610 }, { "epoch": 0.42121722983703486, "grad_norm": 2.3472955226898193, "learning_rate": 7.173918239040329e-06, "loss": 0.445, "step": 8620 }, { "epoch": 0.42170588091573213, "grad_norm": 2.3312840461730957, "learning_rate": 7.166234783958587e-06, "loss": 0.4447, "step": 8630 }, { "epoch": 0.42219453199442936, "grad_norm": 0.7450709342956543, "learning_rate": 7.158545026269838e-06, "loss": 0.4438, "step": 8640 }, { "epoch": 0.42268318307312663, "grad_norm": 1.204588532447815, "learning_rate": 7.150848988347244e-06, "loss": 0.4441, "step": 8650 }, { "epoch": 0.4231718341518239, "grad_norm": 0.7559615969657898, "learning_rate": 7.143146692582237e-06, "loss": 0.4423, "step": 8660 }, { "epoch": 0.42366048523052113, "grad_norm": 1.6019837856292725, "learning_rate": 7.135438161384458e-06, "loss": 0.4436, "step": 8670 }, { "epoch": 0.4241491363092184, "grad_norm": 1.278933048248291, "learning_rate": 7.127723417181691e-06, "loss": 0.4429, "step": 8680 }, { "epoch": 0.42463778738791563, "grad_norm": 0.6044679284095764, "learning_rate": 7.1200024824197945e-06, "loss": 0.442, "step": 8690 }, { "epoch": 0.4251264384666129, "grad_norm": 0.771743655204773, "learning_rate": 7.1122753795626385e-06, "loss": 0.4429, "step": 8700 }, { "epoch": 0.4256150895453102, "grad_norm": 1.0729281902313232, "learning_rate": 7.1045421310920386e-06, "loss": 0.4436, "step": 8710 }, { "epoch": 0.4261037406240074, "grad_norm": 0.48893994092941284, "learning_rate": 7.096802759507693e-06, "loss": 0.4427, "step": 8720 }, { "epoch": 0.4265923917027047, "grad_norm": 0.5487367510795593, "learning_rate": 7.0890572873271125e-06, "loss": 0.4435, "step": 8730 }, { "epoch": 0.42708104278140196, "grad_norm": 0.39890584349632263, "learning_rate": 7.08130573708556e-06, "loss": 0.4427, "step": 8740 }, { "epoch": 0.4275696938600992, "grad_norm": 0.437925785779953, "learning_rate": 7.07354813133598e-06, "loss": 0.4423, "step": 8750 }, { "epoch": 0.42805834493879646, "grad_norm": 1.0761085748672485, "learning_rate": 7.065784492648937e-06, "loss": 0.4447, "step": 8760 }, { "epoch": 0.4285469960174937, "grad_norm": 0.6409640312194824, "learning_rate": 7.058014843612546e-06, "loss": 0.4432, "step": 8770 }, { "epoch": 0.42903564709619096, "grad_norm": 0.8142459988594055, "learning_rate": 7.050239206832412e-06, "loss": 0.4431, "step": 8780 }, { "epoch": 0.42952429817488824, "grad_norm": 0.7957897782325745, "learning_rate": 7.042457604931558e-06, "loss": 0.4427, "step": 8790 }, { "epoch": 0.43001294925358546, "grad_norm": 0.8293124437332153, "learning_rate": 7.034670060550367e-06, "loss": 0.4425, "step": 8800 }, { "epoch": 0.43050160033228274, "grad_norm": 0.3750956654548645, "learning_rate": 7.026876596346505e-06, "loss": 0.4416, "step": 8810 }, { "epoch": 0.43099025141097996, "grad_norm": 0.755920946598053, "learning_rate": 7.019077234994865e-06, "loss": 0.443, "step": 8820 }, { "epoch": 0.43147890248967724, "grad_norm": 0.6560993194580078, "learning_rate": 7.0112719991875025e-06, "loss": 0.443, "step": 8830 }, { "epoch": 0.4319675535683745, "grad_norm": 0.3859688341617584, "learning_rate": 7.003460911633555e-06, "loss": 0.443, "step": 8840 }, { "epoch": 0.43245620464707174, "grad_norm": 0.6885735988616943, "learning_rate": 6.9956439950591915e-06, "loss": 0.4418, "step": 8850 }, { "epoch": 0.432944855725769, "grad_norm": 1.1823225021362305, "learning_rate": 6.98782127220754e-06, "loss": 0.4433, "step": 8860 }, { "epoch": 0.4334335068044663, "grad_norm": 0.9184996485710144, "learning_rate": 6.979992765838619e-06, "loss": 0.4439, "step": 8870 }, { "epoch": 0.4339221578831635, "grad_norm": 0.6856487989425659, "learning_rate": 6.97215849872928e-06, "loss": 0.4431, "step": 8880 }, { "epoch": 0.4344108089618608, "grad_norm": 0.4063749611377716, "learning_rate": 6.964318493673126e-06, "loss": 0.4435, "step": 8890 }, { "epoch": 0.434899460040558, "grad_norm": 1.1154191493988037, "learning_rate": 6.956472773480463e-06, "loss": 0.4435, "step": 8900 }, { "epoch": 0.4353881111192553, "grad_norm": 0.4631388485431671, "learning_rate": 6.948621360978221e-06, "loss": 0.4424, "step": 8910 }, { "epoch": 0.43587676219795257, "grad_norm": 0.6873944997787476, "learning_rate": 6.94076427900989e-06, "loss": 0.443, "step": 8920 }, { "epoch": 0.4363654132766498, "grad_norm": 0.37667331099510193, "learning_rate": 6.9329015504354605e-06, "loss": 0.4422, "step": 8930 }, { "epoch": 0.43685406435534707, "grad_norm": 1.4186402559280396, "learning_rate": 6.925033198131347e-06, "loss": 0.4428, "step": 8940 }, { "epoch": 0.4373427154340443, "grad_norm": 0.6768743395805359, "learning_rate": 6.917159244990328e-06, "loss": 0.443, "step": 8950 }, { "epoch": 0.43783136651274157, "grad_norm": 0.6607493162155151, "learning_rate": 6.909279713921477e-06, "loss": 0.4429, "step": 8960 }, { "epoch": 0.43832001759143885, "grad_norm": 1.2457571029663086, "learning_rate": 6.9013946278500964e-06, "loss": 0.4431, "step": 8970 }, { "epoch": 0.43880866867013607, "grad_norm": 0.506984531879425, "learning_rate": 6.89350400971765e-06, "loss": 0.444, "step": 8980 }, { "epoch": 0.43929731974883335, "grad_norm": 0.9251278638839722, "learning_rate": 6.885607882481699e-06, "loss": 0.4426, "step": 8990 }, { "epoch": 0.4397859708275306, "grad_norm": 1.2666517496109009, "learning_rate": 6.8777062691158335e-06, "loss": 0.4428, "step": 9000 }, { "epoch": 0.4397859708275306, "eval_loss": 0.4181945323944092, "eval_runtime": 729.4373, "eval_samples_per_second": 242.527, "eval_steps_per_second": 0.474, "step": 9000 }, { "epoch": 0.44027462190622785, "grad_norm": 0.909946620464325, "learning_rate": 6.869799192609602e-06, "loss": 0.4423, "step": 9010 }, { "epoch": 0.4407632729849251, "grad_norm": 0.6974407434463501, "learning_rate": 6.8618866759684496e-06, "loss": 0.4421, "step": 9020 }, { "epoch": 0.44125192406362235, "grad_norm": 1.4556212425231934, "learning_rate": 6.85396874221365e-06, "loss": 0.4421, "step": 9030 }, { "epoch": 0.4417405751423196, "grad_norm": 0.7077080607414246, "learning_rate": 6.846045414382237e-06, "loss": 0.4415, "step": 9040 }, { "epoch": 0.4422292262210169, "grad_norm": 1.2867698669433594, "learning_rate": 6.838116715526941e-06, "loss": 0.4431, "step": 9050 }, { "epoch": 0.4427178772997141, "grad_norm": 0.350985586643219, "learning_rate": 6.8301826687161135e-06, "loss": 0.4425, "step": 9060 }, { "epoch": 0.4432065283784114, "grad_norm": 0.9761406779289246, "learning_rate": 6.822243297033671e-06, "loss": 0.4415, "step": 9070 }, { "epoch": 0.4436951794571086, "grad_norm": 0.7296372652053833, "learning_rate": 6.814298623579021e-06, "loss": 0.4432, "step": 9080 }, { "epoch": 0.4441838305358059, "grad_norm": 0.8322256803512573, "learning_rate": 6.806348671466996e-06, "loss": 0.442, "step": 9090 }, { "epoch": 0.4446724816145032, "grad_norm": 0.6768003106117249, "learning_rate": 6.798393463827786e-06, "loss": 0.442, "step": 9100 }, { "epoch": 0.4451611326932004, "grad_norm": 0.9105594754219055, "learning_rate": 6.790433023806874e-06, "loss": 0.4426, "step": 9110 }, { "epoch": 0.4456497837718977, "grad_norm": 0.8735663890838623, "learning_rate": 6.782467374564964e-06, "loss": 0.4414, "step": 9120 }, { "epoch": 0.44613843485059496, "grad_norm": 0.4745177626609802, "learning_rate": 6.774496539277917e-06, "loss": 0.4428, "step": 9130 }, { "epoch": 0.4466270859292922, "grad_norm": 0.35364508628845215, "learning_rate": 6.766520541136684e-06, "loss": 0.4425, "step": 9140 }, { "epoch": 0.44711573700798946, "grad_norm": 1.5570448637008667, "learning_rate": 6.758539403347235e-06, "loss": 0.4423, "step": 9150 }, { "epoch": 0.4476043880866867, "grad_norm": 0.6677067279815674, "learning_rate": 6.750553149130498e-06, "loss": 0.4425, "step": 9160 }, { "epoch": 0.44809303916538396, "grad_norm": 0.5844752192497253, "learning_rate": 6.74256180172228e-06, "loss": 0.4427, "step": 9170 }, { "epoch": 0.44858169024408123, "grad_norm": 0.5263113379478455, "learning_rate": 6.734565384373211e-06, "loss": 0.4419, "step": 9180 }, { "epoch": 0.44907034132277845, "grad_norm": 0.7214266061782837, "learning_rate": 6.726563920348671e-06, "loss": 0.442, "step": 9190 }, { "epoch": 0.44955899240147573, "grad_norm": 1.2973275184631348, "learning_rate": 6.718557432928725e-06, "loss": 0.4428, "step": 9200 }, { "epoch": 0.450047643480173, "grad_norm": 1.9566432237625122, "learning_rate": 6.7105459454080535e-06, "loss": 0.4444, "step": 9210 }, { "epoch": 0.45053629455887023, "grad_norm": 1.5999767780303955, "learning_rate": 6.7025294810958785e-06, "loss": 0.4439, "step": 9220 }, { "epoch": 0.4510249456375675, "grad_norm": 1.2058864831924438, "learning_rate": 6.6945080633159096e-06, "loss": 0.4428, "step": 9230 }, { "epoch": 0.45151359671626473, "grad_norm": 0.682574987411499, "learning_rate": 6.686481715406264e-06, "loss": 0.442, "step": 9240 }, { "epoch": 0.452002247794962, "grad_norm": 0.6059571504592896, "learning_rate": 6.678450460719405e-06, "loss": 0.4428, "step": 9250 }, { "epoch": 0.4524908988736593, "grad_norm": 0.9549880027770996, "learning_rate": 6.670414322622072e-06, "loss": 0.4421, "step": 9260 }, { "epoch": 0.4529795499523565, "grad_norm": 0.7796644568443298, "learning_rate": 6.66237332449521e-06, "loss": 0.4428, "step": 9270 }, { "epoch": 0.4534682010310538, "grad_norm": 1.1869465112686157, "learning_rate": 6.6543274897339075e-06, "loss": 0.4439, "step": 9280 }, { "epoch": 0.453956852109751, "grad_norm": 4.104377269744873, "learning_rate": 6.6462768417473215e-06, "loss": 0.4455, "step": 9290 }, { "epoch": 0.4544455031884483, "grad_norm": 0.8395638465881348, "learning_rate": 6.638221403958616e-06, "loss": 0.443, "step": 9300 }, { "epoch": 0.45493415426714556, "grad_norm": 0.7057262659072876, "learning_rate": 6.63016119980489e-06, "loss": 0.443, "step": 9310 }, { "epoch": 0.4554228053458428, "grad_norm": 1.067874789237976, "learning_rate": 6.622096252737111e-06, "loss": 0.4434, "step": 9320 }, { "epoch": 0.45591145642454006, "grad_norm": 1.1366690397262573, "learning_rate": 6.614026586220043e-06, "loss": 0.4442, "step": 9330 }, { "epoch": 0.45640010750323734, "grad_norm": 0.8740336298942566, "learning_rate": 6.605952223732183e-06, "loss": 0.4419, "step": 9340 }, { "epoch": 0.45688875858193456, "grad_norm": 1.2686458826065063, "learning_rate": 6.597873188765693e-06, "loss": 0.4413, "step": 9350 }, { "epoch": 0.45737740966063184, "grad_norm": 0.4457259774208069, "learning_rate": 6.589789504826325e-06, "loss": 0.4421, "step": 9360 }, { "epoch": 0.45786606073932906, "grad_norm": 0.5987876057624817, "learning_rate": 6.581701195433358e-06, "loss": 0.4418, "step": 9370 }, { "epoch": 0.45835471181802634, "grad_norm": 0.430936336517334, "learning_rate": 6.573608284119536e-06, "loss": 0.4415, "step": 9380 }, { "epoch": 0.4588433628967236, "grad_norm": 0.9248373508453369, "learning_rate": 6.565510794430978e-06, "loss": 0.4408, "step": 9390 }, { "epoch": 0.45933201397542084, "grad_norm": 0.5061573386192322, "learning_rate": 6.557408749927139e-06, "loss": 0.4436, "step": 9400 }, { "epoch": 0.4598206650541181, "grad_norm": 0.6956728100776672, "learning_rate": 6.5493021741807125e-06, "loss": 0.4424, "step": 9410 }, { "epoch": 0.46030931613281534, "grad_norm": 0.5525333881378174, "learning_rate": 6.541191090777586e-06, "loss": 0.4419, "step": 9420 }, { "epoch": 0.4607979672115126, "grad_norm": 0.5926039218902588, "learning_rate": 6.5330755233167586e-06, "loss": 0.4417, "step": 9430 }, { "epoch": 0.4612866182902099, "grad_norm": 0.7355937361717224, "learning_rate": 6.524955495410271e-06, "loss": 0.441, "step": 9440 }, { "epoch": 0.4617752693689071, "grad_norm": 0.9713565111160278, "learning_rate": 6.516831030683148e-06, "loss": 0.4412, "step": 9450 }, { "epoch": 0.4622639204476044, "grad_norm": 1.2393561601638794, "learning_rate": 6.508702152773323e-06, "loss": 0.4418, "step": 9460 }, { "epoch": 0.46275257152630167, "grad_norm": 0.83049476146698, "learning_rate": 6.5005688853315615e-06, "loss": 0.4432, "step": 9470 }, { "epoch": 0.4632412226049989, "grad_norm": 0.4689672291278839, "learning_rate": 6.492431252021408e-06, "loss": 0.4425, "step": 9480 }, { "epoch": 0.46372987368369617, "grad_norm": 0.5514821410179138, "learning_rate": 6.484289276519109e-06, "loss": 0.442, "step": 9490 }, { "epoch": 0.4642185247623934, "grad_norm": 0.4042249321937561, "learning_rate": 6.47614298251354e-06, "loss": 0.442, "step": 9500 }, { "epoch": 0.4642185247623934, "eval_loss": 0.41554516553878784, "eval_runtime": 729.5945, "eval_samples_per_second": 242.474, "eval_steps_per_second": 0.474, "step": 9500 }, { "epoch": 0.46470717584109067, "grad_norm": 0.44334179162979126, "learning_rate": 6.467992393706147e-06, "loss": 0.4403, "step": 9510 }, { "epoch": 0.46519582691978795, "grad_norm": 0.49329543113708496, "learning_rate": 6.4598375338108656e-06, "loss": 0.4418, "step": 9520 }, { "epoch": 0.46568447799848517, "grad_norm": 0.5903816223144531, "learning_rate": 6.451678426554061e-06, "loss": 0.4409, "step": 9530 }, { "epoch": 0.46617312907718245, "grad_norm": 1.2968121767044067, "learning_rate": 6.443515095674456e-06, "loss": 0.443, "step": 9540 }, { "epoch": 0.46666178015587967, "grad_norm": 0.7769078612327576, "learning_rate": 6.435347564923062e-06, "loss": 0.4432, "step": 9550 }, { "epoch": 0.46715043123457695, "grad_norm": 0.8744146823883057, "learning_rate": 6.42717585806311e-06, "loss": 0.4411, "step": 9560 }, { "epoch": 0.4676390823132742, "grad_norm": 0.47742319107055664, "learning_rate": 6.418999998869982e-06, "loss": 0.4426, "step": 9570 }, { "epoch": 0.46812773339197145, "grad_norm": 0.4284425973892212, "learning_rate": 6.4108200111311355e-06, "loss": 0.4426, "step": 9580 }, { "epoch": 0.4686163844706687, "grad_norm": 0.37580737471580505, "learning_rate": 6.402635918646049e-06, "loss": 0.4425, "step": 9590 }, { "epoch": 0.469105035549366, "grad_norm": 0.3638119399547577, "learning_rate": 6.394447745226137e-06, "loss": 0.4411, "step": 9600 }, { "epoch": 0.4695936866280632, "grad_norm": 2.9128997325897217, "learning_rate": 6.386255514694688e-06, "loss": 0.4418, "step": 9610 }, { "epoch": 0.4700823377067605, "grad_norm": 0.9645544290542603, "learning_rate": 6.378059250886799e-06, "loss": 0.4419, "step": 9620 }, { "epoch": 0.4705709887854577, "grad_norm": 0.43301165103912354, "learning_rate": 6.369858977649297e-06, "loss": 0.4429, "step": 9630 }, { "epoch": 0.471059639864155, "grad_norm": 1.5179802179336548, "learning_rate": 6.361654718840675e-06, "loss": 0.4414, "step": 9640 }, { "epoch": 0.4715482909428523, "grad_norm": 0.3464379608631134, "learning_rate": 6.353446498331024e-06, "loss": 0.4428, "step": 9650 }, { "epoch": 0.4720369420215495, "grad_norm": 0.89571613073349, "learning_rate": 6.34523434000196e-06, "loss": 0.441, "step": 9660 }, { "epoch": 0.4725255931002468, "grad_norm": 0.6052807569503784, "learning_rate": 6.337018267746558e-06, "loss": 0.4412, "step": 9670 }, { "epoch": 0.473014244178944, "grad_norm": 1.2590041160583496, "learning_rate": 6.328798305469278e-06, "loss": 0.4415, "step": 9680 }, { "epoch": 0.4735028952576413, "grad_norm": 0.6158220171928406, "learning_rate": 6.3205744770858965e-06, "loss": 0.4419, "step": 9690 }, { "epoch": 0.47399154633633855, "grad_norm": 0.46361032128334045, "learning_rate": 6.312346806523444e-06, "loss": 0.4417, "step": 9700 }, { "epoch": 0.4744801974150358, "grad_norm": 1.2066395282745361, "learning_rate": 6.304115317720123e-06, "loss": 0.4415, "step": 9710 }, { "epoch": 0.47496884849373305, "grad_norm": 0.9531863331794739, "learning_rate": 6.295880034625251e-06, "loss": 0.4421, "step": 9720 }, { "epoch": 0.47545749957243033, "grad_norm": 0.3842741549015045, "learning_rate": 6.287640981199183e-06, "loss": 0.4412, "step": 9730 }, { "epoch": 0.47594615065112755, "grad_norm": 0.3795391023159027, "learning_rate": 6.27939818141324e-06, "loss": 0.4414, "step": 9740 }, { "epoch": 0.47643480172982483, "grad_norm": 0.5748067498207092, "learning_rate": 6.2711516592496455e-06, "loss": 0.4411, "step": 9750 }, { "epoch": 0.47692345280852205, "grad_norm": 0.7015309929847717, "learning_rate": 6.262901438701459e-06, "loss": 0.4417, "step": 9760 }, { "epoch": 0.47741210388721933, "grad_norm": 0.4260580539703369, "learning_rate": 6.254647543772489e-06, "loss": 0.4419, "step": 9770 }, { "epoch": 0.4779007549659166, "grad_norm": 0.9640613198280334, "learning_rate": 6.246389998477245e-06, "loss": 0.4405, "step": 9780 }, { "epoch": 0.47838940604461383, "grad_norm": 0.7557575106620789, "learning_rate": 6.23812882684085e-06, "loss": 0.4409, "step": 9790 }, { "epoch": 0.4788780571233111, "grad_norm": 1.2757539749145508, "learning_rate": 6.22986405289898e-06, "loss": 0.4421, "step": 9800 }, { "epoch": 0.47936670820200833, "grad_norm": 1.3108956813812256, "learning_rate": 6.221595700697794e-06, "loss": 0.4434, "step": 9810 }, { "epoch": 0.4798553592807056, "grad_norm": 0.7379423379898071, "learning_rate": 6.2133237942938594e-06, "loss": 0.4423, "step": 9820 }, { "epoch": 0.4803440103594029, "grad_norm": 0.6387200951576233, "learning_rate": 6.2050483577540845e-06, "loss": 0.4419, "step": 9830 }, { "epoch": 0.4808326614381001, "grad_norm": 1.4142051935195923, "learning_rate": 6.19676941515565e-06, "loss": 0.4422, "step": 9840 }, { "epoch": 0.4813213125167974, "grad_norm": 0.9402855038642883, "learning_rate": 6.188486990585936e-06, "loss": 0.4415, "step": 9850 }, { "epoch": 0.48180996359549466, "grad_norm": 1.5236409902572632, "learning_rate": 6.180201108142454e-06, "loss": 0.4409, "step": 9860 }, { "epoch": 0.4822986146741919, "grad_norm": 1.1364696025848389, "learning_rate": 6.171911791932774e-06, "loss": 0.4414, "step": 9870 }, { "epoch": 0.48278726575288916, "grad_norm": 0.48199930787086487, "learning_rate": 6.163619066074462e-06, "loss": 0.4403, "step": 9880 }, { "epoch": 0.4832759168315864, "grad_norm": 0.3505820333957672, "learning_rate": 6.1553229546949975e-06, "loss": 0.4394, "step": 9890 }, { "epoch": 0.48376456791028366, "grad_norm": 1.0344538688659668, "learning_rate": 6.147023481931716e-06, "loss": 0.4408, "step": 9900 }, { "epoch": 0.48425321898898094, "grad_norm": 0.39767566323280334, "learning_rate": 6.138720671931726e-06, "loss": 0.4408, "step": 9910 }, { "epoch": 0.48474187006767816, "grad_norm": 0.6819673180580139, "learning_rate": 6.130414548851854e-06, "loss": 0.4412, "step": 9920 }, { "epoch": 0.48523052114637544, "grad_norm": 1.247071623802185, "learning_rate": 6.122105136858558e-06, "loss": 0.4402, "step": 9930 }, { "epoch": 0.48571917222507266, "grad_norm": 1.1983033418655396, "learning_rate": 6.113792460127872e-06, "loss": 0.442, "step": 9940 }, { "epoch": 0.48620782330376994, "grad_norm": 0.9668486714363098, "learning_rate": 6.105476542845324e-06, "loss": 0.4421, "step": 9950 }, { "epoch": 0.4866964743824672, "grad_norm": 0.5192340016365051, "learning_rate": 6.097157409205867e-06, "loss": 0.4415, "step": 9960 }, { "epoch": 0.48718512546116444, "grad_norm": 1.4621678590774536, "learning_rate": 6.088835083413823e-06, "loss": 0.4413, "step": 9970 }, { "epoch": 0.4876737765398617, "grad_norm": 0.4883491098880768, "learning_rate": 6.080509589682793e-06, "loss": 0.4417, "step": 9980 }, { "epoch": 0.488162427618559, "grad_norm": 0.4201609194278717, "learning_rate": 6.072180952235593e-06, "loss": 0.4414, "step": 9990 }, { "epoch": 0.4886510786972562, "grad_norm": 0.8927011489868164, "learning_rate": 6.063849195304194e-06, "loss": 0.4404, "step": 10000 }, { "epoch": 0.4886510786972562, "eval_loss": 0.4168914556503296, "eval_runtime": 729.6388, "eval_samples_per_second": 242.46, "eval_steps_per_second": 0.474, "step": 10000 }, { "epoch": 0.4891397297759535, "grad_norm": 1.4013339281082153, "learning_rate": 6.055514343129638e-06, "loss": 0.4427, "step": 10010 }, { "epoch": 0.4896283808546507, "grad_norm": 0.5623286366462708, "learning_rate": 6.047176419961972e-06, "loss": 0.4414, "step": 10020 }, { "epoch": 0.490117031933348, "grad_norm": 0.5934634804725647, "learning_rate": 6.038835450060181e-06, "loss": 0.4419, "step": 10030 }, { "epoch": 0.49060568301204527, "grad_norm": 0.5222377181053162, "learning_rate": 6.030491457692108e-06, "loss": 0.4415, "step": 10040 }, { "epoch": 0.4910943340907425, "grad_norm": 0.4764785170555115, "learning_rate": 6.022144467134399e-06, "loss": 0.4407, "step": 10050 }, { "epoch": 0.49158298516943977, "grad_norm": 0.738297700881958, "learning_rate": 6.013794502672415e-06, "loss": 0.442, "step": 10060 }, { "epoch": 0.49207163624813705, "grad_norm": 5.993337631225586, "learning_rate": 6.005441588600176e-06, "loss": 0.4424, "step": 10070 }, { "epoch": 0.49256028732683427, "grad_norm": 2.3225927352905273, "learning_rate": 5.99708574922028e-06, "loss": 0.4487, "step": 10080 }, { "epoch": 0.49304893840553154, "grad_norm": 0.8819851875305176, "learning_rate": 5.988727008843834e-06, "loss": 0.443, "step": 10090 }, { "epoch": 0.49353758948422877, "grad_norm": 0.8179062008857727, "learning_rate": 5.980365391790392e-06, "loss": 0.4415, "step": 10100 }, { "epoch": 0.49402624056292604, "grad_norm": 1.1633132696151733, "learning_rate": 5.97200092238787e-06, "loss": 0.4415, "step": 10110 }, { "epoch": 0.4945148916416233, "grad_norm": 0.5630601048469543, "learning_rate": 5.963633624972491e-06, "loss": 0.4421, "step": 10120 }, { "epoch": 0.49500354272032054, "grad_norm": 0.95186847448349, "learning_rate": 5.955263523888699e-06, "loss": 0.4424, "step": 10130 }, { "epoch": 0.4954921937990178, "grad_norm": 0.9137486219406128, "learning_rate": 5.9468906434890995e-06, "loss": 0.4409, "step": 10140 }, { "epoch": 0.49598084487771504, "grad_norm": 0.5341358184814453, "learning_rate": 5.938515008134381e-06, "loss": 0.4407, "step": 10150 }, { "epoch": 0.4964694959564123, "grad_norm": 0.8407842516899109, "learning_rate": 5.9301366421932505e-06, "loss": 0.4404, "step": 10160 }, { "epoch": 0.4969581470351096, "grad_norm": 0.7001408338546753, "learning_rate": 5.921755570042358e-06, "loss": 0.4412, "step": 10170 }, { "epoch": 0.4974467981138068, "grad_norm": 0.8030371069908142, "learning_rate": 5.913371816066226e-06, "loss": 0.4415, "step": 10180 }, { "epoch": 0.4979354491925041, "grad_norm": 0.9030990600585938, "learning_rate": 5.904985404657187e-06, "loss": 0.4409, "step": 10190 }, { "epoch": 0.4984241002712014, "grad_norm": 1.0445612668991089, "learning_rate": 5.896596360215292e-06, "loss": 0.4419, "step": 10200 }, { "epoch": 0.4989127513498986, "grad_norm": 0.8249901533126831, "learning_rate": 5.888204707148263e-06, "loss": 0.4406, "step": 10210 }, { "epoch": 0.4994014024285959, "grad_norm": 0.4994339048862457, "learning_rate": 5.8798104698714095e-06, "loss": 0.4397, "step": 10220 }, { "epoch": 0.4998900535072931, "grad_norm": 0.5726603865623474, "learning_rate": 5.87141367280756e-06, "loss": 0.4403, "step": 10230 }, { "epoch": 0.5003787045859903, "grad_norm": 0.7047241926193237, "learning_rate": 5.863014340386988e-06, "loss": 0.4416, "step": 10240 }, { "epoch": 0.5008673556646877, "grad_norm": 0.730197012424469, "learning_rate": 5.854612497047347e-06, "loss": 0.4419, "step": 10250 }, { "epoch": 0.5013560067433849, "grad_norm": 0.6394559741020203, "learning_rate": 5.846208167233593e-06, "loss": 0.4407, "step": 10260 }, { "epoch": 0.5018446578220821, "grad_norm": 0.4507567882537842, "learning_rate": 5.837801375397916e-06, "loss": 0.4399, "step": 10270 }, { "epoch": 0.5023333089007794, "grad_norm": 0.6874068975448608, "learning_rate": 5.829392145999673e-06, "loss": 0.442, "step": 10280 }, { "epoch": 0.5028219599794767, "grad_norm": 0.48060235381126404, "learning_rate": 5.820980503505311e-06, "loss": 0.4397, "step": 10290 }, { "epoch": 0.5033106110581739, "grad_norm": 0.4969087541103363, "learning_rate": 5.812566472388298e-06, "loss": 0.4399, "step": 10300 }, { "epoch": 0.5037992621368712, "grad_norm": 0.8934044241905212, "learning_rate": 5.804150077129049e-06, "loss": 0.4406, "step": 10310 }, { "epoch": 0.5042879132155684, "grad_norm": 0.6583065390586853, "learning_rate": 5.795731342214861e-06, "loss": 0.4406, "step": 10320 }, { "epoch": 0.5047765642942657, "grad_norm": 0.7381777167320251, "learning_rate": 5.787310292139837e-06, "loss": 0.4414, "step": 10330 }, { "epoch": 0.505265215372963, "grad_norm": 0.5181640386581421, "learning_rate": 5.778886951404816e-06, "loss": 0.4409, "step": 10340 }, { "epoch": 0.5057538664516602, "grad_norm": 0.44236427545547485, "learning_rate": 5.770461344517302e-06, "loss": 0.4415, "step": 10350 }, { "epoch": 0.5062425175303574, "grad_norm": 0.40523165464401245, "learning_rate": 5.76203349599139e-06, "loss": 0.4404, "step": 10360 }, { "epoch": 0.5067311686090546, "grad_norm": 0.36556363105773926, "learning_rate": 5.753603430347699e-06, "loss": 0.443, "step": 10370 }, { "epoch": 0.507219819687752, "grad_norm": 0.3584481477737427, "learning_rate": 5.7451711721133e-06, "loss": 0.44, "step": 10380 }, { "epoch": 0.5077084707664492, "grad_norm": 0.5849773287773132, "learning_rate": 5.736736745821641e-06, "loss": 0.4398, "step": 10390 }, { "epoch": 0.5081971218451464, "grad_norm": 1.39704167842865, "learning_rate": 5.728300176012476e-06, "loss": 0.4406, "step": 10400 }, { "epoch": 0.5086857729238438, "grad_norm": 1.3421454429626465, "learning_rate": 5.719861487231802e-06, "loss": 0.4411, "step": 10410 }, { "epoch": 0.509174424002541, "grad_norm": 0.8897213935852051, "learning_rate": 5.711420704031774e-06, "loss": 0.4418, "step": 10420 }, { "epoch": 0.5096630750812382, "grad_norm": 0.8177825212478638, "learning_rate": 5.702977850970646e-06, "loss": 0.4414, "step": 10430 }, { "epoch": 0.5101517261599355, "grad_norm": 0.5944052934646606, "learning_rate": 5.694532952612692e-06, "loss": 0.4406, "step": 10440 }, { "epoch": 0.5106403772386328, "grad_norm": 0.48135659098625183, "learning_rate": 5.686086033528135e-06, "loss": 0.4409, "step": 10450 }, { "epoch": 0.51112902831733, "grad_norm": 0.6524203419685364, "learning_rate": 5.67763711829308e-06, "loss": 0.4413, "step": 10460 }, { "epoch": 0.5116176793960273, "grad_norm": 0.8007875084877014, "learning_rate": 5.66918623148944e-06, "loss": 0.4399, "step": 10470 }, { "epoch": 0.5121063304747245, "grad_norm": 0.9331921339035034, "learning_rate": 5.660733397704861e-06, "loss": 0.4407, "step": 10480 }, { "epoch": 0.5125949815534218, "grad_norm": 0.5154340863227844, "learning_rate": 5.652278641532657e-06, "loss": 0.4399, "step": 10490 }, { "epoch": 0.513083632632119, "grad_norm": 0.5443922877311707, "learning_rate": 5.643821987571732e-06, "loss": 0.4418, "step": 10500 }, { "epoch": 0.513083632632119, "eval_loss": 0.41731706261634827, "eval_runtime": 729.1332, "eval_samples_per_second": 242.628, "eval_steps_per_second": 0.475, "step": 10500 }, { "epoch": 0.5135722837108163, "grad_norm": 0.7409442067146301, "learning_rate": 5.635363460426516e-06, "loss": 0.4416, "step": 10510 }, { "epoch": 0.5140609347895135, "grad_norm": 0.5923414826393127, "learning_rate": 5.6269030847068855e-06, "loss": 0.4398, "step": 10520 }, { "epoch": 0.5145495858682108, "grad_norm": 0.4530554711818695, "learning_rate": 5.6184408850280955e-06, "loss": 0.4408, "step": 10530 }, { "epoch": 0.5150382369469081, "grad_norm": 0.49950364232063293, "learning_rate": 5.609976886010708e-06, "loss": 0.4409, "step": 10540 }, { "epoch": 0.5155268880256053, "grad_norm": 2.171323776245117, "learning_rate": 5.601511112280525e-06, "loss": 0.4396, "step": 10550 }, { "epoch": 0.5160155391043025, "grad_norm": 0.5502694249153137, "learning_rate": 5.593043588468502e-06, "loss": 0.4399, "step": 10560 }, { "epoch": 0.5165041901829999, "grad_norm": 0.3139466941356659, "learning_rate": 5.584574339210694e-06, "loss": 0.4405, "step": 10570 }, { "epoch": 0.5169928412616971, "grad_norm": 0.756894588470459, "learning_rate": 5.576103389148175e-06, "loss": 0.4401, "step": 10580 }, { "epoch": 0.5174814923403943, "grad_norm": 0.5437245965003967, "learning_rate": 5.567630762926967e-06, "loss": 0.4412, "step": 10590 }, { "epoch": 0.5179701434190916, "grad_norm": 0.796293318271637, "learning_rate": 5.559156485197967e-06, "loss": 0.441, "step": 10600 }, { "epoch": 0.5184587944977889, "grad_norm": 0.642201840877533, "learning_rate": 5.550680580616878e-06, "loss": 0.4412, "step": 10610 }, { "epoch": 0.5189474455764861, "grad_norm": 0.2663089632987976, "learning_rate": 5.542203073844139e-06, "loss": 0.441, "step": 10620 }, { "epoch": 0.5194360966551834, "grad_norm": 0.45160502195358276, "learning_rate": 5.533723989544844e-06, "loss": 0.4404, "step": 10630 }, { "epoch": 0.5199247477338806, "grad_norm": 0.4790808856487274, "learning_rate": 5.525243352388686e-06, "loss": 0.4402, "step": 10640 }, { "epoch": 0.5204133988125779, "grad_norm": 0.3323618471622467, "learning_rate": 5.5167611870498676e-06, "loss": 0.4398, "step": 10650 }, { "epoch": 0.5209020498912751, "grad_norm": 0.3828358054161072, "learning_rate": 5.508277518207042e-06, "loss": 0.4402, "step": 10660 }, { "epoch": 0.5213907009699724, "grad_norm": 4.394709587097168, "learning_rate": 5.499792370543236e-06, "loss": 0.4401, "step": 10670 }, { "epoch": 0.5218793520486696, "grad_norm": 0.34605780243873596, "learning_rate": 5.491305768745776e-06, "loss": 0.4409, "step": 10680 }, { "epoch": 0.5223680031273669, "grad_norm": 0.41763895750045776, "learning_rate": 5.4828177375062255e-06, "loss": 0.4398, "step": 10690 }, { "epoch": 0.5228566542060642, "grad_norm": 1.0943188667297363, "learning_rate": 5.474328301520302e-06, "loss": 0.4395, "step": 10700 }, { "epoch": 0.5233453052847614, "grad_norm": 0.8608265519142151, "learning_rate": 5.465837485487813e-06, "loss": 0.4413, "step": 10710 }, { "epoch": 0.5238339563634586, "grad_norm": 1.6863247156143188, "learning_rate": 5.457345314112577e-06, "loss": 0.4413, "step": 10720 }, { "epoch": 0.524322607442156, "grad_norm": 0.5766188502311707, "learning_rate": 5.448851812102357e-06, "loss": 0.4406, "step": 10730 }, { "epoch": 0.5248112585208532, "grad_norm": 0.84405517578125, "learning_rate": 5.440357004168795e-06, "loss": 0.441, "step": 10740 }, { "epoch": 0.5252999095995504, "grad_norm": 0.7851320505142212, "learning_rate": 5.431860915027321e-06, "loss": 0.4402, "step": 10750 }, { "epoch": 0.5257885606782478, "grad_norm": 0.4214421510696411, "learning_rate": 5.423363569397101e-06, "loss": 0.441, "step": 10760 }, { "epoch": 0.526277211756945, "grad_norm": 1.1546157598495483, "learning_rate": 5.4148649920009534e-06, "loss": 0.4394, "step": 10770 }, { "epoch": 0.5267658628356422, "grad_norm": 0.7156729102134705, "learning_rate": 5.4063652075652786e-06, "loss": 0.4404, "step": 10780 }, { "epoch": 0.5272545139143394, "grad_norm": 1.8909116983413696, "learning_rate": 5.3978642408199934e-06, "loss": 0.4409, "step": 10790 }, { "epoch": 0.5277431649930368, "grad_norm": 0.5709353685379028, "learning_rate": 5.3893621164984524e-06, "loss": 0.4403, "step": 10800 }, { "epoch": 0.528231816071734, "grad_norm": 0.8182409405708313, "learning_rate": 5.380858859337375e-06, "loss": 0.4404, "step": 10810 }, { "epoch": 0.5287204671504312, "grad_norm": 0.432432621717453, "learning_rate": 5.372354494076784e-06, "loss": 0.4402, "step": 10820 }, { "epoch": 0.5292091182291285, "grad_norm": 0.8491529226303101, "learning_rate": 5.363849045459918e-06, "loss": 0.44, "step": 10830 }, { "epoch": 0.5296977693078257, "grad_norm": 0.4220905900001526, "learning_rate": 5.355342538233172e-06, "loss": 0.4399, "step": 10840 }, { "epoch": 0.530186420386523, "grad_norm": 1.0726776123046875, "learning_rate": 5.346834997146023e-06, "loss": 0.44, "step": 10850 }, { "epoch": 0.5306750714652203, "grad_norm": 0.43123483657836914, "learning_rate": 5.3383264469509484e-06, "loss": 0.4411, "step": 10860 }, { "epoch": 0.5311637225439175, "grad_norm": 0.3041502833366394, "learning_rate": 5.32981691240337e-06, "loss": 0.4414, "step": 10870 }, { "epoch": 0.5316523736226147, "grad_norm": 0.7714064121246338, "learning_rate": 5.321306418261572e-06, "loss": 0.4402, "step": 10880 }, { "epoch": 0.5321410247013121, "grad_norm": 0.441977322101593, "learning_rate": 5.31279498928662e-06, "loss": 0.44, "step": 10890 }, { "epoch": 0.5326296757800093, "grad_norm": 1.5782145261764526, "learning_rate": 5.304282650242318e-06, "loss": 0.4406, "step": 10900 }, { "epoch": 0.5331183268587065, "grad_norm": 0.678400993347168, "learning_rate": 5.295769425895102e-06, "loss": 0.4412, "step": 10910 }, { "epoch": 0.5336069779374037, "grad_norm": 0.9773678183555603, "learning_rate": 5.28725534101399e-06, "loss": 0.4407, "step": 10920 }, { "epoch": 0.5340956290161011, "grad_norm": 0.6579413414001465, "learning_rate": 5.278740420370506e-06, "loss": 0.442, "step": 10930 }, { "epoch": 0.5345842800947983, "grad_norm": 0.760147213935852, "learning_rate": 5.2702246887386e-06, "loss": 0.4407, "step": 10940 }, { "epoch": 0.5350729311734955, "grad_norm": 0.9420449137687683, "learning_rate": 5.261708170894585e-06, "loss": 0.4395, "step": 10950 }, { "epoch": 0.5355615822521929, "grad_norm": 1.1415859460830688, "learning_rate": 5.253190891617063e-06, "loss": 0.4402, "step": 10960 }, { "epoch": 0.5360502333308901, "grad_norm": 0.4278971552848816, "learning_rate": 5.244672875686847e-06, "loss": 0.4405, "step": 10970 }, { "epoch": 0.5365388844095873, "grad_norm": 0.6837897300720215, "learning_rate": 5.236154147886896e-06, "loss": 0.4399, "step": 10980 }, { "epoch": 0.5370275354882846, "grad_norm": 0.7087698578834534, "learning_rate": 5.227634733002241e-06, "loss": 0.4397, "step": 10990 }, { "epoch": 0.5375161865669819, "grad_norm": 1.1717066764831543, "learning_rate": 5.219114655819909e-06, "loss": 0.4408, "step": 11000 }, { "epoch": 0.5375161865669819, "eval_loss": 0.4167872965335846, "eval_runtime": 729.0747, "eval_samples_per_second": 242.647, "eval_steps_per_second": 0.475, "step": 11000 }, { "epoch": 0.5380048376456791, "grad_norm": 0.7513532638549805, "learning_rate": 5.210593941128858e-06, "loss": 0.4408, "step": 11010 }, { "epoch": 0.5384934887243764, "grad_norm": 0.6454597115516663, "learning_rate": 5.202072613719895e-06, "loss": 0.4406, "step": 11020 }, { "epoch": 0.5389821398030736, "grad_norm": 0.459091454744339, "learning_rate": 5.193550698385616e-06, "loss": 0.4411, "step": 11030 }, { "epoch": 0.5394707908817709, "grad_norm": 0.40384477376937866, "learning_rate": 5.185028219920325e-06, "loss": 0.4406, "step": 11040 }, { "epoch": 0.5399594419604681, "grad_norm": 0.44627973437309265, "learning_rate": 5.1765052031199626e-06, "loss": 0.4393, "step": 11050 }, { "epoch": 0.5404480930391654, "grad_norm": 0.9470422267913818, "learning_rate": 5.167981672782038e-06, "loss": 0.4395, "step": 11060 }, { "epoch": 0.5409367441178626, "grad_norm": 0.968473494052887, "learning_rate": 5.1594576537055555e-06, "loss": 0.4401, "step": 11070 }, { "epoch": 0.5414253951965599, "grad_norm": 0.4251641631126404, "learning_rate": 5.150933170690936e-06, "loss": 0.439, "step": 11080 }, { "epoch": 0.5419140462752572, "grad_norm": 0.5823407173156738, "learning_rate": 5.142408248539956e-06, "loss": 0.4398, "step": 11090 }, { "epoch": 0.5424026973539544, "grad_norm": 0.7198439836502075, "learning_rate": 5.133882912055669e-06, "loss": 0.439, "step": 11100 }, { "epoch": 0.5428913484326516, "grad_norm": 0.8078601360321045, "learning_rate": 5.125357186042329e-06, "loss": 0.44, "step": 11110 }, { "epoch": 0.543379999511349, "grad_norm": 0.713046133518219, "learning_rate": 5.116831095305331e-06, "loss": 0.4398, "step": 11120 }, { "epoch": 0.5438686505900462, "grad_norm": 0.5632086396217346, "learning_rate": 5.108304664651123e-06, "loss": 0.4398, "step": 11130 }, { "epoch": 0.5443573016687434, "grad_norm": 1.3256471157073975, "learning_rate": 5.099777918887149e-06, "loss": 0.4396, "step": 11140 }, { "epoch": 0.5448459527474407, "grad_norm": 0.9530927538871765, "learning_rate": 5.0912508828217645e-06, "loss": 0.4389, "step": 11150 }, { "epoch": 0.545334603826138, "grad_norm": 2.566054582595825, "learning_rate": 5.082723581264174e-06, "loss": 0.44, "step": 11160 }, { "epoch": 0.5458232549048352, "grad_norm": 0.6221000552177429, "learning_rate": 5.074196039024351e-06, "loss": 0.4399, "step": 11170 }, { "epoch": 0.5463119059835324, "grad_norm": 0.5202614665031433, "learning_rate": 5.065668280912972e-06, "loss": 0.4394, "step": 11180 }, { "epoch": 0.5468005570622297, "grad_norm": 0.9228209257125854, "learning_rate": 5.057140331741337e-06, "loss": 0.4402, "step": 11190 }, { "epoch": 0.547289208140927, "grad_norm": 0.3940802216529846, "learning_rate": 5.048612216321311e-06, "loss": 0.4393, "step": 11200 }, { "epoch": 0.5477778592196242, "grad_norm": 1.3075381517410278, "learning_rate": 5.04008395946523e-06, "loss": 0.4407, "step": 11210 }, { "epoch": 0.5482665102983215, "grad_norm": 0.4319058060646057, "learning_rate": 5.031555585985852e-06, "loss": 0.4396, "step": 11220 }, { "epoch": 0.5487551613770187, "grad_norm": 0.9323760867118835, "learning_rate": 5.023027120696271e-06, "loss": 0.4395, "step": 11230 }, { "epoch": 0.549243812455716, "grad_norm": 0.726767361164093, "learning_rate": 5.014498588409847e-06, "loss": 0.4403, "step": 11240 }, { "epoch": 0.5497324635344133, "grad_norm": 0.6504103541374207, "learning_rate": 5.005970013940133e-06, "loss": 0.4397, "step": 11250 }, { "epoch": 0.5502211146131105, "grad_norm": 1.1144918203353882, "learning_rate": 4.9974414221008125e-06, "loss": 0.4412, "step": 11260 }, { "epoch": 0.5507097656918077, "grad_norm": 0.6615655422210693, "learning_rate": 4.98891283770561e-06, "loss": 0.4397, "step": 11270 }, { "epoch": 0.5511984167705051, "grad_norm": 0.5302955508232117, "learning_rate": 4.980384285568235e-06, "loss": 0.4395, "step": 11280 }, { "epoch": 0.5516870678492023, "grad_norm": 0.4470592439174652, "learning_rate": 4.9718557905023e-06, "loss": 0.4402, "step": 11290 }, { "epoch": 0.5521757189278995, "grad_norm": 0.4626651108264923, "learning_rate": 4.963327377321253e-06, "loss": 0.4382, "step": 11300 }, { "epoch": 0.5526643700065967, "grad_norm": 0.48710301518440247, "learning_rate": 4.954799070838304e-06, "loss": 0.4404, "step": 11310 }, { "epoch": 0.5531530210852941, "grad_norm": 0.31981727480888367, "learning_rate": 4.946270895866347e-06, "loss": 0.4391, "step": 11320 }, { "epoch": 0.5536416721639913, "grad_norm": 1.153678297996521, "learning_rate": 4.937742877217906e-06, "loss": 0.4403, "step": 11330 }, { "epoch": 0.5541303232426885, "grad_norm": 1.0284217596054077, "learning_rate": 4.929215039705035e-06, "loss": 0.4402, "step": 11340 }, { "epoch": 0.5546189743213858, "grad_norm": 0.7204963564872742, "learning_rate": 4.920687408139271e-06, "loss": 0.439, "step": 11350 }, { "epoch": 0.5551076254000831, "grad_norm": 0.6162496209144592, "learning_rate": 4.91216000733155e-06, "loss": 0.439, "step": 11360 }, { "epoch": 0.5555962764787803, "grad_norm": 0.5891590118408203, "learning_rate": 4.903632862092135e-06, "loss": 0.439, "step": 11370 }, { "epoch": 0.5560849275574776, "grad_norm": 0.5290629863739014, "learning_rate": 4.895105997230544e-06, "loss": 0.4407, "step": 11380 }, { "epoch": 0.5565735786361748, "grad_norm": 1.0910426378250122, "learning_rate": 4.886579437555484e-06, "loss": 0.4386, "step": 11390 }, { "epoch": 0.5570622297148721, "grad_norm": 0.4426107108592987, "learning_rate": 4.878053207874771e-06, "loss": 0.4393, "step": 11400 }, { "epoch": 0.5575508807935694, "grad_norm": 0.7471179366111755, "learning_rate": 4.8695273329952605e-06, "loss": 0.4396, "step": 11410 }, { "epoch": 0.5580395318722666, "grad_norm": 0.6447209119796753, "learning_rate": 4.861001837722775e-06, "loss": 0.4401, "step": 11420 }, { "epoch": 0.5585281829509638, "grad_norm": 0.42997971177101135, "learning_rate": 4.852476746862036e-06, "loss": 0.4389, "step": 11430 }, { "epoch": 0.5590168340296611, "grad_norm": 2.535978317260742, "learning_rate": 4.8439520852165874e-06, "loss": 0.4398, "step": 11440 }, { "epoch": 0.5595054851083584, "grad_norm": 0.5462396740913391, "learning_rate": 4.8354278775887215e-06, "loss": 0.4402, "step": 11450 }, { "epoch": 0.5599941361870556, "grad_norm": 0.6172703504562378, "learning_rate": 4.8269041487794115e-06, "loss": 0.4396, "step": 11460 }, { "epoch": 0.5604827872657528, "grad_norm": 0.6260773539543152, "learning_rate": 4.81838092358824e-06, "loss": 0.4387, "step": 11470 }, { "epoch": 0.5609714383444502, "grad_norm": 0.45732706785202026, "learning_rate": 4.809858226813317e-06, "loss": 0.4398, "step": 11480 }, { "epoch": 0.5614600894231474, "grad_norm": 0.5570266246795654, "learning_rate": 4.801336083251224e-06, "loss": 0.4393, "step": 11490 }, { "epoch": 0.5619487405018446, "grad_norm": 0.4119241535663605, "learning_rate": 4.792814517696927e-06, "loss": 0.4403, "step": 11500 }, { "epoch": 0.5619487405018446, "eval_loss": 0.414816677570343, "eval_runtime": 728.7625, "eval_samples_per_second": 242.751, "eval_steps_per_second": 0.475, "step": 11500 }, { "epoch": 0.562437391580542, "grad_norm": 0.9447699785232544, "learning_rate": 4.784293554943712e-06, "loss": 0.4389, "step": 11510 }, { "epoch": 0.5629260426592392, "grad_norm": 0.5308591723442078, "learning_rate": 4.775773219783112e-06, "loss": 0.4406, "step": 11520 }, { "epoch": 0.5634146937379364, "grad_norm": 1.3727697134017944, "learning_rate": 4.767253537004832e-06, "loss": 0.4401, "step": 11530 }, { "epoch": 0.5639033448166337, "grad_norm": 0.9330416321754456, "learning_rate": 4.7587345313966815e-06, "loss": 0.4406, "step": 11540 }, { "epoch": 0.564391995895331, "grad_norm": 0.32605278491973877, "learning_rate": 4.7502162277445e-06, "loss": 0.44, "step": 11550 }, { "epoch": 0.5648806469740282, "grad_norm": 0.7518082857131958, "learning_rate": 4.741698650832081e-06, "loss": 0.4393, "step": 11560 }, { "epoch": 0.5653692980527254, "grad_norm": 0.4452798068523407, "learning_rate": 4.7331818254411046e-06, "loss": 0.44, "step": 11570 }, { "epoch": 0.5658579491314227, "grad_norm": 0.46914970874786377, "learning_rate": 4.724665776351069e-06, "loss": 0.44, "step": 11580 }, { "epoch": 0.56634660021012, "grad_norm": 0.7172492146492004, "learning_rate": 4.716150528339208e-06, "loss": 0.4404, "step": 11590 }, { "epoch": 0.5668352512888172, "grad_norm": 0.5215288996696472, "learning_rate": 4.7076361061804264e-06, "loss": 0.4399, "step": 11600 }, { "epoch": 0.5673239023675145, "grad_norm": 0.5500718355178833, "learning_rate": 4.69912253464723e-06, "loss": 0.4399, "step": 11610 }, { "epoch": 0.5678125534462117, "grad_norm": 0.9018455147743225, "learning_rate": 4.690609838509642e-06, "loss": 0.4396, "step": 11620 }, { "epoch": 0.568301204524909, "grad_norm": 0.46901988983154297, "learning_rate": 4.682098042535145e-06, "loss": 0.4382, "step": 11630 }, { "epoch": 0.5687898556036063, "grad_norm": 1.1770741939544678, "learning_rate": 4.673587171488601e-06, "loss": 0.4402, "step": 11640 }, { "epoch": 0.5692785066823035, "grad_norm": 0.3521255552768707, "learning_rate": 4.665077250132183e-06, "loss": 0.4388, "step": 11650 }, { "epoch": 0.5697671577610007, "grad_norm": 0.4423331618309021, "learning_rate": 4.656568303225296e-06, "loss": 0.4402, "step": 11660 }, { "epoch": 0.5702558088396981, "grad_norm": 0.4402877390384674, "learning_rate": 4.648060355524512e-06, "loss": 0.4391, "step": 11670 }, { "epoch": 0.5707444599183953, "grad_norm": 0.3995070457458496, "learning_rate": 4.639553431783498e-06, "loss": 0.4404, "step": 11680 }, { "epoch": 0.5712331109970925, "grad_norm": 0.5264308452606201, "learning_rate": 4.63104755675294e-06, "loss": 0.4389, "step": 11690 }, { "epoch": 0.5717217620757897, "grad_norm": 0.28230753540992737, "learning_rate": 4.622542755180471e-06, "loss": 0.4389, "step": 11700 }, { "epoch": 0.5722104131544871, "grad_norm": 0.7925990223884583, "learning_rate": 4.6140390518106034e-06, "loss": 0.4395, "step": 11710 }, { "epoch": 0.5726990642331843, "grad_norm": 1.0194525718688965, "learning_rate": 4.605536471384656e-06, "loss": 0.44, "step": 11720 }, { "epoch": 0.5731877153118815, "grad_norm": 0.510903000831604, "learning_rate": 4.597035038640676e-06, "loss": 0.439, "step": 11730 }, { "epoch": 0.5736763663905788, "grad_norm": 0.42407867312431335, "learning_rate": 4.5885347783133725e-06, "loss": 0.4401, "step": 11740 }, { "epoch": 0.5741650174692761, "grad_norm": 0.5859852433204651, "learning_rate": 4.580035715134047e-06, "loss": 0.4381, "step": 11750 }, { "epoch": 0.5746536685479733, "grad_norm": 0.5147973895072937, "learning_rate": 4.571537873830515e-06, "loss": 0.4399, "step": 11760 }, { "epoch": 0.5751423196266706, "grad_norm": 0.6203701496124268, "learning_rate": 4.563041279127038e-06, "loss": 0.4389, "step": 11770 }, { "epoch": 0.5756309707053678, "grad_norm": 0.4585236608982086, "learning_rate": 4.554545955744247e-06, "loss": 0.4383, "step": 11780 }, { "epoch": 0.5761196217840651, "grad_norm": 0.41942375898361206, "learning_rate": 4.546051928399081e-06, "loss": 0.4386, "step": 11790 }, { "epoch": 0.5766082728627624, "grad_norm": 0.5585193037986755, "learning_rate": 4.537559221804703e-06, "loss": 0.4389, "step": 11800 }, { "epoch": 0.5770969239414596, "grad_norm": 0.4607734680175781, "learning_rate": 4.529067860670433e-06, "loss": 0.4388, "step": 11810 }, { "epoch": 0.5775855750201568, "grad_norm": 0.6180665493011475, "learning_rate": 4.520577869701679e-06, "loss": 0.4382, "step": 11820 }, { "epoch": 0.5780742260988541, "grad_norm": 0.7965272068977356, "learning_rate": 4.5120892735998636e-06, "loss": 0.4387, "step": 11830 }, { "epoch": 0.5785628771775514, "grad_norm": 0.37461355328559875, "learning_rate": 4.503602097062344e-06, "loss": 0.4395, "step": 11840 }, { "epoch": 0.5790515282562486, "grad_norm": 0.5917596817016602, "learning_rate": 4.4951163647823595e-06, "loss": 0.4385, "step": 11850 }, { "epoch": 0.5795401793349458, "grad_norm": 0.47392183542251587, "learning_rate": 4.486632101448935e-06, "loss": 0.4372, "step": 11860 }, { "epoch": 0.5800288304136432, "grad_norm": 0.43549230694770813, "learning_rate": 4.478149331746829e-06, "loss": 0.4387, "step": 11870 }, { "epoch": 0.5805174814923404, "grad_norm": 0.5697550177574158, "learning_rate": 4.469668080356451e-06, "loss": 0.4387, "step": 11880 }, { "epoch": 0.5810061325710376, "grad_norm": 0.3437957763671875, "learning_rate": 4.461188371953795e-06, "loss": 0.4388, "step": 11890 }, { "epoch": 0.581494783649735, "grad_norm": 1.4066935777664185, "learning_rate": 4.4527102312103624e-06, "loss": 0.4402, "step": 11900 }, { "epoch": 0.5819834347284322, "grad_norm": 0.5635364055633545, "learning_rate": 4.4442336827930995e-06, "loss": 0.4387, "step": 11910 }, { "epoch": 0.5824720858071294, "grad_norm": 0.42688384652137756, "learning_rate": 4.435758751364312e-06, "loss": 0.4408, "step": 11920 }, { "epoch": 0.5829607368858267, "grad_norm": 0.5010594725608826, "learning_rate": 4.427285461581609e-06, "loss": 0.4385, "step": 11930 }, { "epoch": 0.583449387964524, "grad_norm": 0.6035897135734558, "learning_rate": 4.418813838097815e-06, "loss": 0.4402, "step": 11940 }, { "epoch": 0.5839380390432212, "grad_norm": 0.7641412019729614, "learning_rate": 4.410343905560916e-06, "loss": 0.4391, "step": 11950 }, { "epoch": 0.5844266901219184, "grad_norm": 0.4700312614440918, "learning_rate": 4.401875688613971e-06, "loss": 0.4379, "step": 11960 }, { "epoch": 0.5849153412006157, "grad_norm": 0.9198450446128845, "learning_rate": 4.3934092118950485e-06, "loss": 0.4374, "step": 11970 }, { "epoch": 0.585403992279313, "grad_norm": 0.896514356136322, "learning_rate": 4.384944500037156e-06, "loss": 0.4384, "step": 11980 }, { "epoch": 0.5858926433580102, "grad_norm": 0.49591732025146484, "learning_rate": 4.376481577668167e-06, "loss": 0.44, "step": 11990 }, { "epoch": 0.5863812944367075, "grad_norm": 0.5625073909759521, "learning_rate": 4.368020469410742e-06, "loss": 0.4389, "step": 12000 }, { "epoch": 0.5863812944367075, "eval_loss": 0.41703999042510986, "eval_runtime": 727.8065, "eval_samples_per_second": 243.07, "eval_steps_per_second": 0.475, "step": 12000 }, { "epoch": 0.5868699455154047, "grad_norm": 0.6674771904945374, "learning_rate": 4.359561199882272e-06, "loss": 0.4393, "step": 12010 }, { "epoch": 0.587358596594102, "grad_norm": 0.5143821239471436, "learning_rate": 4.351103793694794e-06, "loss": 0.4375, "step": 12020 }, { "epoch": 0.5878472476727993, "grad_norm": 0.4788214862346649, "learning_rate": 4.342648275454922e-06, "loss": 0.4386, "step": 12030 }, { "epoch": 0.5883358987514965, "grad_norm": 0.5421459078788757, "learning_rate": 4.334194669763781e-06, "loss": 0.4386, "step": 12040 }, { "epoch": 0.5888245498301937, "grad_norm": 0.6345226168632507, "learning_rate": 4.325743001216926e-06, "loss": 0.4388, "step": 12050 }, { "epoch": 0.589313200908891, "grad_norm": 1.1048717498779297, "learning_rate": 4.317293294404285e-06, "loss": 0.44, "step": 12060 }, { "epoch": 0.5898018519875883, "grad_norm": 0.5707539916038513, "learning_rate": 4.308845573910071e-06, "loss": 0.4379, "step": 12070 }, { "epoch": 0.5902905030662855, "grad_norm": 0.7084303498268127, "learning_rate": 4.300399864312718e-06, "loss": 0.4388, "step": 12080 }, { "epoch": 0.5907791541449827, "grad_norm": 0.5199768543243408, "learning_rate": 4.291956190184811e-06, "loss": 0.4385, "step": 12090 }, { "epoch": 0.59126780522368, "grad_norm": 0.35853302478790283, "learning_rate": 4.283514576093015e-06, "loss": 0.44, "step": 12100 }, { "epoch": 0.5917564563023773, "grad_norm": 0.6634894609451294, "learning_rate": 4.275075046597997e-06, "loss": 0.4386, "step": 12110 }, { "epoch": 0.5922451073810745, "grad_norm": 0.3389874994754791, "learning_rate": 4.266637626254363e-06, "loss": 0.439, "step": 12120 }, { "epoch": 0.5927337584597718, "grad_norm": 0.38937532901763916, "learning_rate": 4.258202339610581e-06, "loss": 0.4389, "step": 12130 }, { "epoch": 0.593222409538469, "grad_norm": 0.47301584482192993, "learning_rate": 4.2497692112089086e-06, "loss": 0.4382, "step": 12140 }, { "epoch": 0.5937110606171663, "grad_norm": 0.4262164533138275, "learning_rate": 4.241338265585327e-06, "loss": 0.4384, "step": 12150 }, { "epoch": 0.5941997116958636, "grad_norm": 0.3946975767612457, "learning_rate": 4.232909527269465e-06, "loss": 0.4389, "step": 12160 }, { "epoch": 0.5946883627745608, "grad_norm": 0.30611652135849, "learning_rate": 4.2244830207845335e-06, "loss": 0.4384, "step": 12170 }, { "epoch": 0.595177013853258, "grad_norm": 0.5015509128570557, "learning_rate": 4.2160587706472445e-06, "loss": 0.4386, "step": 12180 }, { "epoch": 0.5956656649319554, "grad_norm": 2.779911518096924, "learning_rate": 4.207636801367746e-06, "loss": 0.4388, "step": 12190 }, { "epoch": 0.5961543160106526, "grad_norm": 0.940437912940979, "learning_rate": 4.199217137449553e-06, "loss": 0.4403, "step": 12200 }, { "epoch": 0.5966429670893498, "grad_norm": 1.1815273761749268, "learning_rate": 4.190799803389472e-06, "loss": 0.4384, "step": 12210 }, { "epoch": 0.597131618168047, "grad_norm": 0.4534102976322174, "learning_rate": 4.182384823677527e-06, "loss": 0.4385, "step": 12220 }, { "epoch": 0.5976202692467444, "grad_norm": 0.694245457649231, "learning_rate": 4.173972222796897e-06, "loss": 0.4382, "step": 12230 }, { "epoch": 0.5981089203254416, "grad_norm": 0.5328917503356934, "learning_rate": 4.165562025223839e-06, "loss": 0.4392, "step": 12240 }, { "epoch": 0.5985975714041388, "grad_norm": 0.7805267572402954, "learning_rate": 4.157154255427613e-06, "loss": 0.4394, "step": 12250 }, { "epoch": 0.5990862224828362, "grad_norm": 0.6959843635559082, "learning_rate": 4.148748937870425e-06, "loss": 0.4366, "step": 12260 }, { "epoch": 0.5995748735615334, "grad_norm": 0.9793679714202881, "learning_rate": 4.140346097007336e-06, "loss": 0.4383, "step": 12270 }, { "epoch": 0.6000635246402306, "grad_norm": 0.3973505198955536, "learning_rate": 4.1319457572862066e-06, "loss": 0.4394, "step": 12280 }, { "epoch": 0.6005521757189279, "grad_norm": 0.5687869191169739, "learning_rate": 4.123547943147621e-06, "loss": 0.4388, "step": 12290 }, { "epoch": 0.6010408267976252, "grad_norm": 0.4026346802711487, "learning_rate": 4.115152679024811e-06, "loss": 0.4391, "step": 12300 }, { "epoch": 0.6015294778763224, "grad_norm": 0.513808012008667, "learning_rate": 4.106759989343594e-06, "loss": 0.4381, "step": 12310 }, { "epoch": 0.6020181289550197, "grad_norm": 0.36706215143203735, "learning_rate": 4.0983698985222935e-06, "loss": 0.4384, "step": 12320 }, { "epoch": 0.6025067800337169, "grad_norm": 0.5302925705909729, "learning_rate": 4.089982430971673e-06, "loss": 0.4387, "step": 12330 }, { "epoch": 0.6029954311124142, "grad_norm": 0.6953673362731934, "learning_rate": 4.081597611094864e-06, "loss": 0.4385, "step": 12340 }, { "epoch": 0.6034840821911114, "grad_norm": 0.46951332688331604, "learning_rate": 4.073215463287296e-06, "loss": 0.4385, "step": 12350 }, { "epoch": 0.6039727332698087, "grad_norm": 0.30504921078681946, "learning_rate": 4.064836011936618e-06, "loss": 0.4378, "step": 12360 }, { "epoch": 0.6044613843485059, "grad_norm": 0.34291717410087585, "learning_rate": 4.056459281422644e-06, "loss": 0.4367, "step": 12370 }, { "epoch": 0.6049500354272032, "grad_norm": 0.3311258852481842, "learning_rate": 4.0480852961172635e-06, "loss": 0.4387, "step": 12380 }, { "epoch": 0.6054386865059005, "grad_norm": 0.48355287313461304, "learning_rate": 4.039714080384381e-06, "loss": 0.4385, "step": 12390 }, { "epoch": 0.6059273375845977, "grad_norm": 0.6378800868988037, "learning_rate": 4.031345658579846e-06, "loss": 0.438, "step": 12400 }, { "epoch": 0.6064159886632949, "grad_norm": 0.3167429566383362, "learning_rate": 4.022980055051372e-06, "loss": 0.4395, "step": 12410 }, { "epoch": 0.6069046397419923, "grad_norm": 1.2204922437667847, "learning_rate": 4.014617294138482e-06, "loss": 0.4394, "step": 12420 }, { "epoch": 0.6073932908206895, "grad_norm": 0.775138258934021, "learning_rate": 4.006257400172422e-06, "loss": 0.4393, "step": 12430 }, { "epoch": 0.6078819418993867, "grad_norm": 0.5826382637023926, "learning_rate": 3.9979003974760985e-06, "loss": 0.4379, "step": 12440 }, { "epoch": 0.608370592978084, "grad_norm": 0.5798311233520508, "learning_rate": 3.989546310364005e-06, "loss": 0.4379, "step": 12450 }, { "epoch": 0.6088592440567813, "grad_norm": 0.749792218208313, "learning_rate": 3.981195163142154e-06, "loss": 0.4379, "step": 12460 }, { "epoch": 0.6093478951354785, "grad_norm": 0.514415979385376, "learning_rate": 3.972846980108005e-06, "loss": 0.4391, "step": 12470 }, { "epoch": 0.6098365462141758, "grad_norm": 0.38157758116722107, "learning_rate": 3.964501785550392e-06, "loss": 0.4375, "step": 12480 }, { "epoch": 0.610325197292873, "grad_norm": 0.915421724319458, "learning_rate": 3.956159603749452e-06, "loss": 0.437, "step": 12490 }, { "epoch": 0.6108138483715703, "grad_norm": 0.5357415080070496, "learning_rate": 3.947820458976559e-06, "loss": 0.4388, "step": 12500 }, { "epoch": 0.6108138483715703, "eval_loss": 0.41651272773742676, "eval_runtime": 728.6136, "eval_samples_per_second": 242.801, "eval_steps_per_second": 0.475, "step": 12500 }, { "epoch": 0.6113024994502675, "grad_norm": 0.8443652391433716, "learning_rate": 3.939484375494252e-06, "loss": 0.4405, "step": 12510 }, { "epoch": 0.6117911505289648, "grad_norm": 0.604301929473877, "learning_rate": 3.931151377556159e-06, "loss": 0.4383, "step": 12520 }, { "epoch": 0.612279801607662, "grad_norm": 0.36815837025642395, "learning_rate": 3.922821489406935e-06, "loss": 0.4386, "step": 12530 }, { "epoch": 0.6127684526863593, "grad_norm": 0.6259467005729675, "learning_rate": 3.914494735282185e-06, "loss": 0.4392, "step": 12540 }, { "epoch": 0.6132571037650566, "grad_norm": 0.6359371542930603, "learning_rate": 3.9061711394083965e-06, "loss": 0.4392, "step": 12550 }, { "epoch": 0.6137457548437538, "grad_norm": 0.5041322112083435, "learning_rate": 3.897850726002864e-06, "loss": 0.4399, "step": 12560 }, { "epoch": 0.614234405922451, "grad_norm": 0.5972697138786316, "learning_rate": 3.889533519273633e-06, "loss": 0.4391, "step": 12570 }, { "epoch": 0.6147230570011484, "grad_norm": 0.7836823463439941, "learning_rate": 3.881219543419407e-06, "loss": 0.4387, "step": 12580 }, { "epoch": 0.6152117080798456, "grad_norm": 0.44390979409217834, "learning_rate": 3.8729088226294995e-06, "loss": 0.4384, "step": 12590 }, { "epoch": 0.6157003591585428, "grad_norm": 0.32042884826660156, "learning_rate": 3.8646013810837445e-06, "loss": 0.4379, "step": 12600 }, { "epoch": 0.6161890102372402, "grad_norm": 0.5421732664108276, "learning_rate": 3.856297242952442e-06, "loss": 0.4384, "step": 12610 }, { "epoch": 0.6166776613159374, "grad_norm": 0.5136971473693848, "learning_rate": 3.847996432396279e-06, "loss": 0.4371, "step": 12620 }, { "epoch": 0.6171663123946346, "grad_norm": 0.46279609203338623, "learning_rate": 3.839698973566258e-06, "loss": 0.4378, "step": 12630 }, { "epoch": 0.6176549634733318, "grad_norm": 0.7376791834831238, "learning_rate": 3.831404890603634e-06, "loss": 0.4381, "step": 12640 }, { "epoch": 0.6181436145520292, "grad_norm": 0.5303279757499695, "learning_rate": 3.823114207639838e-06, "loss": 0.4386, "step": 12650 }, { "epoch": 0.6186322656307264, "grad_norm": 0.7225260138511658, "learning_rate": 3.814826948796404e-06, "loss": 0.438, "step": 12660 }, { "epoch": 0.6191209167094236, "grad_norm": 0.8428411483764648, "learning_rate": 3.8065431381849178e-06, "loss": 0.4385, "step": 12670 }, { "epoch": 0.6196095677881209, "grad_norm": 0.40499812364578247, "learning_rate": 3.7982627999069164e-06, "loss": 0.4382, "step": 12680 }, { "epoch": 0.6200982188668182, "grad_norm": 0.44530633091926575, "learning_rate": 3.7899859580538436e-06, "loss": 0.4386, "step": 12690 }, { "epoch": 0.6205868699455154, "grad_norm": 0.4268031418323517, "learning_rate": 3.7817126367069674e-06, "loss": 0.4374, "step": 12700 }, { "epoch": 0.6210755210242127, "grad_norm": 0.2745535373687744, "learning_rate": 3.773442859937313e-06, "loss": 0.4383, "step": 12710 }, { "epoch": 0.6215641721029099, "grad_norm": 0.5120725035667419, "learning_rate": 3.765176651805593e-06, "loss": 0.4383, "step": 12720 }, { "epoch": 0.6220528231816072, "grad_norm": 0.3301103413105011, "learning_rate": 3.7569140363621393e-06, "loss": 0.4384, "step": 12730 }, { "epoch": 0.6225414742603045, "grad_norm": 0.34257206320762634, "learning_rate": 3.7486550376468266e-06, "loss": 0.4366, "step": 12740 }, { "epoch": 0.6230301253390017, "grad_norm": 0.37387409806251526, "learning_rate": 3.7403996796890096e-06, "loss": 0.4381, "step": 12750 }, { "epoch": 0.6235187764176989, "grad_norm": 0.5832339525222778, "learning_rate": 3.732147986507453e-06, "loss": 0.4389, "step": 12760 }, { "epoch": 0.6240074274963961, "grad_norm": 0.39319491386413574, "learning_rate": 3.723899982110249e-06, "loss": 0.4379, "step": 12770 }, { "epoch": 0.6244960785750935, "grad_norm": 1.1208192110061646, "learning_rate": 3.7156556904947725e-06, "loss": 0.4374, "step": 12780 }, { "epoch": 0.6249847296537907, "grad_norm": 1.2163150310516357, "learning_rate": 3.7074151356475828e-06, "loss": 0.4386, "step": 12790 }, { "epoch": 0.6254733807324879, "grad_norm": 0.5749249458312988, "learning_rate": 3.6991783415443726e-06, "loss": 0.4376, "step": 12800 }, { "epoch": 0.6259620318111853, "grad_norm": 0.3662860095500946, "learning_rate": 3.6909453321498954e-06, "loss": 0.4387, "step": 12810 }, { "epoch": 0.6264506828898825, "grad_norm": 0.7711629271507263, "learning_rate": 3.682716131417887e-06, "loss": 0.4387, "step": 12820 }, { "epoch": 0.6269393339685797, "grad_norm": 0.4106141924858093, "learning_rate": 3.6744907632910064e-06, "loss": 0.4376, "step": 12830 }, { "epoch": 0.627427985047277, "grad_norm": 0.8427706956863403, "learning_rate": 3.6662692517007613e-06, "loss": 0.4376, "step": 12840 }, { "epoch": 0.6279166361259743, "grad_norm": 0.4671982526779175, "learning_rate": 3.6580516205674367e-06, "loss": 0.4375, "step": 12850 }, { "epoch": 0.6284052872046715, "grad_norm": 0.643839955329895, "learning_rate": 3.64983789380003e-06, "loss": 0.4382, "step": 12860 }, { "epoch": 0.6288939382833688, "grad_norm": 0.3143644630908966, "learning_rate": 3.6416280952961756e-06, "loss": 0.4378, "step": 12870 }, { "epoch": 0.629382589362066, "grad_norm": 0.5174784064292908, "learning_rate": 3.6334222489420845e-06, "loss": 0.4386, "step": 12880 }, { "epoch": 0.6298712404407633, "grad_norm": 0.35816308856010437, "learning_rate": 3.625220378612465e-06, "loss": 0.4381, "step": 12890 }, { "epoch": 0.6303598915194605, "grad_norm": 0.4106110632419586, "learning_rate": 3.617022508170456e-06, "loss": 0.4372, "step": 12900 }, { "epoch": 0.6308485425981578, "grad_norm": 1.5037391185760498, "learning_rate": 3.608828661467561e-06, "loss": 0.4366, "step": 12910 }, { "epoch": 0.631337193676855, "grad_norm": 0.6403370499610901, "learning_rate": 3.6006388623435778e-06, "loss": 0.4373, "step": 12920 }, { "epoch": 0.6318258447555523, "grad_norm": 0.4930186867713928, "learning_rate": 3.5924531346265235e-06, "loss": 0.4379, "step": 12930 }, { "epoch": 0.6323144958342496, "grad_norm": 0.3067891001701355, "learning_rate": 3.5842715021325745e-06, "loss": 0.4368, "step": 12940 }, { "epoch": 0.6328031469129468, "grad_norm": 0.7694682478904724, "learning_rate": 3.5760939886659896e-06, "loss": 0.438, "step": 12950 }, { "epoch": 0.633291797991644, "grad_norm": 0.5115815997123718, "learning_rate": 3.567920618019043e-06, "loss": 0.4377, "step": 12960 }, { "epoch": 0.6337804490703414, "grad_norm": 0.6964974999427795, "learning_rate": 3.559751413971955e-06, "loss": 0.4375, "step": 12970 }, { "epoch": 0.6342691001490386, "grad_norm": 0.5830658078193665, "learning_rate": 3.551586400292828e-06, "loss": 0.4381, "step": 12980 }, { "epoch": 0.6347577512277358, "grad_norm": 0.8513720035552979, "learning_rate": 3.5434256007375666e-06, "loss": 0.4376, "step": 12990 }, { "epoch": 0.6352464023064331, "grad_norm": 0.4420766234397888, "learning_rate": 3.535269039049819e-06, "loss": 0.436, "step": 13000 }, { "epoch": 0.6352464023064331, "eval_loss": 0.41433966159820557, "eval_runtime": 729.8346, "eval_samples_per_second": 242.395, "eval_steps_per_second": 0.474, "step": 13000 }, { "epoch": 0.6357350533851304, "grad_norm": 1.997313380241394, "learning_rate": 3.5271167389608996e-06, "loss": 0.4376, "step": 13010 }, { "epoch": 0.6362237044638276, "grad_norm": 0.5262558460235596, "learning_rate": 3.518968724189727e-06, "loss": 0.439, "step": 13020 }, { "epoch": 0.6367123555425248, "grad_norm": 0.7942774295806885, "learning_rate": 3.5108250184427507e-06, "loss": 0.4368, "step": 13030 }, { "epoch": 0.6372010066212221, "grad_norm": 0.44571954011917114, "learning_rate": 3.50268564541388e-06, "loss": 0.4386, "step": 13040 }, { "epoch": 0.6376896576999194, "grad_norm": 0.3043385148048401, "learning_rate": 3.4945506287844245e-06, "loss": 0.4377, "step": 13050 }, { "epoch": 0.6381783087786166, "grad_norm": 0.41446077823638916, "learning_rate": 3.4864199922230156e-06, "loss": 0.4376, "step": 13060 }, { "epoch": 0.6386669598573139, "grad_norm": 0.361817866563797, "learning_rate": 3.4782937593855386e-06, "loss": 0.4368, "step": 13070 }, { "epoch": 0.6391556109360111, "grad_norm": 0.2429763674736023, "learning_rate": 3.4701719539150692e-06, "loss": 0.4384, "step": 13080 }, { "epoch": 0.6396442620147084, "grad_norm": 0.6479557156562805, "learning_rate": 3.4620545994418044e-06, "loss": 0.4369, "step": 13090 }, { "epoch": 0.6401329130934057, "grad_norm": 0.2731977105140686, "learning_rate": 3.453941719582985e-06, "loss": 0.4367, "step": 13100 }, { "epoch": 0.6406215641721029, "grad_norm": 0.3237663209438324, "learning_rate": 3.445833337942838e-06, "loss": 0.4389, "step": 13110 }, { "epoch": 0.6411102152508001, "grad_norm": 0.6228281855583191, "learning_rate": 3.4377294781124997e-06, "loss": 0.4361, "step": 13120 }, { "epoch": 0.6415988663294975, "grad_norm": 0.7347028255462646, "learning_rate": 3.4296301636699527e-06, "loss": 0.4378, "step": 13130 }, { "epoch": 0.6420875174081947, "grad_norm": 0.8885689377784729, "learning_rate": 3.421535418179953e-06, "loss": 0.4379, "step": 13140 }, { "epoch": 0.6425761684868919, "grad_norm": 0.7151497602462769, "learning_rate": 3.413445265193964e-06, "loss": 0.4373, "step": 13150 }, { "epoch": 0.6430648195655891, "grad_norm": 0.46183907985687256, "learning_rate": 3.4053597282500882e-06, "loss": 0.4373, "step": 13160 }, { "epoch": 0.6435534706442865, "grad_norm": 0.7960475087165833, "learning_rate": 3.397278830872998e-06, "loss": 0.4358, "step": 13170 }, { "epoch": 0.6440421217229837, "grad_norm": 0.5535709261894226, "learning_rate": 3.3892025965738616e-06, "loss": 0.4373, "step": 13180 }, { "epoch": 0.6445307728016809, "grad_norm": 0.8837286829948425, "learning_rate": 3.3811310488502924e-06, "loss": 0.4372, "step": 13190 }, { "epoch": 0.6450194238803783, "grad_norm": 0.5701731443405151, "learning_rate": 3.3730642111862543e-06, "loss": 0.4381, "step": 13200 }, { "epoch": 0.6455080749590755, "grad_norm": 0.449485182762146, "learning_rate": 3.365002107052017e-06, "loss": 0.4381, "step": 13210 }, { "epoch": 0.6459967260377727, "grad_norm": 0.43208661675453186, "learning_rate": 3.356944759904075e-06, "loss": 0.4387, "step": 13220 }, { "epoch": 0.64648537711647, "grad_norm": 0.5452390313148499, "learning_rate": 3.3488921931850794e-06, "loss": 0.4374, "step": 13230 }, { "epoch": 0.6469740281951673, "grad_norm": 0.37224072217941284, "learning_rate": 3.3408444303237786e-06, "loss": 0.4376, "step": 13240 }, { "epoch": 0.6474626792738645, "grad_norm": 0.6517390608787537, "learning_rate": 3.3328014947349406e-06, "loss": 0.4377, "step": 13250 }, { "epoch": 0.6479513303525618, "grad_norm": 0.3955247402191162, "learning_rate": 3.3247634098192884e-06, "loss": 0.4388, "step": 13260 }, { "epoch": 0.648439981431259, "grad_norm": 0.3447047770023346, "learning_rate": 3.316730198963433e-06, "loss": 0.4377, "step": 13270 }, { "epoch": 0.6489286325099562, "grad_norm": 0.8046542406082153, "learning_rate": 3.3087018855398045e-06, "loss": 0.4374, "step": 13280 }, { "epoch": 0.6494172835886535, "grad_norm": 0.5053970217704773, "learning_rate": 3.300678492906586e-06, "loss": 0.4377, "step": 13290 }, { "epoch": 0.6499059346673508, "grad_norm": 1.129328727722168, "learning_rate": 3.292660044407642e-06, "loss": 0.4373, "step": 13300 }, { "epoch": 0.650394585746048, "grad_norm": 1.0235140323638916, "learning_rate": 3.2846465633724487e-06, "loss": 0.438, "step": 13310 }, { "epoch": 0.6508832368247452, "grad_norm": 1.2467355728149414, "learning_rate": 3.2766380731160342e-06, "loss": 0.4376, "step": 13320 }, { "epoch": 0.6513718879034426, "grad_norm": 0.42103078961372375, "learning_rate": 3.268634596938906e-06, "loss": 0.4369, "step": 13330 }, { "epoch": 0.6518605389821398, "grad_norm": 0.6862124800682068, "learning_rate": 3.26063615812698e-06, "loss": 0.4384, "step": 13340 }, { "epoch": 0.652349190060837, "grad_norm": 0.4259004294872284, "learning_rate": 3.252642779951518e-06, "loss": 0.4385, "step": 13350 }, { "epoch": 0.6528378411395344, "grad_norm": 0.3901737630367279, "learning_rate": 3.2446544856690595e-06, "loss": 0.4375, "step": 13360 }, { "epoch": 0.6533264922182316, "grad_norm": 0.5543071627616882, "learning_rate": 3.236671298521349e-06, "loss": 0.4373, "step": 13370 }, { "epoch": 0.6538151432969288, "grad_norm": 0.4534682333469391, "learning_rate": 3.228693241735274e-06, "loss": 0.4379, "step": 13380 }, { "epoch": 0.6543037943756261, "grad_norm": 0.25920426845550537, "learning_rate": 3.220720338522795e-06, "loss": 0.4371, "step": 13390 }, { "epoch": 0.6547924454543234, "grad_norm": 0.3495095372200012, "learning_rate": 3.2127526120808807e-06, "loss": 0.4386, "step": 13400 }, { "epoch": 0.6552810965330206, "grad_norm": 0.5815818309783936, "learning_rate": 3.204790085591435e-06, "loss": 0.4386, "step": 13410 }, { "epoch": 0.6557697476117178, "grad_norm": 0.48536261916160583, "learning_rate": 3.1968327822212325e-06, "loss": 0.4376, "step": 13420 }, { "epoch": 0.6562583986904151, "grad_norm": 0.46819832921028137, "learning_rate": 3.1888807251218524e-06, "loss": 0.4364, "step": 13430 }, { "epoch": 0.6567470497691124, "grad_norm": 0.6933978199958801, "learning_rate": 3.180933937429612e-06, "loss": 0.4366, "step": 13440 }, { "epoch": 0.6572357008478096, "grad_norm": 0.6232645511627197, "learning_rate": 3.1729924422654917e-06, "loss": 0.4372, "step": 13450 }, { "epoch": 0.6577243519265069, "grad_norm": 0.6195946335792542, "learning_rate": 3.1650562627350797e-06, "loss": 0.4379, "step": 13460 }, { "epoch": 0.6582130030052041, "grad_norm": 0.47936639189720154, "learning_rate": 3.157125421928496e-06, "loss": 0.4375, "step": 13470 }, { "epoch": 0.6587016540839014, "grad_norm": 0.7483202219009399, "learning_rate": 3.1491999429203253e-06, "loss": 0.4375, "step": 13480 }, { "epoch": 0.6591903051625987, "grad_norm": 0.6134311556816101, "learning_rate": 3.141279848769555e-06, "loss": 0.4373, "step": 13490 }, { "epoch": 0.6596789562412959, "grad_norm": 0.623481810092926, "learning_rate": 3.1333651625195065e-06, "loss": 0.4377, "step": 13500 }, { "epoch": 0.6596789562412959, "eval_loss": 0.41565391421318054, "eval_runtime": 728.9931, "eval_samples_per_second": 242.674, "eval_steps_per_second": 0.475, "step": 13500 }, { "epoch": 0.6601676073199931, "grad_norm": 0.6795092225074768, "learning_rate": 3.125455907197765e-06, "loss": 0.4376, "step": 13510 }, { "epoch": 0.6606562583986905, "grad_norm": 0.766830325126648, "learning_rate": 3.117552105816116e-06, "loss": 0.4361, "step": 13520 }, { "epoch": 0.6611449094773877, "grad_norm": 1.3391401767730713, "learning_rate": 3.109653781370473e-06, "loss": 0.4367, "step": 13530 }, { "epoch": 0.6616335605560849, "grad_norm": 0.4134541451931, "learning_rate": 3.101760956840819e-06, "loss": 0.4379, "step": 13540 }, { "epoch": 0.6621222116347821, "grad_norm": 0.8742764592170715, "learning_rate": 3.093873655191135e-06, "loss": 0.4365, "step": 13550 }, { "epoch": 0.6626108627134795, "grad_norm": 0.5794457197189331, "learning_rate": 3.0859918993693294e-06, "loss": 0.4375, "step": 13560 }, { "epoch": 0.6630995137921767, "grad_norm": 0.5920469760894775, "learning_rate": 3.0781157123071782e-06, "loss": 0.4366, "step": 13570 }, { "epoch": 0.6635881648708739, "grad_norm": 0.8090350031852722, "learning_rate": 3.070245116920255e-06, "loss": 0.4375, "step": 13580 }, { "epoch": 0.6640768159495712, "grad_norm": 0.6335200071334839, "learning_rate": 3.062380136107863e-06, "loss": 0.4372, "step": 13590 }, { "epoch": 0.6645654670282685, "grad_norm": 0.5986902117729187, "learning_rate": 3.054520792752973e-06, "loss": 0.4369, "step": 13600 }, { "epoch": 0.6650541181069657, "grad_norm": 0.5582621693611145, "learning_rate": 3.0466671097221506e-06, "loss": 0.4373, "step": 13610 }, { "epoch": 0.665542769185663, "grad_norm": 0.40097367763519287, "learning_rate": 3.038819109865495e-06, "loss": 0.437, "step": 13620 }, { "epoch": 0.6660314202643602, "grad_norm": 0.3417106866836548, "learning_rate": 3.0309768160165697e-06, "loss": 0.4367, "step": 13630 }, { "epoch": 0.6665200713430575, "grad_norm": 0.8929291367530823, "learning_rate": 3.0231402509923347e-06, "loss": 0.4371, "step": 13640 }, { "epoch": 0.6670087224217548, "grad_norm": 0.5947937965393066, "learning_rate": 3.015309437593084e-06, "loss": 0.4381, "step": 13650 }, { "epoch": 0.667497373500452, "grad_norm": 0.42644399404525757, "learning_rate": 3.00748439860238e-06, "loss": 0.4368, "step": 13660 }, { "epoch": 0.6679860245791492, "grad_norm": 0.378689706325531, "learning_rate": 2.9996651567869784e-06, "loss": 0.4358, "step": 13670 }, { "epoch": 0.6684746756578465, "grad_norm": 0.5530552268028259, "learning_rate": 2.9918517348967734e-06, "loss": 0.4376, "step": 13680 }, { "epoch": 0.6689633267365438, "grad_norm": 0.5646296739578247, "learning_rate": 2.9840441556647247e-06, "loss": 0.4371, "step": 13690 }, { "epoch": 0.669451977815241, "grad_norm": 0.7569136619567871, "learning_rate": 2.9762424418067905e-06, "loss": 0.4373, "step": 13700 }, { "epoch": 0.6699406288939382, "grad_norm": 1.0428658723831177, "learning_rate": 2.968446616021868e-06, "loss": 0.4379, "step": 13710 }, { "epoch": 0.6704292799726356, "grad_norm": 0.7594118714332581, "learning_rate": 2.9606567009917218e-06, "loss": 0.4375, "step": 13720 }, { "epoch": 0.6709179310513328, "grad_norm": 2.9701902866363525, "learning_rate": 2.952872719380917e-06, "loss": 0.4379, "step": 13730 }, { "epoch": 0.67140658213003, "grad_norm": 0.9236831665039062, "learning_rate": 2.94509469383676e-06, "loss": 0.4371, "step": 13740 }, { "epoch": 0.6718952332087273, "grad_norm": 1.1951662302017212, "learning_rate": 2.9373226469892223e-06, "loss": 0.4377, "step": 13750 }, { "epoch": 0.6723838842874246, "grad_norm": 0.3135634958744049, "learning_rate": 2.9295566014508853e-06, "loss": 0.4369, "step": 13760 }, { "epoch": 0.6728725353661218, "grad_norm": 0.4447099566459656, "learning_rate": 2.9217965798168685e-06, "loss": 0.4376, "step": 13770 }, { "epoch": 0.6733611864448191, "grad_norm": 0.3684927821159363, "learning_rate": 2.914042604664764e-06, "loss": 0.4373, "step": 13780 }, { "epoch": 0.6738498375235163, "grad_norm": 0.3362119495868683, "learning_rate": 2.9062946985545707e-06, "loss": 0.4371, "step": 13790 }, { "epoch": 0.6743384886022136, "grad_norm": 0.6925981640815735, "learning_rate": 2.898552884028634e-06, "loss": 0.4371, "step": 13800 }, { "epoch": 0.6748271396809108, "grad_norm": 0.49009522795677185, "learning_rate": 2.8908171836115736e-06, "loss": 0.4382, "step": 13810 }, { "epoch": 0.6753157907596081, "grad_norm": 0.49105721712112427, "learning_rate": 2.8830876198102176e-06, "loss": 0.4369, "step": 13820 }, { "epoch": 0.6758044418383053, "grad_norm": 0.5330390930175781, "learning_rate": 2.875364215113547e-06, "loss": 0.4365, "step": 13830 }, { "epoch": 0.6762930929170026, "grad_norm": 0.43516021966934204, "learning_rate": 2.8676469919926152e-06, "loss": 0.437, "step": 13840 }, { "epoch": 0.6767817439956999, "grad_norm": 0.4716795086860657, "learning_rate": 2.859935972900492e-06, "loss": 0.4361, "step": 13850 }, { "epoch": 0.6772703950743971, "grad_norm": 0.38898736238479614, "learning_rate": 2.8522311802722038e-06, "loss": 0.4369, "step": 13860 }, { "epoch": 0.6777590461530943, "grad_norm": 0.34487384557724, "learning_rate": 2.8445326365246516e-06, "loss": 0.4381, "step": 13870 }, { "epoch": 0.6782476972317917, "grad_norm": 0.30314865708351135, "learning_rate": 2.836840364056559e-06, "loss": 0.4371, "step": 13880 }, { "epoch": 0.6787363483104889, "grad_norm": 0.5969054102897644, "learning_rate": 2.829154385248409e-06, "loss": 0.4367, "step": 13890 }, { "epoch": 0.6792249993891861, "grad_norm": 0.32903793454170227, "learning_rate": 2.8214747224623627e-06, "loss": 0.4357, "step": 13900 }, { "epoch": 0.6797136504678835, "grad_norm": 0.3053576648235321, "learning_rate": 2.8138013980422164e-06, "loss": 0.4365, "step": 13910 }, { "epoch": 0.6802023015465807, "grad_norm": 0.43716076016426086, "learning_rate": 2.8061344343133144e-06, "loss": 0.4364, "step": 13920 }, { "epoch": 0.6806909526252779, "grad_norm": 0.4108024537563324, "learning_rate": 2.7984738535825044e-06, "loss": 0.4379, "step": 13930 }, { "epoch": 0.6811796037039751, "grad_norm": 0.45622798800468445, "learning_rate": 2.790819678138056e-06, "loss": 0.4368, "step": 13940 }, { "epoch": 0.6816682547826725, "grad_norm": 0.416840523481369, "learning_rate": 2.783171930249603e-06, "loss": 0.4374, "step": 13950 }, { "epoch": 0.6821569058613697, "grad_norm": 0.5477868914604187, "learning_rate": 2.775530632168084e-06, "loss": 0.4372, "step": 13960 }, { "epoch": 0.6826455569400669, "grad_norm": 0.3617335259914398, "learning_rate": 2.7678958061256667e-06, "loss": 0.4363, "step": 13970 }, { "epoch": 0.6831342080187642, "grad_norm": 0.45384445786476135, "learning_rate": 2.7602674743356893e-06, "loss": 0.4349, "step": 13980 }, { "epoch": 0.6836228590974615, "grad_norm": 0.3232516944408417, "learning_rate": 2.752645658992599e-06, "loss": 0.4369, "step": 13990 }, { "epoch": 0.6841115101761587, "grad_norm": 0.4313335418701172, "learning_rate": 2.745030382271879e-06, "loss": 0.4378, "step": 14000 }, { "epoch": 0.6841115101761587, "eval_loss": 0.41417059302330017, "eval_runtime": 727.8695, "eval_samples_per_second": 243.049, "eval_steps_per_second": 0.475, "step": 14000 }, { "epoch": 0.684600161254856, "grad_norm": 0.5871222615242004, "learning_rate": 2.737421666329987e-06, "loss": 0.4373, "step": 14010 }, { "epoch": 0.6850888123335532, "grad_norm": 0.4553307294845581, "learning_rate": 2.7298195333043022e-06, "loss": 0.4372, "step": 14020 }, { "epoch": 0.6855774634122505, "grad_norm": 0.49893367290496826, "learning_rate": 2.722224005313041e-06, "loss": 0.4366, "step": 14030 }, { "epoch": 0.6860661144909478, "grad_norm": 0.401821494102478, "learning_rate": 2.7146351044552045e-06, "loss": 0.4372, "step": 14040 }, { "epoch": 0.686554765569645, "grad_norm": 0.2880902588367462, "learning_rate": 2.7070528528105165e-06, "loss": 0.4366, "step": 14050 }, { "epoch": 0.6870434166483422, "grad_norm": 0.4244653880596161, "learning_rate": 2.6994772724393516e-06, "loss": 0.4368, "step": 14060 }, { "epoch": 0.6875320677270395, "grad_norm": 0.4931180775165558, "learning_rate": 2.6919083853826724e-06, "loss": 0.4371, "step": 14070 }, { "epoch": 0.6880207188057368, "grad_norm": 0.5409946441650391, "learning_rate": 2.684346213661974e-06, "loss": 0.4363, "step": 14080 }, { "epoch": 0.688509369884434, "grad_norm": 0.4695432484149933, "learning_rate": 2.676790779279209e-06, "loss": 0.4369, "step": 14090 }, { "epoch": 0.6889980209631312, "grad_norm": 4.034379005432129, "learning_rate": 2.669242104216725e-06, "loss": 0.4363, "step": 14100 }, { "epoch": 0.6894866720418286, "grad_norm": 0.6742619872093201, "learning_rate": 2.6617002104372096e-06, "loss": 0.4373, "step": 14110 }, { "epoch": 0.6899753231205258, "grad_norm": 0.6923062801361084, "learning_rate": 2.6541651198836207e-06, "loss": 0.4365, "step": 14120 }, { "epoch": 0.690463974199223, "grad_norm": 0.6054366230964661, "learning_rate": 2.6466368544791164e-06, "loss": 0.4364, "step": 14130 }, { "epoch": 0.6909526252779203, "grad_norm": 0.809479296207428, "learning_rate": 2.639115436126999e-06, "loss": 0.4358, "step": 14140 }, { "epoch": 0.6914412763566176, "grad_norm": 0.458893358707428, "learning_rate": 2.6316008867106547e-06, "loss": 0.4365, "step": 14150 }, { "epoch": 0.6919299274353148, "grad_norm": 1.5249381065368652, "learning_rate": 2.6240932280934794e-06, "loss": 0.4353, "step": 14160 }, { "epoch": 0.6924185785140121, "grad_norm": 0.435376912355423, "learning_rate": 2.616592482118818e-06, "loss": 0.4358, "step": 14170 }, { "epoch": 0.6929072295927093, "grad_norm": 0.5011893510818481, "learning_rate": 2.6090986706099135e-06, "loss": 0.4361, "step": 14180 }, { "epoch": 0.6933958806714066, "grad_norm": 0.42486095428466797, "learning_rate": 2.6016118153698235e-06, "loss": 0.4374, "step": 14190 }, { "epoch": 0.6938845317501038, "grad_norm": 0.29725852608680725, "learning_rate": 2.594131938181368e-06, "loss": 0.4367, "step": 14200 }, { "epoch": 0.6943731828288011, "grad_norm": 1.0349030494689941, "learning_rate": 2.586659060807068e-06, "loss": 0.4382, "step": 14210 }, { "epoch": 0.6948618339074983, "grad_norm": 0.3708353340625763, "learning_rate": 2.579193204989079e-06, "loss": 0.4373, "step": 14220 }, { "epoch": 0.6953504849861956, "grad_norm": 0.4205668568611145, "learning_rate": 2.5717343924491224e-06, "loss": 0.4362, "step": 14230 }, { "epoch": 0.6958391360648929, "grad_norm": 0.6266738772392273, "learning_rate": 2.564282644888434e-06, "loss": 0.4372, "step": 14240 }, { "epoch": 0.6963277871435901, "grad_norm": 0.43474340438842773, "learning_rate": 2.5568379839876883e-06, "loss": 0.4359, "step": 14250 }, { "epoch": 0.6968164382222873, "grad_norm": 0.7086150646209717, "learning_rate": 2.5494004314069422e-06, "loss": 0.4357, "step": 14260 }, { "epoch": 0.6973050893009847, "grad_norm": 0.6918942332267761, "learning_rate": 2.5419700087855765e-06, "loss": 0.4358, "step": 14270 }, { "epoch": 0.6977937403796819, "grad_norm": 0.7701777219772339, "learning_rate": 2.5345467377422216e-06, "loss": 0.4369, "step": 14280 }, { "epoch": 0.6982823914583791, "grad_norm": 0.40936869382858276, "learning_rate": 2.527130639874701e-06, "loss": 0.4364, "step": 14290 }, { "epoch": 0.6987710425370764, "grad_norm": 0.432035356760025, "learning_rate": 2.5197217367599726e-06, "loss": 0.4366, "step": 14300 }, { "epoch": 0.6992596936157737, "grad_norm": 0.41449683904647827, "learning_rate": 2.512320049954056e-06, "loss": 0.4359, "step": 14310 }, { "epoch": 0.6997483446944709, "grad_norm": 0.49594905972480774, "learning_rate": 2.50492560099198e-06, "loss": 0.4364, "step": 14320 }, { "epoch": 0.7002369957731681, "grad_norm": 0.38190391659736633, "learning_rate": 2.4975384113877093e-06, "loss": 0.4362, "step": 14330 }, { "epoch": 0.7007256468518654, "grad_norm": 0.8239844441413879, "learning_rate": 2.490158502634095e-06, "loss": 0.4361, "step": 14340 }, { "epoch": 0.7012142979305627, "grad_norm": 0.5367412567138672, "learning_rate": 2.4827858962027994e-06, "loss": 0.4355, "step": 14350 }, { "epoch": 0.7017029490092599, "grad_norm": 0.4261118173599243, "learning_rate": 2.475420613544237e-06, "loss": 0.4357, "step": 14360 }, { "epoch": 0.7021916000879572, "grad_norm": 0.7066627144813538, "learning_rate": 2.468062676087522e-06, "loss": 0.4379, "step": 14370 }, { "epoch": 0.7026802511666544, "grad_norm": 0.7751229405403137, "learning_rate": 2.4607121052403903e-06, "loss": 0.4358, "step": 14380 }, { "epoch": 0.7031689022453517, "grad_norm": 0.3944869041442871, "learning_rate": 2.4533689223891466e-06, "loss": 0.4371, "step": 14390 }, { "epoch": 0.703657553324049, "grad_norm": 0.5122698545455933, "learning_rate": 2.446033148898605e-06, "loss": 0.4361, "step": 14400 }, { "epoch": 0.7041462044027462, "grad_norm": 0.4192598760128021, "learning_rate": 2.438704806112016e-06, "loss": 0.4361, "step": 14410 }, { "epoch": 0.7046348554814434, "grad_norm": 0.4704056680202484, "learning_rate": 2.4313839153510112e-06, "loss": 0.4359, "step": 14420 }, { "epoch": 0.7051235065601408, "grad_norm": 0.3789515197277069, "learning_rate": 2.4240704979155484e-06, "loss": 0.436, "step": 14430 }, { "epoch": 0.705612157638838, "grad_norm": 0.48638996481895447, "learning_rate": 2.4167645750838336e-06, "loss": 0.4366, "step": 14440 }, { "epoch": 0.7061008087175352, "grad_norm": 0.3896729052066803, "learning_rate": 2.4094661681122684e-06, "loss": 0.4372, "step": 14450 }, { "epoch": 0.7065894597962326, "grad_norm": 0.5547624826431274, "learning_rate": 2.4021752982353918e-06, "loss": 0.435, "step": 14460 }, { "epoch": 0.7070781108749298, "grad_norm": 0.4325717091560364, "learning_rate": 2.394891986665811e-06, "loss": 0.4353, "step": 14470 }, { "epoch": 0.707566761953627, "grad_norm": 0.46477776765823364, "learning_rate": 2.387616254594139e-06, "loss": 0.4372, "step": 14480 }, { "epoch": 0.7080554130323242, "grad_norm": 0.39680078625679016, "learning_rate": 2.3803481231889443e-06, "loss": 0.4359, "step": 14490 }, { "epoch": 0.7085440641110216, "grad_norm": 0.34461086988449097, "learning_rate": 2.3730876135966746e-06, "loss": 0.4377, "step": 14500 }, { "epoch": 0.7085440641110216, "eval_loss": 0.4154199957847595, "eval_runtime": 729.053, "eval_samples_per_second": 242.654, "eval_steps_per_second": 0.475, "step": 14500 }, { "epoch": 0.7090327151897188, "grad_norm": 0.4224153757095337, "learning_rate": 2.3658347469416037e-06, "loss": 0.4366, "step": 14510 }, { "epoch": 0.709521366268416, "grad_norm": 0.32037585973739624, "learning_rate": 2.3585895443257705e-06, "loss": 0.4364, "step": 14520 }, { "epoch": 0.7100100173471133, "grad_norm": 0.6405905485153198, "learning_rate": 2.351352026828917e-06, "loss": 0.4359, "step": 14530 }, { "epoch": 0.7104986684258106, "grad_norm": 0.4093703627586365, "learning_rate": 2.3441222155084196e-06, "loss": 0.4372, "step": 14540 }, { "epoch": 0.7109873195045078, "grad_norm": 0.31071528792381287, "learning_rate": 2.3369001313992373e-06, "loss": 0.4367, "step": 14550 }, { "epoch": 0.7114759705832051, "grad_norm": 0.502044141292572, "learning_rate": 2.3296857955138493e-06, "loss": 0.4365, "step": 14560 }, { "epoch": 0.7119646216619023, "grad_norm": 0.5427960753440857, "learning_rate": 2.3224792288421873e-06, "loss": 0.4372, "step": 14570 }, { "epoch": 0.7124532727405996, "grad_norm": 0.6338086128234863, "learning_rate": 2.3152804523515787e-06, "loss": 0.4358, "step": 14580 }, { "epoch": 0.7129419238192969, "grad_norm": 0.36875244975090027, "learning_rate": 2.3080894869866906e-06, "loss": 0.436, "step": 14590 }, { "epoch": 0.7134305748979941, "grad_norm": 0.39585214853286743, "learning_rate": 2.3009063536694588e-06, "loss": 0.4334, "step": 14600 }, { "epoch": 0.7139192259766913, "grad_norm": 0.4556538164615631, "learning_rate": 2.293731073299032e-06, "loss": 0.4367, "step": 14610 }, { "epoch": 0.7144078770553886, "grad_norm": 0.4585372507572174, "learning_rate": 2.286563666751714e-06, "loss": 0.4344, "step": 14620 }, { "epoch": 0.7148965281340859, "grad_norm": 0.3792722821235657, "learning_rate": 2.2794041548809013e-06, "loss": 0.4372, "step": 14630 }, { "epoch": 0.7153851792127831, "grad_norm": 0.5071465969085693, "learning_rate": 2.2722525585170136e-06, "loss": 0.437, "step": 14640 }, { "epoch": 0.7158738302914803, "grad_norm": 0.47391828894615173, "learning_rate": 2.265108898467449e-06, "loss": 0.4361, "step": 14650 }, { "epoch": 0.7163624813701777, "grad_norm": 0.450090229511261, "learning_rate": 2.2579731955165098e-06, "loss": 0.435, "step": 14660 }, { "epoch": 0.7168511324488749, "grad_norm": 0.4352344870567322, "learning_rate": 2.250845470425346e-06, "loss": 0.4358, "step": 14670 }, { "epoch": 0.7173397835275721, "grad_norm": 1.0980722904205322, "learning_rate": 2.2437257439319045e-06, "loss": 0.4349, "step": 14680 }, { "epoch": 0.7178284346062694, "grad_norm": 0.7365118265151978, "learning_rate": 2.2366140367508515e-06, "loss": 0.436, "step": 14690 }, { "epoch": 0.7183170856849667, "grad_norm": 0.3632850646972656, "learning_rate": 2.2295103695735237e-06, "loss": 0.437, "step": 14700 }, { "epoch": 0.7188057367636639, "grad_norm": 0.4772653877735138, "learning_rate": 2.2224147630678698e-06, "loss": 0.434, "step": 14710 }, { "epoch": 0.7192943878423612, "grad_norm": 0.533318042755127, "learning_rate": 2.2153272378783823e-06, "loss": 0.4348, "step": 14720 }, { "epoch": 0.7197830389210584, "grad_norm": 0.649156928062439, "learning_rate": 2.2082478146260394e-06, "loss": 0.4354, "step": 14730 }, { "epoch": 0.7202716899997557, "grad_norm": 0.5530617833137512, "learning_rate": 2.2011765139082514e-06, "loss": 0.436, "step": 14740 }, { "epoch": 0.7207603410784529, "grad_norm": 0.48404207825660706, "learning_rate": 2.194113356298796e-06, "loss": 0.4359, "step": 14750 }, { "epoch": 0.7212489921571502, "grad_norm": 0.6402378082275391, "learning_rate": 2.1870583623477554e-06, "loss": 0.4366, "step": 14760 }, { "epoch": 0.7217376432358474, "grad_norm": 0.4514593183994293, "learning_rate": 2.1800115525814604e-06, "loss": 0.4347, "step": 14770 }, { "epoch": 0.7222262943145447, "grad_norm": 0.4350273013114929, "learning_rate": 2.1729729475024337e-06, "loss": 0.437, "step": 14780 }, { "epoch": 0.722714945393242, "grad_norm": 0.7733496427536011, "learning_rate": 2.165942567589324e-06, "loss": 0.4362, "step": 14790 }, { "epoch": 0.7232035964719392, "grad_norm": 0.3570731282234192, "learning_rate": 2.158920433296846e-06, "loss": 0.435, "step": 14800 }, { "epoch": 0.7236922475506364, "grad_norm": 0.45792272686958313, "learning_rate": 2.151906565055732e-06, "loss": 0.4359, "step": 14810 }, { "epoch": 0.7241808986293338, "grad_norm": 0.3383428454399109, "learning_rate": 2.1449009832726576e-06, "loss": 0.4367, "step": 14820 }, { "epoch": 0.724669549708031, "grad_norm": 0.4315878450870514, "learning_rate": 2.137903708330188e-06, "loss": 0.4359, "step": 14830 }, { "epoch": 0.7251582007867282, "grad_norm": 0.5013752579689026, "learning_rate": 2.130914760586729e-06, "loss": 0.4346, "step": 14840 }, { "epoch": 0.7256468518654255, "grad_norm": 0.5946633815765381, "learning_rate": 2.1239341603764506e-06, "loss": 0.4355, "step": 14850 }, { "epoch": 0.7261355029441228, "grad_norm": 1.4556235074996948, "learning_rate": 2.1169619280092362e-06, "loss": 0.4352, "step": 14860 }, { "epoch": 0.72662415402282, "grad_norm": 0.49753642082214355, "learning_rate": 2.109998083770628e-06, "loss": 0.4369, "step": 14870 }, { "epoch": 0.7271128051015172, "grad_norm": 0.3729608654975891, "learning_rate": 2.103042647921758e-06, "loss": 0.4356, "step": 14880 }, { "epoch": 0.7276014561802145, "grad_norm": 0.39122653007507324, "learning_rate": 2.096095640699295e-06, "loss": 0.4368, "step": 14890 }, { "epoch": 0.7280901072589118, "grad_norm": 0.42691490054130554, "learning_rate": 2.08915708231539e-06, "loss": 0.4357, "step": 14900 }, { "epoch": 0.728578758337609, "grad_norm": 0.38435041904449463, "learning_rate": 2.0822269929576066e-06, "loss": 0.4363, "step": 14910 }, { "epoch": 0.7290674094163063, "grad_norm": 0.8433852195739746, "learning_rate": 2.075305392788868e-06, "loss": 0.4366, "step": 14920 }, { "epoch": 0.7295560604950035, "grad_norm": 0.5046951174736023, "learning_rate": 2.0683923019474016e-06, "loss": 0.4358, "step": 14930 }, { "epoch": 0.7300447115737008, "grad_norm": 0.9538094401359558, "learning_rate": 2.061487740546679e-06, "loss": 0.4358, "step": 14940 }, { "epoch": 0.7305333626523981, "grad_norm": 0.542107343673706, "learning_rate": 2.0545917286753494e-06, "loss": 0.437, "step": 14950 }, { "epoch": 0.7310220137310953, "grad_norm": 0.2896505296230316, "learning_rate": 2.047704286397188e-06, "loss": 0.4355, "step": 14960 }, { "epoch": 0.7315106648097925, "grad_norm": 0.43803542852401733, "learning_rate": 2.040825433751044e-06, "loss": 0.4363, "step": 14970 }, { "epoch": 0.7319993158884899, "grad_norm": 0.5424397587776184, "learning_rate": 2.0339551907507687e-06, "loss": 0.4366, "step": 14980 }, { "epoch": 0.7324879669671871, "grad_norm": 0.5848090648651123, "learning_rate": 2.027093577385163e-06, "loss": 0.4349, "step": 14990 }, { "epoch": 0.7329766180458843, "grad_norm": 0.3782629072666168, "learning_rate": 2.0202406136179275e-06, "loss": 0.4372, "step": 15000 }, { "epoch": 0.7329766180458843, "eval_loss": 0.4146045744419098, "eval_runtime": 728.9472, "eval_samples_per_second": 242.69, "eval_steps_per_second": 0.475, "step": 15000 }, { "epoch": 0.7334652691245815, "grad_norm": 0.27179810404777527, "learning_rate": 2.01339631938759e-06, "loss": 0.4349, "step": 15010 }, { "epoch": 0.7339539202032789, "grad_norm": 0.6157824397087097, "learning_rate": 2.006560714607455e-06, "loss": 0.436, "step": 15020 }, { "epoch": 0.7344425712819761, "grad_norm": 0.38568001985549927, "learning_rate": 1.99973381916555e-06, "loss": 0.4353, "step": 15030 }, { "epoch": 0.7349312223606733, "grad_norm": 0.3673468232154846, "learning_rate": 1.992915652924558e-06, "loss": 0.4365, "step": 15040 }, { "epoch": 0.7354198734393707, "grad_norm": 0.4711572229862213, "learning_rate": 1.986106235721769e-06, "loss": 0.4348, "step": 15050 }, { "epoch": 0.7359085245180679, "grad_norm": 0.30081677436828613, "learning_rate": 1.9793055873690115e-06, "loss": 0.4361, "step": 15060 }, { "epoch": 0.7363971755967651, "grad_norm": 0.49421292543411255, "learning_rate": 1.9725137276526098e-06, "loss": 0.436, "step": 15070 }, { "epoch": 0.7368858266754624, "grad_norm": 0.4806350767612457, "learning_rate": 1.965730676333309e-06, "loss": 0.4352, "step": 15080 }, { "epoch": 0.7373744777541597, "grad_norm": 0.7303268909454346, "learning_rate": 1.9589564531462344e-06, "loss": 0.4351, "step": 15090 }, { "epoch": 0.7378631288328569, "grad_norm": 0.3639063537120819, "learning_rate": 1.952191077800821e-06, "loss": 0.4361, "step": 15100 }, { "epoch": 0.7383517799115542, "grad_norm": 0.3184981048107147, "learning_rate": 1.94543456998076e-06, "loss": 0.4361, "step": 15110 }, { "epoch": 0.7388404309902514, "grad_norm": 0.4460330605506897, "learning_rate": 1.9386869493439485e-06, "loss": 0.4367, "step": 15120 }, { "epoch": 0.7393290820689487, "grad_norm": 0.2961271107196808, "learning_rate": 1.9319482355224235e-06, "loss": 0.435, "step": 15130 }, { "epoch": 0.7398177331476459, "grad_norm": 0.4846443235874176, "learning_rate": 1.9252184481223033e-06, "loss": 0.4354, "step": 15140 }, { "epoch": 0.7403063842263432, "grad_norm": 0.35571032762527466, "learning_rate": 1.918497606723744e-06, "loss": 0.436, "step": 15150 }, { "epoch": 0.7407950353050404, "grad_norm": 0.5735732913017273, "learning_rate": 1.9117857308808687e-06, "loss": 0.4358, "step": 15160 }, { "epoch": 0.7412836863837376, "grad_norm": 0.5794824361801147, "learning_rate": 1.9050828401217142e-06, "loss": 0.436, "step": 15170 }, { "epoch": 0.741772337462435, "grad_norm": 0.25915631651878357, "learning_rate": 1.8983889539481754e-06, "loss": 0.4357, "step": 15180 }, { "epoch": 0.7422609885411322, "grad_norm": 0.582955002784729, "learning_rate": 1.891704091835953e-06, "loss": 0.4368, "step": 15190 }, { "epoch": 0.7427496396198294, "grad_norm": 0.42489010095596313, "learning_rate": 1.8850282732344887e-06, "loss": 0.4354, "step": 15200 }, { "epoch": 0.7432382906985268, "grad_norm": 0.31416329741477966, "learning_rate": 1.8783615175669106e-06, "loss": 0.4354, "step": 15210 }, { "epoch": 0.743726941777224, "grad_norm": 4.887961387634277, "learning_rate": 1.871703844229985e-06, "loss": 0.4385, "step": 15220 }, { "epoch": 0.7442155928559212, "grad_norm": 1.1010756492614746, "learning_rate": 1.8650552725940468e-06, "loss": 0.4357, "step": 15230 }, { "epoch": 0.7447042439346185, "grad_norm": 0.46031710505485535, "learning_rate": 1.8584158220029514e-06, "loss": 0.4363, "step": 15240 }, { "epoch": 0.7451928950133158, "grad_norm": 0.8031518459320068, "learning_rate": 1.851785511774018e-06, "loss": 0.4355, "step": 15250 }, { "epoch": 0.745681546092013, "grad_norm": 0.33631330728530884, "learning_rate": 1.8451643611979746e-06, "loss": 0.4358, "step": 15260 }, { "epoch": 0.7461701971707102, "grad_norm": 0.3979465365409851, "learning_rate": 1.838552389538894e-06, "loss": 0.4353, "step": 15270 }, { "epoch": 0.7466588482494075, "grad_norm": 0.5838291049003601, "learning_rate": 1.831949616034145e-06, "loss": 0.4369, "step": 15280 }, { "epoch": 0.7471474993281048, "grad_norm": 0.34562933444976807, "learning_rate": 1.8253560598943377e-06, "loss": 0.4373, "step": 15290 }, { "epoch": 0.747636150406802, "grad_norm": 0.42259690165519714, "learning_rate": 1.81877174030326e-06, "loss": 0.436, "step": 15300 }, { "epoch": 0.7481248014854993, "grad_norm": 0.47943800687789917, "learning_rate": 1.8121966764178278e-06, "loss": 0.4341, "step": 15310 }, { "epoch": 0.7486134525641965, "grad_norm": 0.4682493805885315, "learning_rate": 1.8056308873680316e-06, "loss": 0.4361, "step": 15320 }, { "epoch": 0.7491021036428938, "grad_norm": 0.5536458492279053, "learning_rate": 1.7990743922568699e-06, "loss": 0.4359, "step": 15330 }, { "epoch": 0.7495907547215911, "grad_norm": 0.3631746768951416, "learning_rate": 1.7925272101603076e-06, "loss": 0.4358, "step": 15340 }, { "epoch": 0.7500794058002883, "grad_norm": 0.480092853307724, "learning_rate": 1.7859893601272077e-06, "loss": 0.4362, "step": 15350 }, { "epoch": 0.7505680568789855, "grad_norm": 0.4252304434776306, "learning_rate": 1.7794608611792873e-06, "loss": 0.4339, "step": 15360 }, { "epoch": 0.7510567079576829, "grad_norm": 0.37599998712539673, "learning_rate": 1.772941732311052e-06, "loss": 0.4346, "step": 15370 }, { "epoch": 0.7515453590363801, "grad_norm": 0.5096463561058044, "learning_rate": 1.7664319924897493e-06, "loss": 0.4361, "step": 15380 }, { "epoch": 0.7520340101150773, "grad_norm": 0.402937650680542, "learning_rate": 1.7599316606553074e-06, "loss": 0.4345, "step": 15390 }, { "epoch": 0.7525226611937745, "grad_norm": 0.5899362564086914, "learning_rate": 1.75344075572028e-06, "loss": 0.4354, "step": 15400 }, { "epoch": 0.7530113122724719, "grad_norm": 0.6552911996841431, "learning_rate": 1.7469592965697985e-06, "loss": 0.4367, "step": 15410 }, { "epoch": 0.7534999633511691, "grad_norm": 0.34461089968681335, "learning_rate": 1.7404873020615092e-06, "loss": 0.4356, "step": 15420 }, { "epoch": 0.7539886144298663, "grad_norm": 0.34054285287857056, "learning_rate": 1.7340247910255193e-06, "loss": 0.4347, "step": 15430 }, { "epoch": 0.7544772655085636, "grad_norm": 0.548925518989563, "learning_rate": 1.7275717822643496e-06, "loss": 0.4356, "step": 15440 }, { "epoch": 0.7549659165872609, "grad_norm": 0.3071838319301605, "learning_rate": 1.7211282945528667e-06, "loss": 0.4346, "step": 15450 }, { "epoch": 0.7554545676659581, "grad_norm": 0.32380637526512146, "learning_rate": 1.714694346638245e-06, "loss": 0.4363, "step": 15460 }, { "epoch": 0.7559432187446554, "grad_norm": 0.3220982253551483, "learning_rate": 1.7082699572398941e-06, "loss": 0.4356, "step": 15470 }, { "epoch": 0.7564318698233526, "grad_norm": 0.48519644141197205, "learning_rate": 1.7018551450494208e-06, "loss": 0.4337, "step": 15480 }, { "epoch": 0.7569205209020499, "grad_norm": 0.49619343876838684, "learning_rate": 1.6954499287305625e-06, "loss": 0.4359, "step": 15490 }, { "epoch": 0.7574091719807472, "grad_norm": 0.31478312611579895, "learning_rate": 1.6890543269191372e-06, "loss": 0.4353, "step": 15500 }, { "epoch": 0.7574091719807472, "eval_loss": 0.4151374399662018, "eval_runtime": 729.456, "eval_samples_per_second": 242.52, "eval_steps_per_second": 0.474, "step": 15500 }, { "epoch": 0.7578978230594444, "grad_norm": 0.5134409666061401, "learning_rate": 4.396678065461651e-08, "loss": 0.4363, "step": 15510 }, { "epoch": 0.7583864741381416, "grad_norm": 0.3412030041217804, "learning_rate": 9.281875915974597e-08, "loss": 0.435, "step": 15520 }, { "epoch": 0.7588751252168389, "grad_norm": 0.3823215365409851, "learning_rate": 1.4167073766487544e-07, "loss": 0.4359, "step": 15530 }, { "epoch": 0.7593637762955362, "grad_norm": 0.3025282323360443, "learning_rate": 1.905227161700049e-07, "loss": 0.4355, "step": 15540 }, { "epoch": 0.7598524273742334, "grad_norm": 0.4344797730445862, "learning_rate": 2.3937469467513437e-07, "loss": 0.4356, "step": 15550 }, { "epoch": 0.7603410784529306, "grad_norm": 0.28436729311943054, "learning_rate": 2.8822667318026384e-07, "loss": 0.436, "step": 15560 }, { "epoch": 0.760829729531628, "grad_norm": 0.3204064667224884, "learning_rate": 3.3707865168539325e-07, "loss": 0.4361, "step": 15570 }, { "epoch": 0.7613183806103252, "grad_norm": 0.3875465989112854, "learning_rate": 3.859306301905227e-07, "loss": 0.4341, "step": 15580 }, { "epoch": 0.7618070316890224, "grad_norm": 0.368078351020813, "learning_rate": 4.347826086956522e-07, "loss": 0.4351, "step": 15590 }, { "epoch": 0.7622956827677198, "grad_norm": 0.36300018429756165, "learning_rate": 4.836345872007817e-07, "loss": 0.4344, "step": 15600 }, { "epoch": 0.762784333846417, "grad_norm": 0.42110690474510193, "learning_rate": 5.324865657059111e-07, "loss": 0.434, "step": 15610 }, { "epoch": 0.7632729849251142, "grad_norm": 0.37072572112083435, "learning_rate": 5.813385442110406e-07, "loss": 0.4354, "step": 15620 }, { "epoch": 0.7637616360038115, "grad_norm": 0.5293629169464111, "learning_rate": 6.3019052271617e-07, "loss": 0.4342, "step": 15630 }, { "epoch": 0.7642502870825088, "grad_norm": 0.31591010093688965, "learning_rate": 6.790425012212995e-07, "loss": 0.4343, "step": 15640 }, { "epoch": 0.764738938161206, "grad_norm": 0.27564629912376404, "learning_rate": 7.278944797264289e-07, "loss": 0.4364, "step": 15650 }, { "epoch": 0.7652275892399032, "grad_norm": 0.29514557123184204, "learning_rate": 7.767464582315585e-07, "loss": 0.4349, "step": 15660 }, { "epoch": 0.7657162403186005, "grad_norm": 0.26547813415527344, "learning_rate": 8.255984367366879e-07, "loss": 0.4357, "step": 15670 }, { "epoch": 0.7662048913972977, "grad_norm": 0.3546208441257477, "learning_rate": 8.744504152418174e-07, "loss": 0.4342, "step": 15680 }, { "epoch": 0.766693542475995, "grad_norm": 0.6953465938568115, "learning_rate": 9.233023937469468e-07, "loss": 0.4339, "step": 15690 }, { "epoch": 0.7671821935546923, "grad_norm": 0.37491822242736816, "learning_rate": 9.721543722520762e-07, "loss": 0.4357, "step": 15700 }, { "epoch": 0.7676708446333895, "grad_norm": 0.4774235486984253, "learning_rate": 1.0210063507572057e-06, "loss": 0.435, "step": 15710 }, { "epoch": 0.7681594957120867, "grad_norm": 0.47825121879577637, "learning_rate": 1.0698583292623353e-06, "loss": 0.4345, "step": 15720 }, { "epoch": 0.7686481467907841, "grad_norm": 0.35943761467933655, "learning_rate": 1.1187103077674646e-06, "loss": 0.4345, "step": 15730 }, { "epoch": 0.7691367978694813, "grad_norm": 0.41238027811050415, "learning_rate": 1.167562286272594e-06, "loss": 0.4351, "step": 15740 }, { "epoch": 0.7696254489481785, "grad_norm": 0.5406340956687927, "learning_rate": 1.2164142647777236e-06, "loss": 0.4347, "step": 15750 }, { "epoch": 0.7701141000268759, "grad_norm": 0.3181721568107605, "learning_rate": 1.265266243282853e-06, "loss": 0.4354, "step": 15760 }, { "epoch": 0.7706027511055731, "grad_norm": 0.37955865263938904, "learning_rate": 1.3141182217879824e-06, "loss": 0.4347, "step": 15770 }, { "epoch": 0.7710914021842703, "grad_norm": 0.3683488667011261, "learning_rate": 1.362970200293112e-06, "loss": 0.4361, "step": 15780 }, { "epoch": 0.7715800532629675, "grad_norm": 0.3671647012233734, "learning_rate": 1.4118221787982415e-06, "loss": 0.4348, "step": 15790 }, { "epoch": 0.7720687043416649, "grad_norm": 0.4749736189842224, "learning_rate": 1.4606741573033708e-06, "loss": 0.4354, "step": 15800 }, { "epoch": 0.7725573554203621, "grad_norm": 0.2920779883861542, "learning_rate": 1.5095261358085003e-06, "loss": 0.4349, "step": 15810 }, { "epoch": 0.7730460064990593, "grad_norm": 0.5698887705802917, "learning_rate": 1.5583781143136298e-06, "loss": 0.4349, "step": 15820 }, { "epoch": 0.7735346575777566, "grad_norm": 0.4958445131778717, "learning_rate": 1.6072300928187593e-06, "loss": 0.4373, "step": 15830 }, { "epoch": 0.7740233086564539, "grad_norm": 0.37633660435676575, "learning_rate": 1.6560820713238887e-06, "loss": 0.4356, "step": 15840 }, { "epoch": 0.7745119597351511, "grad_norm": 0.3820544183254242, "learning_rate": 1.7049340498290182e-06, "loss": 0.4351, "step": 15850 }, { "epoch": 0.7750006108138484, "grad_norm": 0.3899173140525818, "learning_rate": 1.7537860283341477e-06, "loss": 0.4355, "step": 15860 }, { "epoch": 0.7754892618925456, "grad_norm": 0.36729347705841064, "learning_rate": 1.802638006839277e-06, "loss": 0.4353, "step": 15870 }, { "epoch": 0.7759779129712429, "grad_norm": 0.442569762468338, "learning_rate": 1.8514899853444065e-06, "loss": 0.4363, "step": 15880 }, { "epoch": 0.7764665640499402, "grad_norm": 0.5207741260528564, "learning_rate": 1.900341963849536e-06, "loss": 0.4362, "step": 15890 }, { "epoch": 0.7769552151286374, "grad_norm": 0.901549756526947, "learning_rate": 1.9491939423546656e-06, "loss": 0.4359, "step": 15900 }, { "epoch": 0.7774438662073346, "grad_norm": 0.5226088166236877, "learning_rate": 1.998045920859795e-06, "loss": 0.4362, "step": 15910 }, { "epoch": 0.7779325172860319, "grad_norm": 0.7250573635101318, "learning_rate": 2.046897899364924e-06, "loss": 0.4374, "step": 15920 }, { "epoch": 0.7784211683647292, "grad_norm": 0.34755152463912964, "learning_rate": 2.0957498778700537e-06, "loss": 0.4355, "step": 15930 }, { "epoch": 0.7789098194434264, "grad_norm": 0.37030619382858276, "learning_rate": 2.1446018563751832e-06, "loss": 0.4356, "step": 15940 }, { "epoch": 0.7793984705221236, "grad_norm": 0.44449082016944885, "learning_rate": 2.1934538348803127e-06, "loss": 0.4349, "step": 15950 }, { "epoch": 0.779887121600821, "grad_norm": 1.273260235786438, "learning_rate": 2.2423058133854423e-06, "loss": 0.4356, "step": 15960 }, { "epoch": 0.7803757726795182, "grad_norm": 0.6899981498718262, "learning_rate": 2.2911577918905718e-06, "loss": 0.4355, "step": 15970 }, { "epoch": 0.7808644237582154, "grad_norm": 0.5005556344985962, "learning_rate": 2.3400097703957013e-06, "loss": 0.4347, "step": 15980 }, { "epoch": 0.7813530748369127, "grad_norm": 0.5572786331176758, "learning_rate": 2.388861748900831e-06, "loss": 0.4357, "step": 15990 }, { "epoch": 0.78184172591561, "grad_norm": 0.6249894499778748, "learning_rate": 2.43771372740596e-06, "loss": 0.4348, "step": 16000 }, { "epoch": 0.78184172591561, "eval_loss": 0.4141230583190918, "eval_runtime": 729.2488, "eval_samples_per_second": 242.589, "eval_steps_per_second": 0.474, "step": 16000 } ], "logging_steps": 10, "max_steps": 20465, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7679175174984827e+19, "train_batch_size": 256, "trial_name": null, "trial_params": null }